Skip to content

Commit 7d1f65b

Browse files
committed
Preserve chromosome name in separate Scaffold attribute
1 parent 1593647 commit 7d1f65b

File tree

7 files changed

+81
-85
lines changed

7 files changed

+81
-85
lines changed

src/tola/assembly/assembly_stats.py

Lines changed: 12 additions & 34 deletions
Original file line number | Diff line number | Diff line change
@@ -151,25 +151,18 @@ def get_assembly_scaffold_lengths(
151151
return scaff_lengths
152152

153153
def chromosome_name_csv(self, haplotype: str, asm: Assembly):
154-
prefix = self.autosome_prefix
155-
suffix = f"_{haplotype}" if haplotype else None
156-
last_orig = None
157-
chr_name = None
158-
159154
csv_str = io.StringIO()
160155
for scffld in asm.scaffolds:
161156
if scffld.rank in (1, 2):
162-
name = scffld.name
163-
orig = scffld.original_name
164-
if last_orig and orig == last_orig:
165-
localised = "no"
166-
else:
167-
localised = "yes"
168-
chr_name = name.replace(prefix, "", 1)
169-
if suffix:
170-
chr_name = chr_name.replace(suffix, "", 1)
171-
last_orig = orig
172-
csv_str.write(",".join((name, chr_name, localised)))
157+
csv_str.write(
158+
",".join(
159+
(
160+
scffld.name,
161+
scffld.chr_name,
162+
"yes" if scffld.localised else "no",
163+
)
164+
)
165+
)
173166
csv_str.write("\n")
174167

175168
return csv_str.getvalue() if csv_str.tell() else None
@@ -192,30 +185,15 @@ def chromosomes_report_csv(self, hap_asm: AssemblyDict):
192185
)
193186
head_pos = csv_str.tell()
194187

195-
prefix = self.autosome_prefix
196-
chr_name = None
197-
last_orig = None
198188
for hap, asm in hap_asm.items():
199-
suffix = f"_{hap}" if hap else None
200189
for scffld in asm.scaffolds:
201190
if scffld.rank in (1, 2):
202-
name = scffld.name
203-
orig = scffld.original_name
204-
if last_orig and orig == last_orig:
205-
# Unlocs share the same original_name
206-
localised = "false"
207-
else:
208-
localised = "true"
209-
last_orig = orig
210-
chr_name = name.replace(prefix, "", 1)
211-
if suffix:
212-
chr_name = chr_name.replace(suffix, "", 1)
213191
csvr.writerow(
214192
(
215193
scffld.haplotype or hap or "Primary",
216-
name,
217-
chr_name,
218-
localised,
194+
scffld.name,
195+
scffld.chr_name,
196+
"true" if scffld.localised else "false",
219197
scffld.original_name,
220198
scffld.length,
221199
scffld.fragments_length,

src/tola/assembly/build_assembly.py

Lines changed: 19 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -333,10 +333,15 @@ def cut_scaffold_if_too_long(self, scffld: Scaffold) -> list[Scaffold]:
333333
cut_at = whole // div
334334
gap_i = self.index_of_nearest_gap_to_ideal_cut_site(to_cut, cut_at)
335335
rows = to_cut.rows
336+
336337
# First part is everything up to, but not including, the gap
337-
cut_parts.append(Scaffold(to_cut.name, rows[:gap_i]))
338+
cut = to_cut.clone_empty()
339+
cut.rows = rows[:gap_i]
340+
cut_parts.append(cut)
341+
338342
# Second part is everything after the gap
339-
to_cut = Scaffold(to_cut.name, rows[gap_i + 1 :])
343+
to_cut = to_cut.clone_empty()
344+
to_cut.rows = rows[gap_i + 1 :]
340345
cut_parts.append(to_cut)
341346

342347
# Add suffix "_1", "_2" etc... to cut scaffolds
@@ -393,6 +398,7 @@ def index_of_nearest_gap_to_ideal_cut_site(self, to_cut: Scaffold, cut_at: int):
393398
break
394399

395400
if gap_i_before is None and gap_i_after is None:
401+
### Make a cut here ###
396402
msg = (
397403
f"Failed to find gap before or after {cut_at:_d} in '{to_cut.name}'"
398404
)
@@ -403,47 +409,35 @@ def index_of_nearest_gap_to_ideal_cut_site(self, to_cut: Scaffold, cut_at: int):
403409
elif gap_i_after is None:
404410
ovr_i = gap_i_before
405411
else:
406-
length_before = 0
407-
length_after = 0
408-
for i, this_row in enumerate(rows):
409-
if i < gap_i_before:
410-
length_before += this_row.length
411-
412-
if i < gap_i_after:
413-
length_after += this_row.length
414-
else:
415-
break
412+
length_before = (
413+
idx_asm.start_end_of_row(to_cut.name, gap_i_before)[0] - 1
414+
)
415+
length_after = idx_asm.start_end_of_row(to_cut.name, gap_i_after)[0] - 1
416416

417417
# Choose the gap before or after, whichever is nearest to the
418418
# ideal cut point.
419419
ovr_i = (
420420
gap_i_before
421-
if abs(cut_at - length_before) < abs(cut_at - length_after)
421+
if abs(cut_at - length_before) < abs(length_after - cut_at)
422422
else gap_i_after
423423
)
424424

425425
return ovr_i
426426

427427
def scaffolds_fused_by_name(self) -> list[Scaffold]:
428428
gap = self.default_gap
429-
hap_name_scaffold: dict[tuple[str, str], Scaffold] = {}
429+
hap_name_scaffold: dict[tuple[str | None, str], Scaffold] = {}
430430
for scffld in self.scaffolds:
431431
if not scffld.rows:
432432
# discard_overhanging_fragments() may have removed the only
433433
# row from an OverlapResult
434434
continue
435435

436-
build_scffld = hap_name_scaffold.setdefault(
437-
(scffld.haplotype, scffld.name),
438-
Scaffold(
439-
scffld.name,
440-
tag=scffld.tag,
441-
haplotype=scffld.haplotype,
442-
rank=scffld.rank,
443-
original_name=scffld.original_name,
444-
original_tags=scffld.original_tags,
445-
),
446-
)
436+
idx = scffld.haplotype, scffld.name
437+
build_scffld = hap_name_scaffold.get(idx)
438+
if not build_scffld:
439+
hap_name_scaffold[idx] = build_scffld = scffld.clone_empty()
440+
447441
if isinstance(scffld, OverlapResult):
448442
build_scffld.append_scaffold(scffld.to_scaffold(), gap)
449443
else:

src/tola/assembly/indexed_assembly.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -51,6 +51,11 @@ def overlap_index_by_name(self, name: str) -> list[int]:
5151
msg = f"Scaffold '{name}' is not indexed."
5252
raise ValueError(msg)
5353

54+
def start_end_of_row(self, name: str, index: int) -> tuple[int, int]:
55+
idx = self.overlap_index_by_name(name)
56+
start = 1 if index == 0 else 1 + idx[index - 1]
57+
return start, idx[index]
58+
5459
def find_overlaps(self, bait: Fragment) -> OverlapResult | None:
5560
"""
5661
Given a Fragment bait, returns an OverlapResult (a subclass of

src/tola/assembly/naming_utils.py

Lines changed: 14 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -150,8 +150,12 @@ def label_scaffold(
150150
if "Painted" not in scaffold_tags:
151151
msg = f"Unloc in unpainted scaffold {original_name!r}: {fragment}"
152152
raise TaggingError(msg)
153+
scaffold.chr_name = name
153154
name = self.unloc_name()
154155
self.unloc_scaffolds[-1].append(scaffold)
156+
elif "Painted" in scaffold_tags:
157+
scaffold.localised = True
158+
scaffold.chr_name = name
155159

156160
scaffold.name = name
157161
scaffold.haplotype = self.current_haplotype
@@ -231,17 +235,17 @@ def length_of_first_haplotype(self):
231235
return length
232236

233237
@staticmethod
234-
def multi_chr_list(chr_name, multi_count, hap_suffix):
238+
def multi_chr_list(chr_name, multi_count):
235239
"""
236240
Adds the suffix "A", "B", "C" etc... to the supplied chromosome name
237241
for when there are multiple chromosomes in a group.
238242
"""
239243
if multi_count == 1:
240-
return [chr_name + hap_suffix]
244+
return [chr_name]
241245
else:
242246
chr_list = []
243247
for ltr in range(ord("A"), ord("A") + multi_count):
244-
chr_list.append(chr_name + chr(ltr) + hap_suffix)
248+
chr_list.append(chr_name + chr(ltr))
245249
return chr_list
246250

247251
def max_hap_set_count(self):
@@ -262,13 +266,15 @@ def name_chromosome(self, chr_prefix, chr_n):
262266
hap_suffix = (
263267
"" if (hap_name is None or hap_name == "Primary") else f"_{hap_name}"
264268
)
265-
chr_names = self.multi_chr_list(
266-
chr_prefix + str(chr_n), len(hap_set), hap_suffix
267-
)
269+
chr_names = self.multi_chr_list(str(chr_n), len(hap_set))
268270
for orig, scffld_list in hap_set.items():
269271
this_chr = chr_names.pop(0)
270272
for scffld in scffld_list:
271-
scffld.name = scffld.name.replace(orig, this_chr)
273+
scffld.name = scffld.name.replace(
274+
orig,
275+
chr_prefix + this_chr + hap_suffix,
276+
)
277+
scffld.chr_name = this_chr
272278

273279

274280
class ChrNamerError(Exception):
@@ -293,7 +299,7 @@ def __init__(self, chr_prefix="SUPER_"):
293299
self.chr_prefix = chr_prefix
294300
self.scaffolds = []
295301
self.haplotypes_seen = {}
296-
self.groups = None
302+
self.groups: list[ChrGroup] | None = None
297303

298304
def add_scaffold(self, haplotype, scffld):
299305
# A dict is used to store the haplotypes seen since order is

src/tola/assembly/overlap_result.py

Lines changed: 9 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -9,18 +9,19 @@ class OverlapResult(Scaffold):
99
def __init__(
1010
self,
1111
bait,
12-
rows,
1312
start,
1413
end,
14+
rows,
1515
name=None,
16-
tag=None,
17-
haplotype=None,
18-
rank=None,
19-
original_name=None,
16+
**args,
2017
):
2118
if not name:
2219
name = f"matches to {bait.name} {bait.start} to {bait.end}"
23-
super().__init__(name, rows, tag, haplotype, rank, original_name)
20+
super().__init__(
21+
name,
22+
rows,
23+
**args,
24+
)
2425
self.bait = bait
2526
self.start = start
2627
self.end = end
@@ -174,12 +175,8 @@ def append_scaffold(self):
174175
raise NotImplementedError
175176

176177
def to_scaffold(self) -> Scaffold:
177-
scffld = Scaffold(
178-
self.name,
179-
self.rows,
180-
original_name=self.original_name,
181-
original_tags=self.original_tags,
182-
)
178+
scffld = self.clone_empty()
179+
scffld.rows = self.rows
183180
if self.bait.strand == -1:
184181
return scffld.reverse()
185182
else:

src/tola/assembly/scaffold.py

Lines changed: 20 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -16,6 +16,8 @@ def __init__(
1616
rank=0,
1717
original_name=None,
1818
original_tags: set[str] | None = None,
19+
localised: bool = False,
20+
chr_name: str | None = None,
1921
):
2022
self.name = str(name)
2123
if rows:
@@ -27,6 +29,24 @@ def __init__(
2729
self.rank: int = rank
2830
self.original_name: str | None = original_name
2931
self.original_tags: set[str] | None = original_tags
32+
self.localised = localised
33+
self.chr_name = chr_name
34+
35+
def clone_empty(self) -> 'Scaffold':
36+
"""
37+
Clones a `Scaffold` or derived class, returning a base `Scaffold` with
38+
all of its attributes apart from `rows`.
39+
"""
40+
return Scaffold(
41+
self.name,
42+
tag=self.tag,
43+
haplotype=self.haplotype,
44+
rank=self.rank,
45+
original_name=self.original_name,
46+
original_tags=self.original_tags,
47+
localised=self.localised,
48+
chr_name=self.chr_name,
49+
)
3050

3151
def __repr__(self):
3252
txt = io.StringIO()

tests/build_assembly_test.py

Lines changed: 2 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -14,12 +14,8 @@
1414

1515

1616
def test_multi_chr_list():
17-
assert ChrGroup.multi_chr_list("SUPER_3", 1, "_HAP2") == ["SUPER_3_HAP2"]
18-
assert ChrGroup.multi_chr_list("SUPER_3", 3, "_HAP1") == [
19-
"SUPER_3A_HAP1",
20-
"SUPER_3B_HAP1",
21-
"SUPER_3C_HAP1",
22-
]
17+
assert ChrGroup.multi_chr_list("3", 1) == ["3"]
18+
assert ChrGroup.multi_chr_list("3", 3) == ["3A", "3B", "3C"]
2319

2420

2521
def list_chr_naming_tests():

0 commit comments

Comments
 (0)