# Source: Agentic-Spatial-Pathologist/main.py (hutaobo/Agentic-Spatial-Pathologist, branch main)
from __future__ import annotations

import json
import os
import shutil
import time
import traceback
from dataclasses import asdict, dataclass
from datetime import datetime
from pathlib import Path
from typing import Any
import uuid


def bootstrap_runtime_env() -> None:
    """Give cache/config environment variables writable defaults.

    Variables that are already set are left untouched (``setdefault``).
    Every variable that names a directory is then created on disk so that
    libraries imported afterwards can write to it.
    """
    defaults = {
        "HOME": "/tmp",
        "XDG_CACHE_HOME": "/tmp/.cache",
        "MPLCONFIGDIR": "/tmp/matplotlib",
        "MPLBACKEND": "Agg",
        "GRADIO_TEMP_DIR": "/tmp/gradio",
    }
    for name, fallback in defaults.items():
        os.environ.setdefault(name, fallback)

    # MPLBACKEND holds a backend name rather than a path, so no directory is made for it.
    directory_vars = [name for name in defaults if name != "MPLBACKEND"]
    for name in directory_vars:
        Path(os.environ[name]).mkdir(parents=True, exist_ok=True)


# Runs at import time, ahead of the third-party imports below, so those
# libraries see the writable cache/config locations when they are loaded.
bootstrap_runtime_env()

import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from skimage import measure
from scipy.cluster.hierarchy import dendrogram, fcluster, leaves_list, linkage, to_tree
from scipy.ndimage import (
    binary_closing,
    binary_fill_holes,
    binary_opening,
    distance_transform_edt,
    gaussian_filter,
    generate_binary_structure,
    label as nd_label,
)
from scipy.spatial.distance import squareform

try:
    import pyarrow.parquet as pq
except Exception:  # pragma: no cover - optional runtime helper
    pq = None

try:
    from histoseg import Pattern1IsolineConfig
    from histoseg.contours.pattern1_isoline import (
        Pattern1IsolineResult,
        _normalize_cluster_label,
        _validate_label_scheme,
        align_clusters_with_cells,
        extract_contour_paths,
    )
    from histoseg.sfplot.Searcher_Findee_Score import (
        compute_cophenetic_from_distance_matrix,
        compute_searcher_findee_distance_matrix_from_df,
        plot_cophenetic_heatmap,
    )

    HISTOSEG_IMPORT_ERROR = None
except Exception as exc:  # pragma: no cover - startup fallback only
    Pattern1IsolineConfig = None  # type: ignore[assignment]
    Pattern1IsolineResult = None  # type: ignore[assignment]
    _normalize_cluster_label = None  # type: ignore[assignment]
    _validate_label_scheme = None  # type: ignore[assignment]
    align_clusters_with_cells = None  # type: ignore[assignment]
    extract_contour_paths = None  # type: ignore[assignment]
    compute_cophenetic_from_distance_matrix = None  # type: ignore[assignment]
    compute_searcher_findee_distance_matrix_from_df = None  # type: ignore[assignment]
    plot_cophenetic_heatmap = None  # type: ignore[assignment]
    HISTOSEG_IMPORT_ERROR = str(exc)


# Application title and one-paragraph description.
APP_NAME = "Agentic Spatial Pathologist"
APP_DESCRIPTION = (
    "A dendrogram-guided Xenium analysis workspace that turns related clusters into interpretable "
    "spatial structures before running the final HistoSeg contour analysis."
)
# Default comma-separated cluster IDs (same format parse_pattern1_clusters accepts).
DEFAULT_PATTERN1 = "10,23,19,27,14,20,25,26"
# Message returned when no checklist structure is selected.
GROUP_SELECTION_EMPTY = (
    "No structures selected yet. Use the checklist below, or type one structure per line manually."
)
# Usage notes appended to GROUP_SELECTION_EMPTY by structure_selection_help_text().
SELECTION_NOTES_TEXT = (
    "Choose one or more structures in the checklist, or type cluster IDs manually below. "
    "Nothing is rerun while you are choosing. The app reads your final selection only when you click "
    "'Run multi-structure contour analysis'. If the text box is non-empty, the manual lines take priority."
)


def structure_selection_help_text() -> str:
    """Return the empty-selection message followed by the usage notes.

    The two module-level texts are separated by one blank line.
    """
    sections = (GROUP_SELECTION_EMPTY, SELECTION_NOTES_TEXT)
    return "\n\n".join(sections)
# Pixel size in micrometres, going by the name.
# NOTE(review): value not derived in this file - confirm against the Xenium documentation.
XENIUM_PIXEL_SIZE_UM = 0.2125
# Colours handed out per structure; group_color() cycles through this list by group id.
GROUP_PALETTE = [
    "#6EF0D4",
    "#78B9FF",
    "#FFB870",
    "#C8A2FF",
    "#FF8DA1",
    "#90F184",
    "#FFD76C",
    "#80E1FF",
    "#F4A6FF",
    "#FFA07A",
    "#B8F0DE",
    "#A7BFFF",
]
# Default parameters for the structure isoline step.
# NOTE(review): the consumer of this dict is outside this section; the key
# meanings are presumably those suggested by their names - confirm at the call site.
DEFAULT_STRUCTURE_ISOLINE_CFG = {
    "bins_x": 900,
    "bins_y": 700,
    "gaussian_sigma": 2.25,
    "density_scale_quantile": 0.98,
    "support_quantile": 0.18,
    "tissue_quantile": 0.06,
    "min_dominance": 0.34,
    "closing_iterations": 2,
    "opening_iterations": 1,
    "fill_holes": True,
    "min_cells": 500,
    "min_component_pixels": 180,
}
# Work directory candidates tried in this order by resolve_work_dir():
# the APP_DATA_DIR environment variable (default "./project-vol"), then /tmp.
PREFERRED_WORK_DIR = Path(os.environ.get("APP_DATA_DIR", "./project-vol")).resolve()
FALLBACK_WORK_DIR = Path("/tmp/project-vol")


@dataclass(frozen=True)
class RuntimeProfile:
    """Immutable set of effective runtime settings, as built by choose_runtime_profile()."""

    # Grid resolution after any automatic cap has been applied.
    grid_n: int
    # Upper bound on background points.
    bg_max_points: int
    # Synthetic background density after any automatic cap.
    syn_bg_density: float
    # Lower / upper bounds for the synthetic background point count.
    syn_bg_min: int
    syn_bg_max: int
    # Dataset size tier, e.g. "small", "medium", "large".
    scale_label: str
    # Human-readable messages describing any automatic adjustment that was made.
    notes: tuple[str, ...]


def resolve_work_dir() -> Path:
    """Return the first candidate work directory that is actually writable.

    Each candidate is created if needed and probed by writing and deleting
    a small marker file. Any ``OSError`` moves on to the next candidate.

    Raises:
        PermissionError: if neither candidate can be written to.
    """
    for work_dir in (PREFERRED_WORK_DIR, FALLBACK_WORK_DIR):
        marker = work_dir / ".write_test"
        try:
            work_dir.mkdir(parents=True, exist_ok=True)
            marker.write_text("ok", encoding="utf-8")
            marker.unlink()
        except OSError:
            continue
        return work_dir

    raise PermissionError(
        f"Could not find a writable work directory. Tried: {PREFERRED_WORK_DIR} and {FALLBACK_WORK_DIR}"
    )


# Resolved once at import time; raises PermissionError if no candidate is writable.
DEFAULT_WORK_DIR = resolve_work_dir()
# Per-run output folders ("run-*") and per-selection folders ("selection-*") live here.
RUNS_DIR = DEFAULT_WORK_DIR / "runs"
SELECTIONS_DIR = DEFAULT_WORK_DIR / "structure-selections"


def ensure_workdirs() -> None:
    """Create the runs and selections directories if they do not exist yet."""
    for directory in (RUNS_DIR, SELECTIONS_DIR):
        directory.mkdir(parents=True, exist_ok=True)


def log_event(message: str) -> None:
    """Print *message* to stdout prefixed with a local ``[YYYY-MM-DD HH:MM:SS]`` timestamp.

    The output is flushed immediately.
    """
    timestamp = f"{datetime.now():%Y-%m-%d %H:%M:%S}"
    print(f"[{timestamp}] {message}", flush=True)


def to_internal_label_scheme(label_scheme: str) -> str:
    """Map a label scheme to its internal name.

    ``None`` maps to ``"p1_is_one"``; any other value is passed to
    HistoSeg's ``_validate_label_scheme`` and its result is returned.
    """
    return "p1_is_one" if label_scheme is None else _validate_label_scheme(label_scheme)


def describe_label_scheme(label_scheme: str) -> str:
    """Return a one-sentence, user-facing description of *label_scheme*.

    The ``"p1_is_one"`` scheme is described as signal; every other scheme
    is described as background.
    """
    treats_selection_as_signal = to_internal_label_scheme(label_scheme) == "p1_is_one"
    return (
        "Selected structures are treated as the signal of interest"
        if treats_selection_as_signal
        else "Selected structures are treated as background"
    )


def parse_pattern1_clusters(raw: str) -> list[int | str]:
    """Parse a comma-separated list of cluster IDs.

    Tokens are stripped and empty tokens are skipped. A token made of
    decimal digits with at most one leading ``-`` becomes an ``int``;
    anything else is kept as a string label.

    Args:
        raw: Text such as ``"10, 23, tumor, -1"``.

    Returns:
        The parsed IDs in input order.

    Raises:
        ValueError: if no non-empty token is present.
    """
    values: list[int | str] = []
    for item in raw.split(","):
        token = item.strip()
        if not token:
            continue
        # Allow exactly one sign character. The previous lstrip("-") check
        # also accepted "--5", which then made int() raise an unrelated error.
        digits = token[1:] if token.startswith("-") else token
        # isdecimal() matches what int() can parse; isdigit() also accepts
        # characters such as "²", which int() rejects.
        if digits.isdecimal():
            values.append(int(token))
        else:
            values.append(token)
    if not values:
        raise ValueError("Clusters to outline cannot be empty.")
    return values


def parse_optional_clusters(raw: str) -> list[int | str]:
    """Parse a cluster list that is allowed to be missing.

    ``None`` and blank text give an empty list; anything else is handed to
    ``parse_pattern1_clusters``.
    """
    text = "" if raw is None else str(raw)
    if not text.strip():
        return []
    return parse_pattern1_clusters(text)


def stringify_clusters(clusters: list[int | str]) -> str:
    """Join cluster IDs into one comma-separated string with no spaces."""
    return ",".join(map(str, clusters))


def parse_structure_cluster_groups(raw: str) -> list[list[int | str]]:
    """Parse multi-structure text into one cluster list per structure.

    Structures are separated by newlines or ``;``. A line may carry a
    ``label:`` prefix; everything up to the first colon is discarded. Each
    remaining line is parsed with ``parse_pattern1_clusters``.

    Raises:
        ValueError: if the text is missing or blank, if no structure line is
            found, or (from ``parse_pattern1_clusters``) if a line holds no IDs.
    """
    text = "" if raw is None else str(raw)
    if not text.strip():
        raise ValueError("Please select one or more structures, or type cluster IDs with one structure per line.")

    groups: list[list[int | str]] = []
    for candidate in text.replace(";", "\n").splitlines():
        entry = candidate.strip()
        if not entry:
            continue
        # Drop an optional "label:" prefix; only the first colon splits.
        _label, separator, remainder = entry.partition(":")
        if separator:
            entry = remainder.strip()
        clusters = parse_pattern1_clusters(entry)
        if clusters:
            groups.append(clusters)

    if not groups:
        raise ValueError("No valid structure groups were found. Use one line per structure, for example '10,23,19'.")
    return groups


def stringify_structure_cluster_groups(cluster_groups: list[list[int | str]]) -> str:
    """Render cluster groups as text: one comma-separated group per line."""
    rendered_lines = [stringify_clusters(group) for group in cluster_groups]
    return "\n".join(rendered_lines)


def summarize_clusters(clusters: list[str], max_items: int = 8) -> str:
    """Return a comma-separated preview of *clusters*.

    At most *max_items* entries are shown; when more exist, the preview
    ends with ``", ... (+N more)"`` where N is the number left out.
    """
    hidden = len(clusters) - max_items
    if hidden <= 0:
        return ", ".join(clusters)
    preview = ", ".join(clusters[:max_items])
    return f"{preview}, ... (+{hidden} more)"


def safe_count_parquet_rows(parquet_path: Path) -> int | None:
    """Return the row count stored in a parquet file's metadata.

    Returns ``None`` when pyarrow is unavailable or when reading the file
    fails for any reason; this helper never raises.
    """
    if pq is None:
        return None
    try:
        metadata = pq.ParquetFile(parquet_path).metadata
        return int(metadata.num_rows)
    except Exception:
        # Best-effort estimate only: callers treat None as "unknown".
        return None


def safe_count_csv_rows(csv_path: Path) -> int | None:
    """Count the data rows of a CSV file (physical lines minus one header line).

    The result is never negative. Returns ``None`` if the file cannot be
    read; undecodable bytes are ignored while counting.
    """
    try:
        with csv_path.open("r", encoding="utf-8", errors="ignore") as handle:
            line_total = 0
            for _ in handle:
                line_total += 1
    except Exception:
        return None
    # Subtract the header line; an empty file still reports zero rows.
    return max(line_total - 1, 0)


def choose_runtime_profile(
    *,
    requested_grid_n: int,
    requested_syn_bg_density: float,
    use_synth_bg: bool,
    estimated_rows: int | None,
) -> RuntimeProfile:
    """Pick effective runtime settings for the estimated dataset size.

    The requested grid size and synthetic background density are capped
    according to the first size tier whose row threshold is reached. Below
    the smallest threshold (or when the row count is unknown) the requested
    values are kept and the widest limits apply.

    Returns:
        A ``RuntimeProfile`` whose ``notes`` describe each automatic
        reduction. The density note is only added when *use_synth_bg* is true.
    """
    # (min rows, label, grid cap, bg_max_points, density cap, syn_bg_min, syn_bg_max),
    # ordered from the largest tier to the smallest.
    size_tiers = (
        (80000, "large", 450, 12000, 0.0015, 4000, 12000),
        (40000, "medium-large", 550, 18000, 0.0025, 5000, 18000),
        (20000, "medium", 650, 25000, 0.0035, 8000, 25000),
        (10000, "small-medium", 800, 35000, 0.005, 12000, 35000),
    )

    # Defaults for the "small" tier: nothing is capped.
    grid = int(requested_grid_n)
    density = float(requested_syn_bg_density)
    scale_label = "small"
    bg_max_points = 60000
    syn_bg_min = 20000
    syn_bg_max = 120000

    row_count = estimated_rows or 0
    for min_rows, label, grid_cap, bg_cap, density_cap, tier_min, tier_max in size_tiers:
        if row_count >= min_rows:
            scale_label = label
            grid = min(grid, grid_cap)
            bg_max_points = bg_cap
            density = min(density, density_cap)
            syn_bg_min = tier_min
            syn_bg_max = tier_max
            break

    notes: list[str] = []
    if grid != int(requested_grid_n):
        notes.append(
            f"Auto-reduced grid_n from {requested_grid_n} to {grid} for Serve runtime stability."
        )
    if use_synth_bg and density != float(requested_syn_bg_density):
        notes.append(
            f"Auto-reduced synthetic background density from {requested_syn_bg_density:.4f} to {density:.4f}."
        )

    return RuntimeProfile(
        grid_n=grid,
        bg_max_points=bg_max_points,
        syn_bg_density=density,
        syn_bg_min=syn_bg_min,
        syn_bg_max=syn_bg_max,
        scale_label=scale_label,
        notes=tuple(notes),
    )


def stage_uploaded_file(uploaded: object | None, target_dir: Path, explicit_name: str | None = None) -> Path | None:
    """Copy an uploaded file into *target_dir*.

    Args:
        uploaded: Path-like reference to the upload (converted with ``str``),
            or ``None`` when nothing was uploaded.
        target_dir: Directory that receives the copy.
        explicit_name: Optional file name for the copy; defaults to the
            upload's own name.

    Returns:
        The destination path, or ``None`` when *uploaded* is ``None``.

    Raises:
        FileNotFoundError: if the uploaded path does not exist.
    """
    if uploaded is None:
        return None

    upload_path = Path(str(uploaded))
    if not upload_path.exists():
        raise FileNotFoundError(f"Uploaded file not found: {upload_path}")

    staged_path = target_dir / (explicit_name or upload_path.name)
    shutil.copy2(upload_path, staged_path)
    return staged_path


def resolve_inputs(
    *,
    cells_upload: object | None,
    clusters_upload: object | None,
    tissue_upload: object | None,
    target_dir: Path,
) -> tuple[Path, Path, Path | None]:
    """Stage the three uploads into *target_dir* and check the required ones.

    All uploads are staged first, in the order cells, clusters, tissue;
    only afterwards are the required inputs checked.

    Returns:
        ``(cells_path, clusters_path, tissue_path)``; the tissue path is
        ``None`` when no tissue file was uploaded.

    Raises:
        ValueError: if the cells or clusters upload is missing.
    """
    uploads = (cells_upload, clusters_upload, tissue_upload)
    cells_path, clusters_path, tissue_path = [
        stage_uploaded_file(upload, target_dir) for upload in uploads
    ]

    if cells_path is None:
        raise ValueError("Missing cells.parquet. Please upload the cell coordinate file.")
    if clusters_path is None:
        raise ValueError("Missing clusters.csv. Please upload the cluster assignment file.")

    return cells_path, clusters_path, tissue_path


def _create_stamped_dir(parent: Path, prefix: str) -> Path:
    """Create and return a new ``<prefix>-<timestamp>[-N]`` directory under *parent*.

    The directory is claimed by the ``mkdir`` call itself rather than by an
    ``exists()`` check followed by ``mkdir``, so two requests arriving in
    the same second cannot both pick the same name. On a collision the
    numeric suffix starts at 2 and increases.
    """
    stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    candidate = parent / f"{prefix}-{stamp}"
    suffix = 1
    while True:
        try:
            candidate.mkdir(parents=True, exist_ok=False)
        except FileExistsError:
            suffix += 1
            candidate = parent / f"{prefix}-{stamp}-{suffix}"
        else:
            return candidate


def build_run_dir() -> Path:
    """Create a fresh, uniquely named ``run-*`` directory inside ``RUNS_DIR``."""
    ensure_workdirs()
    return _create_stamped_dir(RUNS_DIR, "run")


def build_selection_dir() -> Path:
    """Create a fresh, uniquely named ``selection-*`` directory inside ``SELECTIONS_DIR``."""
    ensure_workdirs()
    return _create_stamped_dir(SELECTIONS_DIR, "selection")


def _prune_stale_dirs(parent: Path, pattern: str, max_keep: int) -> list[str]:
    """Delete all but the *max_keep* most recently modified directories matching *pattern*.

    Returns the names of the directories that were removed. Directories
    that cannot be removed are skipped silently (best-effort cleanup).
    """

    def modified_at(path: Path) -> float:
        # A directory can disappear between the glob and the sort, for
        # example when another request cleans up at the same time. Treat it
        # as the oldest entry instead of letting stat() abort the cleanup.
        try:
            return path.stat().st_mtime
        except OSError:
            return float("-inf")

    newest_first = sorted(
        (path for path in parent.glob(pattern) if path.is_dir()),
        key=modified_at,
        reverse=True,
    )
    removed: list[str] = []
    for stale in newest_first[max_keep:]:
        try:
            shutil.rmtree(stale)
        except OSError:
            continue
        removed.append(stale.name)
    return removed


def cleanup_old_runs(max_keep: int = 2) -> list[str]:
    """Keep the *max_keep* newest ``run-*`` directories; return the names removed."""
    ensure_workdirs()
    return _prune_stale_dirs(RUNS_DIR, "run-*", max_keep)


def cleanup_old_selections(max_keep: int = 2) -> list[str]:
    """Keep the *max_keep* newest ``selection-*`` directories; return the names removed."""
    ensure_workdirs()
    return _prune_stale_dirs(SELECTIONS_DIR, "selection-*", max_keep)


def directory_size_bytes(root: Path) -> int:
    """Return the combined size in bytes of every regular file below *root*.

    Files whose size cannot be read are left out of the total.
    """
    sizes: list[int] = []
    for entry in root.rglob("*"):
        if not entry.is_file():
            continue
        try:
            sizes.append(entry.stat().st_size)
        except OSError:
            # Unreadable or vanished file: skip it.
            pass
    return sum(sizes)


def zip_outputs(output_dir: Path, archive_dir: Path | None = None) -> tuple[Path | None, str | None]:
    """Zip *output_dir* into ``histoseg_outputs.zip``.

    Args:
        output_dir: Directory whose contents are archived.
        archive_dir: Where the archive is written; defaults to *output_dir*.

    Returns:
        ``(archive_path, None)`` on success, or ``(None, message)`` when the
        archive was skipped because of low or exhausted disk space.

    Raises:
        OSError: any archiving error other than "no space left on device".
    """
    # NOTE(review): with the default archive_dir the zip is written inside the
    # directory being zipped - confirm the archive does not end up containing itself.
    destination_dir = output_dir if archive_dir is None else archive_dir
    destination_dir.mkdir(parents=True, exist_ok=True)
    archive_stem = destination_dir / "histoseg_outputs"
    zip_path = Path(f"{archive_stem}.zip")

    payload_bytes = directory_size_bytes(output_dir)
    available_bytes = shutil.disk_usage(destination_dir).free

    # Zipping temporarily duplicates the payload, so require twice its size
    # and never less than 256 MiB of free space.
    minimum_free = max(payload_bytes * 2, 256 * 1024 * 1024)
    if available_bytes < minimum_free:
        return None, (
            "Skipped zip archive because disk space is low on the Serve instance. "
            "The raw output files are still available below."
        )

    try:
        created = shutil.make_archive(str(archive_stem), "zip", root_dir=output_dir)
    except OSError as exc:
        # Remove any partial archive before deciding how to report the failure.
        try:
            zip_path.unlink(missing_ok=True)
        except OSError:
            pass
        # errno 28 is ENOSPC ("No space left on device"); anything else propagates.
        if getattr(exc, "errno", None) != 28:
            raise
        return None, (
            "Skipped zip archive because the Serve instance ran out of disk space. "
            "The raw output files are still available below."
        )
    return Path(created), None


def prepare_merged_clusters(cells_path: Path, clusters_path: Path) -> tuple[pd.DataFrame, str, str, str]:
    """Join cluster assignments with cell coordinates and clean the labels.

    The join is delegated to HistoSeg's ``align_clusters_with_cells`` using
    the ``Barcode`` and ``Cluster`` columns. The ``cluster`` column of the
    result is normalised, and rows whose label normalises to an empty
    string are dropped.

    Returns:
        ``(merged, id_col_used, x_col, y_col)`` - the cleaned frame followed
        by the three column names reported by the alignment helper.
    """
    aligned, id_col_used, x_col, y_col = align_clusters_with_cells(
        clusters_path,
        cells_path,
        barcode_col="Barcode",
        cluster_col="Cluster",
    )
    frame = aligned.copy()
    frame["cluster"] = frame["cluster"].map(_normalize_cluster_label)
    has_label = frame["cluster"] != ""
    return frame.loc[has_label].copy(), id_col_used, x_col, y_col


def normalize_row_cophenetic(row_coph: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *row_coph* whose labels are normalised cluster labels.

    The normalised row labels are used for both the index and the columns.
    The input frame is not modified.
    """
    clean_labels = list(map(_normalize_cluster_label, row_coph.index))
    relabelled = row_coph.copy()
    relabelled.index = clean_labels
    relabelled.columns = clean_labels
    return relabelled


def remap_flat_clusters_by_leaf_order(cluster_ids: list[str], linkage_matrix, flat_labels) -> dict[str, int]:
    """Renumber flat cluster labels by dendrogram leaf order.

    Walking the leaves in the order given by ``leaves_list``, the first
    flat label met becomes group 1, the next new label group 2, and so on.

    Args:
        cluster_ids: Cluster IDs in the same order as the linkage observations.
        linkage_matrix: SciPy linkage matrix, or ``None`` for a single cluster.
        flat_labels: Flat label per cluster ID, aligned with *cluster_ids*.

    Returns:
        Mapping of cluster ID to its renumbered group. With no linkage
        matrix the first cluster ID is mapped to group 1.
    """
    if linkage_matrix is None:
        return {cluster_ids[0]: 1}

    raw_by_cluster = {str(cluster_id): int(raw) for cluster_id, raw in zip(cluster_ids, flat_labels)}

    # Give each raw label the rank of its first appearance along the leaves.
    rank_by_raw: dict[int, int] = {}
    for leaf_index in leaves_list(linkage_matrix):
        raw = raw_by_cluster[str(cluster_ids[leaf_index])]
        rank_by_raw.setdefault(raw, len(rank_by_raw) + 1)

    return {
        str(cluster_id): int(rank_by_raw[raw_by_cluster[str(cluster_id)]])
        for cluster_id in cluster_ids
    }


def group_color(group_id: int) -> str:
    """Return the palette colour for a 1-based group id, cycling when ids exceed the palette."""
    palette_slot = (int(group_id) - 1) % len(GROUP_PALETTE)
    return GROUP_PALETTE[palette_slot]


def build_structure_choice_label(group_id: int, clusters: list[str]) -> str:
    """Build the checklist label for a structure from its id and cluster count."""
    cluster_total = len(clusters)
    return f"Structure {group_id} | {cluster_total} cluster IDs"


def build_group_table(
    group_state: dict[str, object] | None,
    selected_groups: list[str] | None,
) -> pd.DataFrame:
    """Build the structure overview table from the stored group records.

    Args:
        group_state: State dict holding ``"group_records"``; a missing or
            empty state gives an empty table.
        selected_groups: Accepted for interface compatibility; not used here.

    Returns:
        A frame with the columns ``Structure``, ``Cluster count`` and
        ``Cluster IDs``, one row per group record.
    """
    columns = ["Structure", "Cluster count", "Cluster IDs"]
    records = group_state.get("group_records", []) if group_state else []
    rows: list[dict[str, object]] = [
        {
            "Structure": str(record["group_name"]),
            "Cluster count": int(record["cluster_count"]),
            "Cluster IDs": ", ".join(str(item) for item in record["clusters"]),
        }
        for record in records
    ]
    return pd.DataFrame(rows, columns=columns)


def build_structure_group_state(
    row_coph: pd.DataFrame,
    *,
    n_groups: int,
    linkage_method: str = "average",
) -> dict[str, object]:
    """Cut the cluster dendrogram into structures and describe each one.

    Args:
        row_coph: Square distance matrix between clusters; its index
            supplies the cluster IDs (converted to ``str``).
        n_groups: Requested number of structures; clamped to
            ``1..len(clusters)`` when more than one cluster is present.
        linkage_method: Method name passed to ``scipy`` ``linkage``.

    Returns:
        A JSON-friendly state dict holding the leaf-ordered clusters, the
        checklist choices, the choice-to-cluster mapping, table rows, one
        record per structure (colour, leaf span, dendrogram coordinates),
        the raw matrix and the linkage matrix (``None`` for one cluster).

    Raises:
        ValueError: if *row_coph* has no rows.
    """
    cluster_ids = [str(value) for value in row_coph.index]
    if not cluster_ids:
        raise ValueError("No clusters were available for dendrogram building.")

    cluster_to_leaf_index = {cluster_id: index for index, cluster_id in enumerate(cluster_ids)}
    # Maps a set of leaf indices to the dendrogram node that spans exactly those leaves.
    leaf_set_to_node: dict[frozenset[int], dict[str, float]] = {}
    if len(cluster_ids) == 1:
        # A single cluster has no dendrogram: one structure, one leaf.
        ordered_clusters = cluster_ids
        group_to_clusters: dict[int, list[str]] = {1: ordered_clusters}
        linkage_matrix = None
        leaf_positions = {0: 5.0}
        leaf_set_to_node[frozenset({0})] = {"node_id": 0.0, "dist": 0.0}
    else:
        condensed = squareform(row_coph.values, checks=False)
        linkage_matrix = linkage(condensed, method=linkage_method)
        n_groups = max(1, min(int(n_groups), len(cluster_ids)))
        flat_labels = fcluster(linkage_matrix, t=n_groups, criterion="maxclust")
        cluster_to_group = remap_flat_clusters_by_leaf_order(cluster_ids, linkage_matrix, flat_labels)
        leaf_order = [int(index) for index in leaves_list(linkage_matrix)]
        ordered_clusters = [cluster_ids[index] for index in leaf_order]
        group_to_clusters = {}
        for cluster_id in ordered_clusters:
            group_id = int(cluster_to_group[cluster_id])
            group_to_clusters.setdefault(group_id, []).append(cluster_id)
        # Leaf x-coordinates follow the 5, 15, 25, ... spacing used by scipy's dendrogram.
        leaf_positions = {leaf_id: 5.0 + 10.0 * order_index for order_index, leaf_id in enumerate(leaf_order)}

        # to_tree(rd=True) returns the nodes indexed by id. A merged node
        # always has a larger id than both of its children, so one forward
        # pass can build every leaf set. This replaces a second to_tree
        # call and a recursive walk.
        _root_node, node_list = to_tree(linkage_matrix, rd=True)
        node_leaf_map: dict[int, list[int]] = {}
        for node in node_list:
            node_id = int(node.id)
            if node.is_leaf():
                leaves = [node_id]
            else:
                leaves = node_leaf_map[int(node.left.id)] + node_leaf_map[int(node.right.id)]
            node_leaf_map[node_id] = leaves
            leaf_set_to_node[frozenset(leaves)] = {
                "node_id": float(node.id),
                "dist": float(node.dist),
            }

    ordered_cluster_to_position = {cluster_id: index for index, cluster_id in enumerate(ordered_clusters)}

    choices: list[str] = []
    choice_to_clusters: dict[str, list[str]] = {}
    table_rows: list[dict[str, object]] = []
    group_records: list[dict[str, Any]] = []
    for group_id, clusters in sorted(group_to_clusters.items()):
        choice_label = build_structure_choice_label(group_id, clusters)
        leaf_ids = [int(cluster_to_leaf_index[cluster_id]) for cluster_id in clusters]
        x_points = [float(leaf_positions[leaf_id]) for leaf_id in leaf_ids]
        leaf_span = sorted(int(ordered_cluster_to_position[cluster_id]) for cluster_id in clusters)
        # Fall back to height 0 when no single node spans exactly this group's leaves.
        node_summary = leaf_set_to_node.get(frozenset(leaf_ids), {"node_id": float(group_id), "dist": 0.0})
        span_left = min(x_points) - 5.0
        span_right = max(x_points) + 5.0
        y_data = float(node_summary["dist"])
        marker_y = y_data
        color = group_color(group_id)

        choices.append(choice_label)
        choice_to_clusters[choice_label] = list(clusters)
        table_rows.append(
            {
                "Selected": "",
                "Structure": f"Structure {group_id}",
                "Cluster count": len(clusters),
                "Cluster IDs": ", ".join(clusters),
            }
        )
        group_records.append(
            {
                "group_id": int(group_id),
                "group_name": f"Structure {group_id}",
                "choice_label": choice_label,
                "clusters": list(clusters),
                "cluster_count": int(len(clusters)),
                "color": color,
                "leaf_start": int(min(leaf_span)),
                "leaf_end": int(max(leaf_span)),
                "span_left": float(span_left),
                "span_right": float(span_right),
                "x_data": float(np.mean(x_points)),
                "y_data": float(y_data),
                "marker_y": float(marker_y),
            }
        )

    return {
        "n_groups": len(group_to_clusters),
        "ordered_clusters": ordered_clusters,
        "choices": choices,
        "choice_to_clusters": choice_to_clusters,
        "table_rows": table_rows,
        "group_records": group_records,
        "row_coph_labels": cluster_ids,
        "row_coph_values": row_coph.to_numpy().tolist(),
        "linkage_matrix": linkage_matrix.tolist() if linkage_matrix is not None else None,
        "selected_groups": [],
    }


def collect_clusters_from_groups(selected_groups: list[str], group_state: dict[str, object] | None) -> list[str]:
    """Return the clusters of the selected structures in dendrogram leaf order.

    Args:
        selected_groups: Checklist labels chosen by the user (may be ``None``).
        group_state: State dict providing ``"choice_to_clusters"`` and
            ``"ordered_clusters"``; a missing state gives an empty list.

    Returns:
        Each cluster that belongs to at least one selected structure, once,
        in the order of ``ordered_clusters``.
    """
    if not group_state:
        return []
    choice_to_clusters = group_state.get("choice_to_clusters", {})
    ordered_clusters = group_state.get("ordered_clusters", [])
    selected_set = set(selected_groups or [])

    # Collect the wanted clusters once, instead of re-scanning every choice
    # (and the growing result list) for each ordered cluster.
    wanted: set[str] = set()
    for choice, clusters in choice_to_clusters.items():
        if choice in selected_set:
            wanted.update(clusters)

    # dict.fromkeys removes duplicates and keeps first-seen order.
    return list(dict.fromkeys(cluster_id for cluster_id in ordered_clusters if cluster_id in wanted))


def update_clusters_to_outline_from_groups(
    selected_groups: list[str] | None,
    group_state: dict[str, object] | None,
) -> tuple[str, str]:
    """Turn the checklist selection into text-box content and a summary.

    Returns:
        ``(cluster_text, summary)``. The text holds one comma-separated
        line per selected structure. The summary lists the structures
        numbered 1..N in selection order. With no valid selection the text
        is empty and the summary is the empty-selection message.
    """
    active_choices = normalize_selected_groups(selected_groups or [], group_state)
    if not active_choices:
        return "", GROUP_SELECTION_EMPTY

    lookup = (group_state or {}).get("choice_to_clusters", {})
    per_structure = [list(lookup.get(choice, [])) for choice in active_choices]

    summary_lines = [f"Selected {len(active_choices)} structure(s)."]
    # NOTE(review): numbering is positional (1..N), not the original structure id.
    summary_lines.extend(
        f"Structure {idx}: {summarize_clusters([str(item) for item in clusters])}"
        for idx, clusters in enumerate(per_structure, start=1)
    )
    return stringify_structure_cluster_groups(per_structure), "\n".join(summary_lines)


def _normalize_non_empty_lines(raw_text: str | None) -> list[str]:
    if raw_text is None:
        return []
    lines: list[str] = []
    for raw_line in str(raw_text).replace(";", "\n").splitlines():
        line = raw_line.strip()
        if line:
            lines.append(line)
    return lines


def sync_selected_groups_to_text(
    selected_groups: list[str] | None,
    current_text: str | None,
    previous_auto_lines: list[str] | None,
    group_state: dict[str, object] | None,
) -> tuple[str, str, list[str]]:
    """Merge the checklist selection into the text box, keeping manual lines.

    Lines in the text box that the previous sync generated are dropped.
    Whatever remains counts as manual input and comes first, followed by
    the lines for the current selection (duplicates are skipped).

    Returns:
        ``(text, summary, auto_lines)`` - the new text-box content, a status
        message, and the auto-generated lines to pass back as
        *previous_auto_lines* on the next call.
    """
    active_choices = normalize_selected_groups(selected_groups or [], group_state)
    auto_cluster_text, auto_summary = update_clusters_to_outline_from_groups(active_choices, group_state)
    next_auto_lines = _normalize_non_empty_lines(auto_cluster_text)
    stale_auto_lines = set(_normalize_non_empty_lines("\n".join(previous_auto_lines or [])))

    manual_lines = [
        line for line in _normalize_non_empty_lines(current_text) if line not in stale_auto_lines
    ]
    merged_lines = list(manual_lines)
    for auto_line in next_auto_lines:
        if auto_line not in merged_lines:
            merged_lines.append(auto_line)

    if not active_choices and not manual_lines:
        summary = structure_selection_help_text()
    elif not active_choices:
        summary = (
            "No checklist structures selected right now.\n\n"
            f"Manual lines still present in the text box: {len(manual_lines)}.\n"
            "Those manual lines will be used if you click Run."
        )
    else:
        if manual_lines:
            follow_up = (
                f"Manual lines kept in the text box: {len(manual_lines)}. You can still edit any line before Run."
            )
        else:
            follow_up = "Selected structures were copied into the text box below. You can edit them before Run."
        summary = "\n\n".join([auto_summary, follow_up])

    return "\n".join(merged_lines), summary, next_auto_lines


def normalize_selected_groups(
    selected_groups: list[str] | None,
    group_state: dict[str, object] | None,
) -> list[str]:
    """Filter the selection down to known choices, in the state's choice order.

    Labels missing from ``group_state["choices"]`` are dropped; a missing
    or empty state gives an empty list.
    """
    if not group_state:
        return []
    wanted = set(selected_groups or [])
    known_choices = group_state.get("choices", [])
    return list(filter(wanted.__contains__, known_choices))


def render_structure_selector_image(
    group_state: dict[str, object],
    selected_groups: list[str] | None,
) -> tuple[Path, dict[str, object]]:
    """Render the clickable dendrogram selector PNG and record badge hit areas.

    Draws the dendrogram from ``group_state["linkage_matrix"]`` (or a single
    vertical stub when no linkage is stored), then one colored badge per entry
    of ``group_state["group_records"]``. Selected structures get a stronger
    span highlight, a thicker stem and a larger, light-edged badge.

    Args:
        group_state: Selector state. Reads ``selector_output_dir``,
            ``ordered_clusters`` (both required), ``group_records`` and
            ``linkage_matrix`` (optional). Each record must provide
            ``group_id``, ``choice_label``, ``color``, ``x_data``, ``y_data``,
            ``span_left``, ``span_right`` and ``cluster_count``.
        selected_groups: Choice labels to highlight; normalized against the
            state's known choices first.

    Returns:
        ``(selector_path, next_state)`` where ``next_state`` is a shallow copy
        of ``group_state`` with ``selected_groups``, ``marker_positions`` and
        ``selector_path`` updated. ``marker_positions`` maps each choice label
        to its badge center measured from the top-left corner, both in pixels
        of the saved PNG (``x``/``y``/``radius``) and as fractions
        (``x_norm``/``y_norm``/``radius_norm``).
    """
    output_dir = Path(str(group_state["selector_output_dir"]))
    output_dir.mkdir(parents=True, exist_ok=True)

    selected_groups = normalize_selected_groups(selected_groups, group_state)
    selected_set = set(selected_groups)
    # Copy the records so adding "marker_y" below never mutates the caller's state.
    group_records = [dict(record) for record in group_state.get("group_records", [])]
    ordered_clusters = list(group_state["ordered_clusters"])
    linkage_payload = group_state.get("linkage_matrix")
    linkage_matrix = np.asarray(linkage_payload, dtype=float) if linkage_payload is not None else None

    selector_key = "none"
    if selected_set:
        selected_ids = [str(record["group_id"]) for record in group_records if record["choice_label"] in selected_set]
        # A selection that matches no record renders exactly like "none".
        selector_key = "_".join(selected_ids) or "none"
    selector_path = output_dir / f"interactive_structure_selector_{selector_key}.png"

    save_dpi = 180
    fig, ax_dendro = plt.subplots(figsize=(13.5, 5.8), facecolor="#07111D")
    try:
        ax_dendro.set_facecolor("#0C1726")

        if linkage_matrix is not None:
            dendrogram(
                linkage_matrix,
                no_labels=True,
                color_threshold=0,
                above_threshold_color="#6B8198",
                link_color_func=lambda _node_id: "#6B8198",
                ax=ax_dendro,
            )
            max_dist = float(np.max(linkage_matrix[:, 2])) if len(linkage_matrix) else 1.0
        else:
            # No linkage stored: draw a single stub so the axes are not empty.
            ax_dendro.plot([5.0, 5.0], [0.0, 1.0], color="#6B8198", linewidth=2.5)
            max_dist = 1.0

        # Badges float above their branch node by a margin that scales with the tree height.
        marker_offset = max(max_dist * 0.08, 0.24)
        marker_positions: dict[str, dict[str, float]] = {}
        marker_size = 310

        for record in group_records:
            is_selected = record["choice_label"] in selected_set
            color = str(record["color"])
            x_data = float(record["x_data"])
            y_data = float(record["y_data"])
            marker_y = y_data + marker_offset
            record["marker_y"] = marker_y

            # Background band covering the leaves that belong to this structure.
            ax_dendro.axvspan(
                float(record["span_left"]),
                float(record["span_right"]),
                color=color,
                alpha=0.22 if is_selected else 0.06,
                zorder=0,
            )
            # Stem from the branch node up to just below the badge.
            ax_dendro.plot(
                [x_data, x_data],
                [y_data, marker_y - marker_offset * 0.18],
                color=color,
                linewidth=2.1 if is_selected else 1.3,
                alpha=0.95,
                zorder=4,
            )
            ax_dendro.scatter(
                [x_data],
                [marker_y],
                s=marker_size + (95 if is_selected else 0),
                color=color,
                edgecolors="#F7FBFF" if is_selected else "#132236",
                linewidths=2.0,
                zorder=6,
            )
            ax_dendro.text(
                x_data,
                marker_y,
                f"S{record['group_id']}",
                ha="center",
                va="center",
                fontsize=10,
                fontweight="bold",
                color="#07111D",
                zorder=7,
            )
            ax_dendro.text(
                x_data,
                marker_y + marker_offset * 0.72,
                f"{record['cluster_count']} IDs",
                ha="center",
                va="bottom",
                fontsize=8.3,
                color="#D9E8F8" if is_selected else "#9CB0C7",
                zorder=7,
            )

        ax_dendro.text(
            0.015,
            0.96,
            "Click a colored structure badge to add or remove that branch from the final contour run.",
            transform=ax_dendro.transAxes,
            ha="left",
            va="top",
            fontsize=11,
            color="#E9F2FD",
            bbox=dict(boxstyle="round,pad=0.35", facecolor="#101C2B", edgecolor="#1C3550", alpha=0.97),
        )
        ax_dendro.set_title("Interactive structure selector", loc="left", fontsize=15, color="#F5F9FF", pad=12)
        ax_dendro.set_ylabel("Cophenetic distance", color="#A8BCD3")
        # scipy's dendrogram puts leaf i at x = 5 + 10 * i.
        leaf_positions = [5 + 10 * idx for idx in range(len(ordered_clusters))]
        if len(ordered_clusters) <= 18:
            tick_positions = leaf_positions
            tick_labels = ordered_clusters
        else:
            # Too many leaves to label: keep roughly a dozen evenly spaced ticks plus the last leaf.
            step = max(1, len(ordered_clusters) // 12)
            keep_indices = list(range(0, len(ordered_clusters), step))
            if keep_indices[-1] != len(ordered_clusters) - 1:
                keep_indices.append(len(ordered_clusters) - 1)
            tick_positions = [leaf_positions[idx] for idx in keep_indices]
            tick_labels = [ordered_clusters[idx] for idx in keep_indices]
        ax_dendro.set_xticks(tick_positions)
        ax_dendro.set_xticklabels(tick_labels, rotation=45, ha="right", fontsize=8, color="#A9BDD4")
        ax_dendro.tick_params(axis="x", colors="#90A6BF")
        ax_dendro.tick_params(axis="y", colors="#90A6BF")
        ax_dendro.set_xlabel("Cluster IDs ordered by the dendrogram", color="#A8BCD3", labelpad=10)
        for spine in ax_dendro.spines.values():
            spine.set_color("#20354A")
        ax_dendro.set_ylim(-marker_offset * 0.4, max_dist + marker_offset * 2.0)

        # Draw once so transData reflects the final axis limits and layout.
        fig.canvas.draw()
        width, height = fig.canvas.get_width_height()
        # The canvas is measured at fig.dpi but the PNG is written at save_dpi,
        # so canvas pixels must be rescaled to match pixels of the saved image.
        # NOTE(review): assumes pixel clicks are reported in saved-image pixels — confirm
        # against the UI event payload.
        pixel_scale = save_dpi / float(fig.dpi)
        for record in group_records:
            x_disp, y_disp = ax_dendro.transData.transform((float(record["x_data"]), float(record["marker_y"])))
            # Matplotlib's display origin is bottom-left; image coordinates start top-left.
            y_from_top = height - y_disp
            marker_positions[str(record["choice_label"])] = {
                "x": float(x_disp * pixel_scale),
                "y": float(y_from_top * pixel_scale),
                "x_norm": float(x_disp / width),
                "y_norm": float(y_from_top / height),
                "radius": 32.0,
                # Kept relative to the canvas size so the normalized click tolerance is unchanged.
                "radius_norm": float(32.0 / max(width, height)),
            }

        fig.savefig(selector_path, dpi=save_dpi, facecolor=fig.get_facecolor())
    finally:
        # Always release the pyplot figure, even when drawing or saving fails.
        plt.close(fig)

    next_state = dict(group_state)
    next_state["selected_groups"] = list(selected_groups)
    next_state["marker_positions"] = marker_positions
    next_state["selector_path"] = str(selector_path)
    return selector_path, next_state


def resolve_clicked_structure(
    click_index: object,
    group_state: dict[str, object] | None,
) -> str | None:
    if not group_state:
        return None

    marker_positions = group_state.get("marker_positions", {})
    if not marker_positions:
        return None

    candidate_points: list[tuple[float, float]] = []
    if isinstance(click_index, dict):
        if "x" in click_index and "y" in click_index:
            candidate_points.append((float(click_index["x"]), float(click_index["y"])))
    elif isinstance(click_index, (list, tuple)) and len(click_index) >= 2:
        a = float(click_index[0])
        b = float(click_index[1])
        candidate_points.append((a, b))
        if abs(a - b) > 1:
            candidate_points.append((b, a))
    else:
        return None

    best_choice: str | None = None
    best_distance = float("inf")
    best_threshold = float("inf")
    for choice_label, marker in marker_positions.items():
        for x_click, y_click in candidate_points:
            if max(abs(x_click), abs(y_click)) <= 1.5:
                distance = float(np.hypot(x_click - marker["x_norm"], y_click - marker["y_norm"]))
                threshold = float(marker["radius_norm"]) * 1.6
            else:
                distance = float(np.hypot(x_click - marker["x"], y_click - marker["y"]))
                threshold = float(marker["radius"]) * 1.6

            if distance < best_distance:
                best_choice = str(choice_label)
                best_distance = distance
                best_threshold = threshold

    if best_choice is not None and best_distance <= best_threshold:
        return best_choice
    return None


def refresh_structure_selection(
    selected_groups: list[str] | None,
    group_state: dict[str, object] | None,
    note: str | None = None,
) -> tuple[str | None, pd.DataFrame, dict[str, object], str, str, dict[str, object]]:
    """Re-render the selector and rebuild every output that depends on the selection.

    Returns a 6-tuple: selector image path (``None`` without state), the group
    table, a ``gr.update`` for the checklist choices/value, the generated
    cluster text, the summary message, and the next group state. When ``note``
    is given it is appended to the summary on a new line; without any state it
    replaces the default empty-selection message instead.
    """
    if not group_state:
        return (
            None,
            build_group_table({}, []),
            gr.update(choices=[], value=[]),
            "",
            note or GROUP_SELECTION_EMPTY,
            {},
        )

    active_groups = normalize_selected_groups(selected_groups, group_state)
    image_path, updated_state = render_structure_selector_image(group_state, active_groups)
    outline_text, status = update_clusters_to_outline_from_groups(active_groups, updated_state)
    status_text = f"{status}\n{note}" if note else status

    table = build_group_table(updated_state, active_groups)
    checklist_update = gr.update(choices=updated_state["choices"], value=active_groups)
    return str(image_path), table, checklist_update, outline_text, status_text, updated_state


def toggle_structure_group_from_selector(
    group_state: dict[str, object] | None,
    evt: gr.SelectData,
) -> tuple[str | None, pd.DataFrame, dict[str, object], str, str, dict[str, object]]:
    """Toggle the structure whose badge was clicked on the selector image.

    The click position is read from ``evt.index``. A click that hits no badge
    leaves the selection unchanged and adds a hint to the summary; a hit
    removes the structure when it is already selected and adds it otherwise.
    Returns the same 6-tuple as ``refresh_structure_selection``.
    """
    if not group_state:
        return (
            None,
            build_group_table({}, []),
            gr.update(choices=[], value=[]),
            "",
            GROUP_SELECTION_EMPTY,
            {},
        )

    active = normalize_selected_groups(group_state.get("selected_groups", []), group_state)
    hit = resolve_clicked_structure(getattr(evt, "index", None), group_state)

    if hit is None:
        hint = "Click directly on one of the colored badges labelled S1, S2, S3, ... to toggle a structure."
        return refresh_structure_selection(active, group_state, note=hint)

    if hit in active:
        toggled = [label for label in active if label != hit]
    else:
        toggled = [*active, hit]
    return refresh_structure_selection(toggled, group_state)


def clear_structure_selection(
    group_state: dict[str, object] | None,
) -> tuple[str | None, pd.DataFrame, dict[str, object], str, str, dict[str, object]]:
    """Deselect every structure and refresh all selector outputs."""
    cleared_note = "Selection cleared. Choose one or more structures to continue."
    nothing_selected: list[str] = []
    return refresh_structure_selection(nothing_selected, group_state, note=cleared_note)


def build_selected_structure_specs(
    raw_groups_text: str,
    selected_groups: list[str] | None,
    group_state: dict[str, object] | None,
) -> list[dict[str, object]]:
    selected_records: list[dict[str, object]] = []