GNNVerifier/main.py at main · BUPT-GAMMA/GNNVerifier · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
import argparse
import torch
import prettytable as pt
import json
import os
import numpy as np
import hashlib
import time
import math
import asyncio
from openai import OpenAI
import sys
sys.path.append("../")
from model import ModelTrainer
from utils_preproc import order_chain_with_steps_and_edges, clean_step_texts, edges_to_links
from utils import init_random_state, load_test_data, get_cur_time, prepare_training_ids
from evaluate import f1_score

# Credentials and endpoint for the OpenAI-compatible client are read from the
# environment; os.environ.get yields None for a variable that is not set.
_api_key = os.environ.get("OPENAI_API_KEY")
_api_base_url = os.environ.get("OPENAI_API_BASE")
# Module-level client shared by the LLM helper functions in this file.
client = OpenAI(api_key=_api_key, base_url=_api_base_url)

# Default for the `threshold_accept` parameter of iterative_refine_with_llm.
TAU_ACCEPT = 0.9
# NOTE(review): presumably the minimum score gain required to keep an edit;
# its use is not visible in this part of the file - confirm.
DELTA_IMPROVE = 0.02
# Default `theta_node` in call_llm_patch: nodes whose risk is >= this value
# are reported as high-risk and get replacement candidates.
THETA_NODE = 0.5
# Default `theta_gap` in call_llm_patch: gaps whose risk is >= this value
# are reported as high-risk and get insertion candidates.
THETA_GAP = 0.5
# Memo used by get_io_sets, keyed by id(tool_meta).
_IO_CACHE = {}
# Retry policy for the LLM calls: number of attempts, and the base / cap (in
# seconds) of the exponential back-off slept between attempts.
LLM_MAX_RETRIES = 5
LLM_RETRY_BASE_SEC = 1.0
LLM_RETRY_MAX_SEC = 10.0
# Minimum spacing (seconds) between LLM calls enforced by _throttle_llm_calls.
LLM_MIN_INTERVAL_SEC = 0.5
# time.time() value recorded by the most recent _throttle_llm_calls() call.
_LLM_LAST_CALL_TS = 0.0


def _throttle_llm_calls():
    """Sleep as needed so consecutive calls are LLM_MIN_INTERVAL_SEC apart.

    The module-level ``_LLM_LAST_CALL_TS`` is updated to the wall-clock time
    at which this function finishes waiting.
    """
    global _LLM_LAST_CALL_TS
    elapsed = time.time() - _LLM_LAST_CALL_TS
    remaining = LLM_MIN_INTERVAL_SEC - elapsed
    if remaining > 0:
        time.sleep(remaining)
    _LLM_LAST_CALL_TS = time.time()


def _tool_desc_map(tool_meta):
    """Map each tool id in ``tool_meta["nodes"]`` to its description text.

    The ``"desc"`` entry wins whenever the key is present (even if empty);
    otherwise ``"description"`` is used, falling back to ``""``.
    """
    descriptions = {}
    for node in tool_meta["nodes"]:
        tool_id = node["id"]
        if "desc" in node:
            descriptions[tool_id] = node["desc"]
        else:
            descriptions[tool_id] = node.get("description", "")
    return descriptions

def _format_steps_for_prompt(steps):
    """Return the steps as ``"Step N: text"`` strings for the LLM prompt.

    Steps are first passed through ``clean_step_texts``; a step whose cleaned
    text is empty is rendered as a bare ``"Step N:"``. ``None`` gives ``[]``.
    """
    if steps is None:
        cleaned = []
    else:
        cleaned = clean_step_texts(steps)
    return [
        f"Step {number}: {text}" if text else f"Step {number}:"
        for number, text in enumerate(cleaned, start=1)
    ]

def _safe_int(v, default=-1):
    """Coerce ``v`` to an int, returning ``default`` when that is impossible.

    Args:
        v: Value to convert. Ints are returned unchanged; floats are
            truncated; strings are parsed (via ``float`` first when they
            contain a decimal point).
        default: Value returned for unsupported types or unparsable input.

    Returns:
        The converted integer, or ``default``.
    """
    if isinstance(v, int):
        return v
    if isinstance(v, (float, str)):
        try:
            return int(float(v)) if isinstance(v, str) and "." in v else int(v)
        except (ValueError, TypeError, OverflowError):
            # OverflowError: int() of an infinite float (e.g. inf, "1.0e999").
            return default
    return default


def parse_edit_ops(llm_output):
    """Normalise the ``"edits"`` list of an LLM reply into edit-op dicts.

    Each entry may be a dict (``{"op": ..., ...}``) or a call-like string such
    as ``"replace_node(0, 1)"``. The ops ``revert``, ``no_change`` and
    ``keep_plan`` are all mapped to ``{"op": "no_change", "raw": entry}``.
    Entries that cannot be interpreted are silently dropped.

    Returns:
        A list of dicts with keys ``op``, ``node_id`` or ``gap_id``,
        ``candidate_id`` and ``step`` (empty for string-form entries).
    """
    if not llm_output or not isinstance(llm_output, dict):
        return []
    raw_edits = llm_output.get("edits")
    if not raw_edits:
        return []

    keep_aliases = ("revert", "no_change", "keep_plan")
    # (op name, key holding the id of the edited node/gap)
    edit_specs = (("replace_node", "node_id"), ("insert_on_gap", "gap_id"))

    parsed = []
    for entry in raw_edits:
        if isinstance(entry, dict):
            kind = entry.get("op")
            if kind in keep_aliases:
                parsed.append({"op": "no_change", "raw": entry})
            else:
                for name, id_key in edit_specs:
                    if kind == name:
                        parsed.append({
                            "op": name,
                            id_key: _safe_int(entry.get(id_key), -1),
                            "candidate_id": _safe_int(entry.get("candidate_id"), -1),
                            "step": entry.get("step", ""),
                        })
                        break
            continue

        if not isinstance(entry, str):
            continue

        text = entry.strip()
        if text.startswith(keep_aliases):
            parsed.append({"op": "no_change", "raw": text})
            continue

        for name, id_key in edit_specs:
            if not text.startswith(name):
                continue
            try:
                arguments = text[text.find("(") + 1:text.rfind(")")]
                target_id, cand_id = [int(part.strip()) for part in arguments.split(",")]
            except Exception:
                # Malformed argument list: drop this entry.
                break
            parsed.append({"op": name, id_key: target_id, "candidate_id": cand_id, "step": ""})
            break
    return parsed


def validate_edit_ops(ops, node_candidates, gap_candidates):
    """Check parsed edit ops against the candidate lists offered to the LLM.

    An empty op list, or one containing any ``no_change`` op, is always valid.
    Otherwise the checks are: at most three ops, known op names, known
    node/gap ids, non-blank step text, in-range candidate index, and no tool
    picked by more than one op.

    Returns:
        ``(is_valid, errors)`` where ``errors`` is a list of
        ``"<code>:<detail>"`` strings in the order the problems were found.
    """
    if not ops:
        return True, []
    for op in ops:
        if op.get("op") == "no_change":
            return True, []

    problems = ["too_many_ops"] if len(ops) > 3 else []
    nodes_by_idx = {n["idx"]: n for n in node_candidates}
    gaps_by_id = {g["gap_id"]: g for g in gap_candidates}
    picked_tools = set()

    for op in ops:
        kind = op.get("op")
        if kind == "replace_node":
            id_key, lookup = "node_id", nodes_by_idx
            bad_id_code, no_step_code = "invalid_node_id", "missing_step_replace"
        elif kind == "insert_on_gap":
            id_key, lookup = "gap_id", gaps_by_id
            bad_id_code, no_step_code = "invalid_gap_id", "missing_step_insert"
        else:
            problems.append(f"unknown_op:{kind}")
            continue

        target_id = op.get(id_key)
        cand_id = op.get("candidate_id")
        if target_id not in lookup:
            problems.append(f"{bad_id_code}:{target_id}")
            continue

        step_text = op.get("step", "")
        if not (isinstance(step_text, str) and step_text.strip()):
            problems.append(f"{no_step_code}:{target_id}")

        options = lookup[target_id].get("candidate_tools", [])
        if cand_id is None or cand_id < 0 or cand_id >= len(options):
            problems.append(f"invalid_candidate_id:{cand_id}")
            continue
        tool = options[cand_id]
        if tool in picked_tools:
            problems.append(f"duplicate_tool:{tool}")
        picked_tools.add(tool)

    return not problems, problems


def apply_edit_ops(plan, ops, node_candidates, gap_candidates, tool_meta):
    """Apply validated edit ops to ``plan`` and return the resulting plan.

    When ``ops`` is empty or contains a ``no_change`` op the input ``plan``
    object itself is returned. Otherwise replacements are applied first (in
    node-id order), then insertions (in order of the gap's ``v_pos``, each
    shifted by the number of insertions already made). The returned plan is a
    new dict whose edges form a simple chain over the edited node list and
    whose steps are padded so that every node has a step text.
    """
    if not ops:
        return plan
    if any(op.get("op") == "no_change" for op in ops):
        return plan

    tools = list(plan.get("nodes", []))
    texts = list(plan.get("steps", []))
    nodes_by_idx = {n["idx"]: n for n in node_candidates}
    gaps_by_id = {g["gap_id"]: g for g in gap_candidates}
    descriptions = _tool_desc_map(tool_meta)

    replacements = [op for op in ops if op.get("op") == "replace_node"]
    insertions = [op for op in ops if op.get("op") == "insert_on_gap"]

    # --- replacements: swap the tool in place, optionally refresh its step ---
    for op in sorted(replacements, key=lambda item: item.get("node_id", 0)):
        target = op["node_id"]
        choice = op["candidate_id"]
        options = nodes_by_idx[target].get("candidate_tools", [])
        tools[target] = options[choice]
        new_text = op.get("step", "")
        if target < len(texts):
            texts[target] = new_text or texts[target]

    # --- insertions: every earlier insertion pushes later positions by one ---
    def _gap_position(item):
        return gaps_by_id.get(item.get("gap_id"), {}).get("v_pos", 0)

    for shift, op in enumerate(sorted(insertions, key=_gap_position)):
        gap_key = op["gap_id"]
        choice = op["candidate_id"]
        gap = gaps_by_id[gap_key]
        new_tool = gap.get("candidate_tools", [])[choice]
        position = gap.get("v_pos", len(tools)) + shift
        tools.insert(position, new_tool)
        new_text = op.get("step", "")
        texts.insert(position, new_text or f"Step {position+1}: Call {new_tool}")

    # --- make sure each node ends up with a non-empty step text ---
    aligned_steps = []
    for pos, tool in enumerate(tools):
        if pos < len(texts) and texts[pos]:
            aligned_steps.append(texts[pos])
            continue
        desc = descriptions.get(tool, "")
        if desc:
            aligned_steps.append(f"Step {pos+1}: {desc[:100]}")
        else:
            aligned_steps.append(f"Step {pos+1}: Call {tool}")

    chain = [(i, i + 1) for i in range(len(tools) - 1)]
    return {"nodes": tools, "edges": chain, "steps": aligned_steps}


def build_tool_string(tool_meta):
    """Render the tool catalogue as the ``# TASK LIST #`` prompt section.

    Args:
        tool_meta: Mapping with a ``"nodes"`` list of JSON-serialisable tool
            records.

    Returns:
        A string made of the header line followed by one JSON document per
        tool, every line (including the last) terminated by ``"\\n"``.
    """
    # Collect the lines and join once instead of growing a string with `+=`.
    lines = ["# TASK LIST #:"]
    lines.extend(json.dumps(tool, ensure_ascii=False) for tool in tool_meta["nodes"])
    return "\n".join(lines) + "\n"


def _normalize_tool_key(name):
    """Return ``name`` lower-cased with every non-alphanumeric character removed.

    Falsy input (``None``, ``""``) gives ``""``.
    """
    if not name:
        return ""
    return "".join(filter(str.isalnum, name.lower()))


def _build_tool_alias_map_from_nodes(nodes):
    """Map the normalised form of each tool id to the original id.

    Nodes without a truthy ``"id"`` are ignored; when two ids normalise to the
    same key the later node wins.
    """
    tool_ids = (node.get("id") for node in nodes)
    return {_normalize_tool_key(tool_id): tool_id for tool_id in tool_ids if tool_id}


def normalize_tools_list(tools, alias_map):
    """Replace each tool name by its canonical id from ``alias_map``.

    Names are looked up by their normalised key; names with no alias are kept
    unchanged. ``None`` is treated as an empty list.
    """
    return [alias_map.get(_normalize_tool_key(name), name) for name in (tools or [])]


# Strong references to the tool_meta objects whose maps are stored in
# _IO_CACHE. Keeping the object alive guarantees its id() cannot be handed to
# a different object while the cache entry exists.
_IO_CACHE_OWNERS = {}


def get_io_sets(tool_meta):
    """Return ``(in_map, out_map)``: tool id -> set of input / output types.

    Results are memoised in the module-level ``_IO_CACHE`` under
    ``id(tool_meta)``. Because ids may be reused once an object is garbage
    collected, a cache hit is only honoured when the entry was created for
    this very object.

    Args:
        tool_meta: Mapping with a ``"nodes"`` list; each node has an ``"id"``
            and optional ``"input-type"`` / ``"output-type"`` entries that are
            either a string or an iterable of strings.

    Returns:
        A pair of dicts mapping tool id to a ``set`` of type names.
    """
    cache_key = id(tool_meta)
    if cache_key in _IO_CACHE and _IO_CACHE_OWNERS.get(cache_key) is tool_meta:
        return _IO_CACHE[cache_key]

    def _as_type_set(value):
        # A bare string is one type name (empty string = no types); an
        # explicit null in the metadata is treated like a missing field.
        if value is None:
            return set()
        if isinstance(value, str):
            return {value} if value else set()
        return set(value)

    in_map = {}
    out_map = {}
    for node in tool_meta["nodes"]:
        tid = node["id"]
        in_map[tid] = _as_type_set(node.get("input-type", []))
        out_map[tid] = _as_type_set(node.get("output-type", []))

    _IO_CACHE[cache_key] = (in_map, out_map)
    _IO_CACHE_OWNERS[cache_key] = tool_meta
    return in_map, out_map


def io_compat(out_set, in_set):
    """Tell whether a producer's output types can feed a consumer's inputs.

    If either side declares no types the pair is considered compatible;
    otherwise the two sets must share at least one type.
    """
    if out_set and in_set:
        return bool(out_set & in_set)
    return True


def cosine_sim(a, b):
    """Cosine similarity of two vectors as a Python float.

    Returns ``0.0`` when either argument is ``None`` or has a norm below
    ``1e-8``.
    """
    if a is None or b is None:
        return 0.0
    length_a = np.linalg.norm(a)
    length_b = np.linalg.norm(b)
    if length_a < 1e-8 or length_b < 1e-8:
        return 0.0
    similarity = np.dot(a, b) / (length_a * length_b)
    return float(similarity)


def encode_text(cache, text, prefix=None):
    """Encode a single text through ``cache.encode_texts``.

    ``prefix`` is forwarded as a keyword argument only when it is truthy.
    Returns the first embedding of the result, or ``None`` when ``text`` is
    empty or the encoder returns nothing.
    """
    if not text:
        return None
    extra = {"prefix": prefix} if prefix else {}
    vectors = cache.encode_texts([text], **extra)
    if len(vectors) > 0:
        return vectors[0]
    return None


def links_to_edges(nodes, links):
    """Convert plan links into ``(src_pos, dst_pos)`` edges over ``nodes``.

    Accepted link formats:
      * a list of ``{"source": ..., "target": ...}`` dicts (tool names),
      * a list of ``[u, v]`` / ``(u, v)`` integer position pairs,
      * an iterable of ``"src_name, dst_name"`` strings.

    For name-based links a tool may occur several times in ``nodes``; the pair
    of positions chosen is the closest forward pair, and a backward pair is
    used only when no forward one exists. Whenever no edge can be recovered
    the result is the simple chain ``0->1->...``. Empty ``nodes`` gives ``[]``.
    """
    if not nodes:
        return []

    # Fallback result: consecutive positions linked as a chain.
    chain = [(i, i + 1) for i in range(len(nodes) - 1)]
    if not links:
        return chain

    if isinstance(links, list):
        head = links[0]
        if isinstance(head, dict) and "source" in head and "target" in head:
            # Reduce dict links to the "src, dst" string form handled below.
            links = [f'{item.get("source", "")}, {item.get("target", "")}' for item in links]
        elif isinstance(head, (list, tuple)):
            size = len(nodes)
            index_edges = [
                (pair[0], pair[1])
                for pair in links
                if len(pair) == 2
                and isinstance(pair[0], int) and isinstance(pair[1], int)
                and 0 <= pair[0] < size and 0 <= pair[1] < size
            ]
            return index_edges or chain

    positions = {}
    for idx, name in enumerate(nodes):
        if name in positions:
            positions[name].append(idx)
        else:
            positions[name] = [idx]

    def _cost(pair):
        # Forward pairs cost their distance; backward pairs are heavily penalised.
        src, dst = pair
        return dst - src if dst > src else 10000 + abs(dst - src)

    resolved = []
    for link in links:
        if not isinstance(link, str):
            continue
        parts = link.split(", ")
        if len(parts) != 2:
            continue
        src_name, dst_name = (part.strip() for part in parts)
        if src_name not in positions or dst_name not in positions:
            continue
        options = [
            (src, dst)
            for src in positions[src_name]
            for dst in positions[dst_name]
            if src != dst
        ]
        if options:
            # min() keeps the first of equally cheap pairs.
            resolved.append(min(options, key=_cost))

    return resolved or chain


def add_start_edge(edges_tool_idx, num_nodes):
    """Shift tool edges by one and prepend an edge from the START node (0).

    The START edge targets the lowest-indexed tool that has no incoming
    in-range edge; if every tool has one, no START edge is added.
    ``num_nodes <= 0`` gives ``[]``.
    """
    if num_nodes <= 0:
        return []

    has_parent = set()
    for src, dst in edges_tool_idx:
        if 0 <= src < num_nodes and 0 <= dst < num_nodes:
            has_parent.add(dst)

    root = next((i for i in range(num_nodes) if i not in has_parent), None)
    shifted = [(src + 1, dst + 1) for src, dst in edges_tool_idx]
    if root is None:
        return shifted
    return [(0, root + 1)] + shifted


def suggest_replacement_tools(controller, tool_meta, confusion, current_nodes, current_steps,
                               node_idx, user_request="", topn=3):
    """Rank replacement tools for the node at ``node_idx``.

    Candidates come from ``confusion["top_k"][<current tool>]``. Tools already
    in the plan and tools whose I/O types do not fit the neighbouring nodes
    are dropped; the ten with the highest confusion score are then scored as
    ``0.5 * step_match + 0.5 * request_similarity``. ``step_match`` is the
    sigmoid of the controller's alignment score when
    ``controller.align_pretrained`` is set and a step text exists, otherwise
    the cosine similarity between the step and tool embeddings.

    Returns:
        Up to ``topn`` dicts ``{"tool", "reason", "score"}``, best first;
        ``[]`` when ``node_idx`` is out of range or nothing qualifies.
    """
    if not 0 <= node_idx < len(current_nodes):
        return []

    original_tool = current_nodes[node_idx]
    step_text = current_steps[node_idx] if node_idx < len(current_steps) else ""
    if step_text:
        step_text = clean_step_texts([step_text])[0]

    confusable = confusion.get("top_k", {}).get(original_tool, [])
    if not confusable:
        return []

    taken = set(current_nodes)
    in_map, out_map = get_io_sets(tool_meta)
    # Output types of the previous node / input types of the next node.
    upstream_out = set()
    downstream_in = set()
    if node_idx > 0:
        upstream_out = out_map.get(current_nodes[node_idx - 1], set())
    if node_idx < len(current_nodes) - 1:
        downstream_in = in_map.get(current_nodes[node_idx + 1], set())

    cache = controller._embedding_cache
    step_emb = encode_text(cache, step_text, prefix="query") if step_text else None
    req_emb = encode_text(cache, user_request, prefix="query") if user_request else None
    align_q = None
    if controller.align_pretrained and step_text:
        with torch.no_grad():
            step_tensor = controller._encode_text_to_emb_cached([step_text], prefix="query")
            align_q = controller._project_step(step_tensor)[0]

    # ---- stage 1: keep unused, I/O-compatible candidates ----
    shortlisted = []
    for entry in confusable:
        if isinstance(entry, (list, tuple)):
            cand = entry[0]
            confidence = entry[1] if len(entry) > 1 else 0.0
        else:
            cand = entry
            confidence = 0.0

        if cand == original_tool or cand in taken:
            continue
        if upstream_out and not io_compat(upstream_out, in_map.get(cand, set())):
            continue
        if downstream_in and not io_compat(out_map.get(cand, set()), downstream_in):
            continue
        shortlisted.append((cand, confidence))

    if not shortlisted:
        return []

    shortlisted.sort(key=lambda pair: pair[1], reverse=True)
    shortlisted = shortlisted[:10]

    # ---- stage 2: score the shortlist ----
    scored = []
    for cand, confidence in shortlisted:
        cosine_step = 0.0
        align_logit = None
        align_prob = None
        request_sim = 0.0

        if controller.align_pretrained and align_q is not None:
            with torch.no_grad():
                tool_tensor = controller._get_tool_embedding(cand)
                key_vec = controller._project_tool(tool_tensor.unsqueeze(0))[0]
                logit = torch.dot(align_q, key_vec) / float(controller.align_tau)
                align_logit = float(logit.item())
                align_prob = float(torch.sigmoid(logit).item())
        elif step_emb is not None:
            tool_emb = cache._tool_embeddings.get(cand)
            if tool_emb is not None:
                cosine_step = cosine_sim(step_emb, tool_emb)

        if req_emb is not None:
            tool_emb = cache._tool_embeddings.get(cand)
            if tool_emb is not None:
                request_sim = cosine_sim(req_emb, tool_emb)

        step_match = align_prob if align_prob is not None else cosine_step
        total = 0.5 * step_match + 0.5 * request_sim
        if align_logit is not None:
            reason = (
                f"IO compatible, align={align_logit:.2f}, align_sig={step_match:.2f}, "
                f"confusion={confidence:.2f}, req_align={request_sim:.2f}"
            )
        else:
            reason = f"IO compatible, confusion={confidence:.2f}, step_match={step_match:.2f}, req_align={request_sim:.2f}"
        scored.append({"tool": cand, "reason": reason, "score": total})

    scored.sort(key=lambda item: item["score"], reverse=True)
    return scored[:topn]


def suggest_insertion_tools(controller, tool_meta, typed_ngrams, current_nodes,
                            gap_u_pos, gap_v_pos, user_request, gap_risk=0.0, topn=3):
    """Rank tools that could be inserted into the gap ``u -> v``.

    Every tool of ``tool_meta`` that is not yet in the plan and whose I/O
    types bridge the two gap endpoints is scored as
    ``0.6 * request_similarity + 0.2 * log1p(f2[u, c]) + 0.2 * log1p(f2[c, v])``
    (``f2`` being ``typed_ngrams["__f2__"]``), then scaled by
    ``1 + 0.5 * gap_risk``. An out-of-range ``gap_u_pos`` denotes the START
    side of the plan.

    Returns:
        Up to ``topn`` dicts ``{"tool", "reason", "score"}``, best first;
        ``[]`` when ``gap_v_pos`` is out of range.
    """
    if not 0 <= gap_v_pos < len(current_nodes):
        return []

    in_map, out_map = get_io_sets(tool_meta)

    if 0 <= gap_u_pos < len(current_nodes):
        left_tool = current_nodes[gap_u_pos]
        left_out = out_map.get(left_tool, set())
    else:
        left_tool, left_out = "START", set()

    right_tool = current_nodes[gap_v_pos]
    right_in = in_map.get(right_tool, set())
    cache = controller._embedding_cache
    req_emb = encode_text(cache, user_request, prefix="query") if user_request else None
    bigram_counts = typed_ngrams.get("__f2__", {})
    present = set(current_nodes)
    pool = [node["id"] for node in tool_meta["nodes"] if node["id"] not in present]

    ranked = []
    for cand in pool:
        if left_out and not io_compat(left_out, in_map.get(cand, set())):
            continue
        if right_in and not io_compat(out_map.get(cand, set()), right_in):
            continue

        lead_in = math.log1p(bigram_counts.get((left_tool, cand), 0))
        lead_out = math.log1p(bigram_counts.get((cand, right_tool), 0))
        # The n-gram figure shown in the reason ignores the START side.
        ngram_score = (lead_in if left_tool != "START" else 0.0) + lead_out

        request_sim = 0.0
        if req_emb is not None:
            tool_emb = cache._tool_embeddings.get(cand)
            if tool_emb is not None:
                request_sim = cosine_sim(req_emb, tool_emb)

        total = 0.6 * request_sim + 0.2 * lead_in + 0.2 * lead_out
        total = total * (1.0 + 0.5 * float(gap_risk))

        reason = f"IO bridge OK, ngram={ngram_score:.2f}, req_align={request_sim:.2f}"
        ranked.append({"tool": cand, "reason": reason, "score": total})

    ranked.sort(key=lambda item: item["score"], reverse=True)
    return ranked[:topn]


def format_candidates_for_prompt(candidates):
    """Serialise candidates as a JSON list of ``{"tool", "score"}`` objects.

    Scores are rounded to three decimals; an empty or missing list gives
    ``"[]"``.
    """
    if not candidates:
        return "[]"
    compact = [{"tool": cand["tool"], "score": round(cand["score"], 3)} for cand in candidates]
    return json.dumps(compact, ensure_ascii=False)


def build_tool_gap_risks(gnn_result, num_tools, current_nodes=None):
    """List the GNN gaps as ``(u_name, v_name, risk, u_pos, v_pos)`` tuples.

    Gap endpoints in ``gnn_result["gaps"]`` use GNN indexing, where 0 is the
    START node and tool ``k`` has index ``k + 1``. A gap is skipped when it is
    malformed or when an endpoint cannot be mapped to a tool in
    ``current_nodes``. A START endpoint is reported as ``("START", ..., -1, ...)``.
    Risks missing from ``gnn_result["gap_risks"]`` default to ``0.0``.

    Returns:
        The tuples sorted by descending risk.
    """
    gap_edges = gnn_result.get("gaps", []) or []
    risks = gnn_result.get("gap_risks", []) or []

    def _resolvable(pos):
        # True when `pos` addresses a tool that can be named.
        return bool(0 <= pos < num_tools and current_nodes)

    rows = []
    for i, edge in enumerate(gap_edges):
        if not (isinstance(edge, (list, tuple)) and len(edge) == 2):
            continue
        src, dst = edge
        if not (isinstance(src, int) and isinstance(dst, int)):
            continue

        risk = float(risks[i]) if i < len(risks) else 0.0
        src_pos, dst_pos = src - 1, dst - 1

        if src == 0:
            src_name = "START"
        elif _resolvable(src_pos):
            src_name = current_nodes[src_pos]
        else:
            continue

        if not _resolvable(dst_pos):
            continue

        rows.append((src_name, current_nodes[dst_pos], risk, src_pos, dst_pos))

    return sorted(rows, key=lambda row: row[2], reverse=True)


def build_full_gnn_report(gnn_result, current_plan):
    """Bundle the plan and the raw GNN diagnostics into one report dict.

    In addition to the raw ``gaps`` the report carries ``gaps_named``, where
    each well-formed gap is given with tool names: GNN index 0 on the source
    side is ``"START"``, index ``k`` maps to ``nodes[k - 1]``, and an index
    that maps to no tool is rendered as ``"IDX_<k>"``.
    """
    tools = current_plan.get("nodes", [])
    gap_risks = gnn_result.get("gap_risks", []) or []
    gap_edges = gnn_result.get("gaps", []) or []

    def _label(gnn_idx):
        pos = gnn_idx - 1
        if 0 <= pos < len(tools):
            return tools[pos]
        return f"IDX_{gnn_idx}"

    named_gaps = []
    for i, edge in enumerate(gap_edges):
        if not (isinstance(edge, (list, tuple)) and len(edge) == 2):
            continue
        src, dst = edge
        if not (isinstance(src, int) and isinstance(dst, int)):
            continue
        named_gaps.append({
            "u": "START" if src == 0 else _label(src),
            "v": _label(dst),
            "risk": float(gap_risks[i]) if i < len(gap_risks) else 0.0,
            "u_idx": src,
            "v_idx": dst,
        })

    report = {"S": float(gnn_result.get("S", 0.0))}
    report["nodes"] = tools
    report["edges"] = current_plan.get("edges", [])
    report["steps"] = current_plan.get("steps", [])
    report["node_risks"] = gnn_result.get("node_risks", []) or []
    report["gap_risks"] = gap_risks
    report["gaps"] = gap_edges
    report["gaps_named"] = named_gaps
    return report


def robust_json_extract(text):
    """Best-effort extraction of a JSON value from an LLM reply.

    The reply may wrap the JSON in a Markdown code fence and/or surround it
    with prose. The candidate text is parsed as-is first; only if that fails
    are the common LLM glitches repaired (``\\_`` escapes and trailing commas
    before ``]`` / ``}``) and parsing retried, so valid JSON is never altered.

    Args:
        text: Raw model output.

    Returns:
        The decoded JSON value, or ``None`` when ``text`` is empty, not a
        string, or cannot be parsed.
    """
    import re  # local import: `re` is not among this module's top-level imports

    if not text or not isinstance(text, str):
        return None

    text = text.strip()

    # Prefer the contents of a ```json fence, then of any complete fence.
    if "```json" in text:
        text = text.split("```json")[1].split("```")[0].strip()
    elif text.count("```") >= 2:
        text = text.split("```")[1].strip()

    # Drop any prose around the outermost {...} span.
    start = text.find("{")
    end = text.rfind("}")
    if start != -1 and end != -1:
        text = text[start:end + 1]

    try:
        return json.loads(text)
    except (ValueError, RecursionError):
        pass

    # Repair pass: markdown-escaped underscores and trailing commas, including
    # commas separated from the closing bracket by whitespace or newlines.
    repaired = re.sub(r",\s*([\]}])", r"\1", text.replace("\\_", "_"))
    try:
        return json.loads(repaired)
    except (ValueError, RecursionError):
        return None


def call_llm_patch(user_request, tool_meta, confusion, typed_ngrams, controller,
                   current_plan, gnn_result,
                   theta_node=THETA_NODE, theta_gap=THETA_GAP,
                   temperature=0.2, max_tokens=1200, llm_cache=None,
                   cache_key_prefix=None, trace_payload=None):
    """Ask the LLM for edit operations that repair ``current_plan``.

    The prompt contains the tool list, the user request, the current plan,
    the GNN diagnostics and, for every high-risk node / gap, a short list of
    replacement / insertion candidates the model must choose from.

    Args:
        user_request: Text of the user request.
        tool_meta: Tool catalogue (``{"nodes": [...]}``).
        confusion: Confusion data passed to ``suggest_replacement_tools``.
        typed_ngrams: N-gram statistics passed to ``suggest_insertion_tools``.
        controller: Object passed through to the two ``suggest_*`` helpers.
        current_plan: Dict with ``"nodes"``, ``"steps"`` and ``"edges"``.
        gnn_result: Dict read for ``"S"``, ``"node_risks"``, ``"gaps"`` and
            ``"gap_risks"``.
        theta_node: Nodes with risk >= this value are treated as high-risk.
        theta_gap: Gaps with risk >= this value are treated as high-risk.
        temperature: Sampling temperature of the first request of an attempt.
        max_tokens: Completion token limit.
        llm_cache: Optional cache; either a dict or an object exposing
            ``get(key)`` and ``set(key, value, data_id=, strategy=, prompt_hash=)``.
        cache_key_prefix: Two-item sequence combined with the prompt hash to
            form the cache key. Caching is used only when both this and
            ``llm_cache`` are given.
        trace_payload: Optional dict that receives ``"prompt_hash"``,
            ``"node_candidates"`` and ``"gap_candidates"``.

    Returns:
        The parsed JSON reply (cached when possible), or ``None`` if no
        attempt produced parsable JSON. Exceptions raised by the API call are
        swallowed and trigger a retry.

    Note:
        ``LLM_NAME`` is not defined in the part of the file visible here; it
        is expected to be a module-level name set elsewhere.
    """
    tool_list_str = build_tool_string(tool_meta)

    current_nodes = current_plan.get("nodes", [])
    current_steps = current_plan.get("steps", [])
    current_edges = current_plan.get("edges", [])

    current_plan_json = {
        "task_steps": _format_steps_for_prompt(current_steps),
        "task_nodes": current_nodes,
        "task_links": [[e[0], e[1]] for e in current_edges]
    }

    S = gnn_result.get("S", 0.0)
    node_risks = gnn_result.get("node_risks", [])

    # Rank nodes by risk (highest first) and keep those at or above theta_node.
    indexed_risks = [(idx, risk) for idx, risk in enumerate(node_risks)]
    indexed_risks.sort(key=lambda x: x[1], reverse=True)
    high_nodes = [item for item in indexed_risks if item[1] >= theta_node]

    # For each high-risk node collect replacement candidates; the position of
    # a tool inside "candidate_tools" is the candidate_id the LLM refers to.
    node_diag_lines = []
    node_candidates = []
    for idx, risk in high_nodes:
        if 0 <= idx < len(current_nodes):
            tool = current_nodes[idx]
            step = current_steps[idx] if idx < len(current_steps) else ""
            candidates = suggest_replacement_tools(
                controller, tool_meta, confusion, current_nodes, current_steps, idx,
                user_request=user_request, topn=3
            )
            cand_str = format_candidates_for_prompt(candidates)
            cand_list = [c["tool"] for c in candidates]
            node_diag_lines.append(
                f"  - Node {idx} ({tool}): risk={risk:.3f}, step=\"{step[:40]}...\"\n"
                f"    replacement_candidates: {cand_str}"
            )
            node_candidates.append({
                "idx": idx,
                "tool": tool,
                "risk": float(risk),
                "step": step,
                "candidates": candidates,
                "candidate_tools": cand_list
            })
    node_diag_str = "\n".join(node_diag_lines) if node_diag_lines else "  None"

    num_tools = len(current_nodes)
    tool_gap_risks = build_tool_gap_risks(gnn_result, num_tools, current_nodes)
    high_gaps = [item for item in tool_gap_risks if item[2] >= theta_gap]

    # Same for high-risk gaps; gap_id is the position in this list.
    gap_diag_lines = []
    gap_candidates = []
    for item in high_gaps:
        u_name, v_name, risk, u_pos, v_pos = item
        candidates = suggest_insertion_tools(
            controller, tool_meta, typed_ngrams, current_nodes, u_pos, v_pos,
            user_request, gap_risk=risk, topn=3
        )
        cand_str = format_candidates_for_prompt(candidates)
        cand_list = [c["tool"] for c in candidates]
        gap_diag_lines.append(
            f"  - Gap ({u_name} -> {v_name}): risk={risk:.3f}\n"
            f"    insertion_candidates: {cand_str}"
        )
        gap_candidates.append({
            "gap_id": len(gap_candidates),
            "u_name": u_name,
            "v_name": v_name,
            "risk": float(risk),
            "u_pos": u_pos,
            "v_pos": v_pos,
            "candidates": candidates,
            "candidate_tools": cand_list
        })
    gap_diag_str = "\n".join(gap_diag_lines) if gap_diag_lines else "  None"

    # One-sentence natural-language summaries of the risky nodes and gaps.
    if node_candidates:
        node_summary_str = " ".join(
            [
                f"Node {n['idx']} ({n['tool']}) has a high risk score ({n['risk']:.2f}), suggesting it might be irrelevant or incorrect for the user request."
                for n in node_candidates
            ]
        )
    else:
        node_summary_str = "No high-risk nodes detected."
    if gap_candidates:
        gap_summary_str = " ".join(
            [
                f"Gap {g['gap_id']} ({g['u_name']} -> {g['v_name']}) has a high risk score ({g['risk']:.2f}), suggesting a missing step between these nodes."
                for g in gap_candidates
            ]
        )
    else:
        gap_summary_str = "No high-risk gaps detected."

    full_gnn_report = build_full_gnn_report(gnn_result, current_plan)
    prompt = f"""{tool_list_str}

# USER REQUEST #
{user_request}

# CURRENT PLAN (JSON) #
{json.dumps(current_plan_json, ensure_ascii=False, indent=2)}

# GNN EVALUATION #
The GNN is a plan evaluator and reports:
- Graph score S ∈ [0,1]: higher suggests stronger alignment with the user request.
- Node risk ∈ [0,1]: higher indicates a node may use an incorrect tool.
- Gap risk ∈ [0,1]: higher indicates a gap (including START→first node and tool-to-tool edges) may be incomplete and need an inserted tool.

# TOP RISK NODES #
{node_diag_str}

# TOP RISK GAPS #
{gap_diag_str}

# RISK SUMMARY #
Node summary: {node_summary_str}
Gap summary: {gap_summary_str}

# FULL GNN ANALYSIS (JSON) #
{json.dumps(full_gnn_report, ensure_ascii=False)}

# GOAL #
You are a plan refinement assistant. Analyze the user request, the current plan, and the GNN diagnostics to decide whether and how to improve the plan so it better satisfies the request.

# Common errors are mainly two types #
1) Wrong tool choice: a tool is semantically similar but incorrect → replace the node.
2) Missing step: especially missing preprocessing → insert a tool on the risky gap.

# TASK #
Think step by step internally. Based on the user request, the current plan, and the GNN diagnostics, first decide whether any change is necessary.
If no change is needed, return empty edits. If changes are needed, select replacement/insertion tools only from the provided candidates and propose minimal edits.
If improvements are not clear with the provided candidates, do not modify.
Return EDIT OPERATIONS only (no analysis text). It is valid to return no edits.

# RULES #
1. Output JSON ONLY (no extra text).
2. Modify at most 3 places; 0 is allowed.
3. Do not use the same candidate tool in multiple edits.
4. Allowed ops:
   - replace_node(node_id, candidate_id, step)
   - insert_on_gap(gap_id, candidate_id, step)
   - no_change()
5. candidate_id must be an integer index from the candidate list.
6. Candidate order is arbitrary; read each candidate's tool description in tool_list_str and compare carefully.
7. insert_on_gap must use the gap_id from the list below and only inserts between (u_id, v_id).
8. replace_node must use node_id from the list below and only replaces the tool at that node.
9. Each node/gap may remain unchanged; prefer fewer edits and only change when necessary.
10. For every edit, provide a new step text aligned with the chosen tool and request.
11. The updated steps/tools should solve the request better than the current plan; otherwise do not modify.
12. For any edits, keep steps aligned 1-to-1 with nodes (same count, same order).

# CANDIDATES (node_id -> candidate_id -> tool) #
{json.dumps([{ "node_id": n["idx"], "candidates": n["candidate_tools"] } for n in node_candidates], ensure_ascii=False)}

# CANDIDATES (gap_id -> candidate_id -> tool) #
{json.dumps([{ "gap_id": g["gap_id"], "u_pos": g["u_pos"], "v_pos": g["v_pos"], "candidates": g["candidate_tools"] } for g in gap_candidates], ensure_ascii=False)}

# OUTPUT FORMAT (minimal edits) #
{{
  "edits": [
    {{"op":"replace_node","node_id":0,"candidate_id":1,"step":"Step 1: ..."}},
    {{"op":"insert_on_gap","gap_id":0,"candidate_id":2,"step":"Step 2: ..."}}
  ]
}}
If the current workflow is already optimal, return: {{"edits":[]}}
"""

    # The SHA-256 of the full prompt identifies this request in the cache and
    # in the trace.
    prompt_hash = hashlib.sha256(prompt.encode("utf-8")).hexdigest()
    if trace_payload is not None:
        trace_payload["prompt_hash"] = prompt_hash
        trace_payload["node_candidates"] = node_candidates
        trace_payload["gap_candidates"] = gap_candidates
    cache_key = None
    if llm_cache is not None and cache_key_prefix is not None:
        cache_key = (cache_key_prefix[0], cache_key_prefix[1], prompt_hash)
        if isinstance(llm_cache, dict):
            cached = llm_cache.get(cache_key)
        else:
            cached = llm_cache.get(cache_key) if hasattr(llm_cache, "get") else None
        if cached is not None:
            return cached

    for attempt in range(1, LLM_MAX_RETRIES + 1):
        try:
            _throttle_llm_calls()
            response = client.chat.completions.create(
                model=LLM_NAME,
                messages=[{"role": "user", "content": prompt}],
                temperature=temperature,
                max_tokens=max_tokens
            )
            content = response.choices[0].message.content.strip()
            result = robust_json_extract(content)

            # Unparsable (or empty) reply: ask once more within the same
            # attempt, at temperature 0, with an instruction prepended that
            # reads "output strict JSON only, without any explanation".
            if not result:
                retry_prompt = "只输出严格JSON，不要任何解释：\n" + prompt
                _throttle_llm_calls()
                response = client.chat.completions.create(
                    model=LLM_NAME,
                    messages=[{"role": "user", "content": retry_prompt}],
                    temperature=0.0,
                    max_tokens=max_tokens
                )
                result = robust_json_extract(response.choices[0].message.content)

            if result is not None:
                # Store the reply through whichever interface the cache offers.
                if llm_cache is not None and cache_key is not None:
                    if isinstance(llm_cache, dict):
                        llm_cache[cache_key] = result
                    elif hasattr(llm_cache, "set"):
                        llm_cache.set(cache_key, result, data_id=cache_key[0],
                                      strategy=cache_key[1], prompt_hash=prompt_hash)
                return result
        except Exception:
            # Any failure counts as a failed attempt; stop after the last one.
            if attempt == LLM_MAX_RETRIES:
                break
        # Exponential back-off between attempts, capped at LLM_RETRY_MAX_SEC.
        time.sleep(min(LLM_RETRY_BASE_SEC * (2 ** (attempt - 1)), LLM_RETRY_MAX_SEC))
    return None


def call_llm_patch_fix(user_request, error_msg, node_candidates, gap_candidates,
                       temperature=0.2, max_tokens=600):
    """Ask the LLM to repair a set of edit operations that failed validation.

    Builds a prompt containing the user request, the validation error text and
    the node/gap candidate tables, sends it to the chat-completions client and
    parses the reply with ``robust_json_extract``.

    Args:
        user_request: Text of the original user request, embedded verbatim.
        error_msg: Description of why the previous edit operations were
            rejected, embedded verbatim.
        node_candidates: Iterable of dicts; each must provide ``"idx"`` and
            ``"candidate_tools"``.
        gap_candidates: Iterable of dicts; each must provide ``"gap_id"``,
            ``"u_pos"``, ``"v_pos"`` and ``"candidate_tools"``.
        temperature: Sampling temperature forwarded to the LLM call.
        max_tokens: Completion token limit forwarded to the LLM call.

    Returns:
        The first truthy value produced by ``robust_json_extract``, or ``None``
        when every one of the ``LLM_MAX_RETRIES`` attempts failed (either by
        raising or by yielding an empty/unparseable reply).

    Raises:
        KeyError: If a candidate dict lacks one of the keys listed above
            (prompt construction happens outside the retry loop).
    """
    prompt = f"""# USER REQUEST #
{user_request}

# ERROR #
{error_msg}

# CANDIDATES (node_id -> candidate_id -> tool) #
{json.dumps([{ "node_id": n["idx"], "candidates": n["candidate_tools"] } for n in node_candidates], ensure_ascii=False)}

# CANDIDATES (gap_id -> candidate_id -> tool) #
{json.dumps([{ "gap_id": g["gap_id"], "u_pos": g["u_pos"], "v_pos": g["v_pos"], "candidates": g["candidate_tools"] } for g in gap_candidates], ensure_ascii=False)}

# TASK #
Fix the edit operations to satisfy all constraints.
Only output:
{{"edits":[
  {{"op":"replace_node","node_id":0,"candidate_id":1,"step":"Step 1: ..."}},
  {{"op":"insert_on_gap","gap_id":0,"candidate_id":2,"step":"Step 2: ..."}}
]}}
Or {{\"edits\":[]}} / no_change().
"""
    for attempt in range(1, LLM_MAX_RETRIES + 1):
        try:
            _throttle_llm_calls()
            response = client.chat.completions.create(
                model=LLM_NAME,
                messages=[{"role": "user", "content": prompt}],
                temperature=temperature,
                max_tokens=max_tokens
            )
            content = response.choices[0].message.content.strip()
            result = robust_json_extract(content)
        except Exception:
            # Deliberate best-effort: any failure of the call or of parsing
            # (including a reply with no content) counts as a failed attempt.
            result = None

        if result:
            return result

        # Back off only when another attempt will follow; sleeping after the
        # final attempt would just delay returning None to the caller.
        if attempt < LLM_MAX_RETRIES:
            time.sleep(min(LLM_RETRY_BASE_SEC * (2 ** (attempt - 1)), LLM_RETRY_MAX_SEC))
    return None


def iterative_refine_with_llm(controller, user_request, init_plan, init_gnn,
                                  tool_meta, confusion, typed_ngrams, allowed_tools,
                                  threshold_accept=TAU_ACCEPT,
                                  llm_temperature=0.2, llm_cache=None,
                                  data_id=None, gt_nodes=None, gt_links=None,
                                  alias_map=None):
    history = {"init": {}}
    gt_payload = None
    if gt_nodes is not None or gt_links is not None:
        gt_payload = {
            "gt_nodes": gt_nodes,
            "gt_links": gt_links
        }

    base_S = float(init_gnn.get("S", 0.0))
    candidates = [(base_S, init_plan, init_gnn, "init")]

    best_plan = init_plan
    best_gnn = init_gnn
    best_S = base_S

    history["init"] = {
        "strategy": "init",
        "S": base_S,
        "nodes": init_plan.get("nodes", []),
        "steps": init_plan.get("steps", []),
        "edges": init_plan.get("edges", [])
    }

    if float(best_gnn.get("S", 0.0)) >= threshold_accept:
        return best_plan, "accept", history

    strategy = "patch"
    trace_payload = {}
    llm_output = call_llm_patch(
        user_request, tool_meta, confusion, typed_ngrams, controller,
        best_plan, best_gnn,
        theta_node=THETA_NODE,
        theta_gap=THETA_GAP,
        temperature=llm_temperature,
        llm_cache=llm_cache,
        cache_key_prefix=(data_id, "patch") if data_id is not None else None,
        trace_payload=trace_payload
    )

    if not llm_output:
        return best_plan, "rollback", history

    ops = parse_edit_ops(llm_output)
    node_candidates = trace_payload.get("node_candidates", []) or []
    gap_candidates = trace_payload.get("gap_candidates", []) or []
    trace_payload["edit_ops"] = ops
    valid, errors = validate_edit_ops(ops, node_candidates, gap_candidates)
    if not valid:
        fix = call_llm_patch_fix(
            user_request,
            ";".join(errors),
            node_candidates,
            gap_candidates,
            temperature=llm_temperature
        )
        if fix:
            ops = parse_edit_ops(fix)
            valid, errors = validate_edit_ops(ops, node_candidates, gap_candidates)
    if not valid:
        return best_plan, "rollback", history
    new_plan = apply_edit_ops(best_plan, ops, node_candidates, gap_candidates, tool_meta)
    if new_plan is None:
        return best_plan, "rollback", history

    if not new_plan["nodes"]:
        return best_plan, "rollback", history

    ordered_tools, ordered_steps, ordered_edges = order_chain_with_steps_and_edges(
        new_plan["nodes"], new_plan.get("steps", []), new_plan.get("edges", [])
    )
    new_plan["nodes"] = ordered_tools
    new_plan["steps"] = ordered_steps
    new_plan["edges"] = ordered_edges

    new_edges_gnn = add_start_edge(new_plan["edges"], len(new_plan["nodes"])) if new_plan["nodes"] else []
    new_gnn = controller.score_chain(
        new_plan["nodes"], user_request,
        edges=new_edges_gnn,
        step_texts=new_plan["steps"]
    )
    new_S = float(new_gnn.get("S", 0.0))

    quality_before = quality_after = None
    if gt_nodes is not None and gt_links is not None:
        quality_before = f1_score(best_plan.get("nodes", []), gt_nodes)
        quality_after = f1_score(new_plan.get("nodes", []), gt_nodes)

    stability_ok = True
    if quality_before is not None and quality_after is not None:
        stability_ok = (quality_after - quality_before) >= 0.0

    if not stability_ok:
        return best_plan, "rollback", history

    candidates.append((new_S, new_plan, new_gnn, strategy))
    history["patch"] = {
        "strategy": strategy,
        "S": new_S,
        "nodes": new_plan["nodes"],