N.E.K.O/launcher.py at main · StarrySerendipity/N.E.K.O · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# -*- coding: utf-8 -*-
"""
N.E.K.O. 统一启动器
启动所有服务器，等待它们准备就绪后启动主程序，并监控主程序状态
"""
from __future__ import annotations

import sys
import os
import io
import signal

# 强制 UTF-8 编码
if sys.platform == 'win32':
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')
    sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8', errors='replace')

# 处理 PyInstaller 和 Nuitka 打包后的路径
if getattr(sys, 'frozen', False):
    # 运行在打包后的环境
    if hasattr(sys, '_MEIPASS'):
        # PyInstaller
        bundle_dir = sys._MEIPASS
    else:
        # Nuitka 或其他
        bundle_dir = os.path.dirname(os.path.abspath(__file__))

else:
    # 运行在正常 Python 环境
    bundle_dir = os.path.dirname(os.path.abspath(__file__))

sys.path.insert(0, bundle_dir)
os.chdir(bundle_dir)

import subprocess
import socket
import time
import threading
import itertools
import ctypes
import atexit
import signal
import json
import logging
import uuid
import importlib
from datetime import datetime, timezone
from typing import Dict
from multiprocessing import Process, freeze_support, Event
import config as config_module
from config import APP_NAME, MAIN_SERVER_PORT, MEMORY_SERVER_PORT, TOOL_SERVER_PORT
from utils.port_utils import (
    probe_neko_health,
    acquire_startup_lock,
    release_startup_lock,
    get_hyperv_excluded_ranges,
    is_port_in_excluded_range,
    set_port_probe_reuse,
)

# 本次 launcher 启动的唯一标识
LAUNCH_ID = uuid.uuid4().hex
# 实例 ID：若父进程已设置则复用，否则生成新值，确保所有子进程共享同一实例标识
INSTANCE_ID = os.environ.get("NEKO_INSTANCE_ID") or uuid.uuid4().hex
os.environ.setdefault("NEKO_INSTANCE_ID", INSTANCE_ID)

# 确保本地服务间通信不走系统代理（防止 Clash/Surge 等代理软件拦截 localhost 请求）
# httpx 优先读小写 no_proxy，因此大小写都需要设置
# 使用精确 token 匹配，防止 "127.0.0.1" in "127.0.0.10" 这类子串误判
for _key in ("NO_PROXY", "no_proxy"):
    _no_proxy_raw = os.environ.get(_key, "")
    _tokens = set(map(str.strip, filter(None, _no_proxy_raw.split(","))))
    for _host in ("127.0.0.1", "localhost"):
        _tokens.add(_host)
    os.environ[_key] = ",".join(_tokens)

JOB_HANDLE = None
_cleanup_lock = threading.Lock()
_cleanup_done = False
_existing_neko_services: set[str] = set()  # 已有 N.E.K.O 实例占用的端口键
DEFAULT_PORTS = {
    "MAIN_SERVER_PORT": MAIN_SERVER_PORT,
    "MEMORY_SERVER_PORT": MEMORY_SERVER_PORT,
    "TOOL_SERVER_PORT": TOOL_SERVER_PORT,
}
INTERNAL_DEFAULT_PORTS = {
    "USER_PLUGIN_SERVER_PORT": 48916,
    "AGENT_MQ_PORT": 48917,
    "MAIN_AGENT_EVENT_PORT": 48918,
    "ZMQ_SESSION_PUB_PORT": 48961,
    "ZMQ_AGENT_PUSH_PORT": 48962,
    "ZMQ_ANALYZE_PUSH_PORT": 48963,
}
# 该区间保留给 N.E.K.O 已知默认端口，避免 fallback 与伴生服务冲突。
AVOID_FALLBACK_PORTS = set(range(48911, 48919)) | {48961, 48962, 48963}

# 模块名到端口键的映射（用于判断已有 N.E.K.O 实例是否占用对应端口）
MODULE_TO_PORT_KEY: dict[str, str] = {
    "memory_server": "MEMORY_SERVER_PORT",
    "agent_server": "TOOL_SERVER_PORT",
    "main_server": "MAIN_SERVER_PORT",
}


def _sync_runtime_config_globals(
    selected_public: dict[str, int] | None = None,
    selected_internal: dict[str, int] | None = None,
) -> None:
    """Keep the already-imported ``config`` module aligned with launcher choices.

    On Linux/macOS, ``multiprocessing`` defaults to ``fork``. Child processes then
    inherit the parent's already-imported ``config`` module object, so only writing
    ``os.environ`` is insufficient: any later ``from config import TOOL_SERVER_PORT``
    inside forked children would still see the stale pre-launcher values.

    Syncing the module globals here ensures forked children and modules imported
    after forking observe the negotiated runtime ports and shared instance id.
    """
    updates: dict[str, int | str] = {"INSTANCE_ID": INSTANCE_ID}
    if selected_public:
        updates.update(selected_public)
    if selected_internal:
        updates.update(selected_internal)

    for key, value in updates.items():
        setattr(config_module, key, value)


def _reload_runtime_config_from_env() -> None:
    """Reload ``config`` inside a child process and sync launcher globals.

    Even after the parent has updated ``config`` globals, a forked child can still
    inherit stale module state from any earlier imports. Reloading ``config`` from
    the negotiated ``NEKO_*`` environment variables gives each server process a
    fresh source of truth before importing its heavy application modules.
    """
    global INSTANCE_ID, MAIN_SERVER_PORT, MEMORY_SERVER_PORT, TOOL_SERVER_PORT

    reloaded = importlib.reload(config_module)
    INSTANCE_ID = str(reloaded.INSTANCE_ID)
    MAIN_SERVER_PORT = int(reloaded.MAIN_SERVER_PORT)
    MEMORY_SERVER_PORT = int(reloaded.MEMORY_SERVER_PORT)
    TOOL_SERVER_PORT = int(reloaded.TOOL_SERVER_PORT)
    _sync_runtime_config_globals(
        {
            "MAIN_SERVER_PORT": MAIN_SERVER_PORT,
            "MEMORY_SERVER_PORT": MEMORY_SERVER_PORT,
            "TOOL_SERVER_PORT": TOOL_SERVER_PORT,
        },
        {
            "USER_PLUGIN_SERVER_PORT": int(reloaded.USER_PLUGIN_SERVER_PORT),
            "AGENT_MQ_PORT": int(reloaded.AGENT_MQ_PORT),
            "MAIN_AGENT_EVENT_PORT": int(reloaded.MAIN_AGENT_EVENT_PORT),
        },
    )


def _install_logging_brace_compat() -> None:
    if getattr(logging, "_neko_brace_compat_installed", False):
        return

    original_get_message = logging.LogRecord.getMessage

    def _compat_get_message(record: logging.LogRecord) -> str:
        try:
            return original_get_message(record)
        except TypeError:
            msg = str(record.msg)
            args = record.args
            if not args or "%" in msg or "{" not in msg or "}" not in msg:
                raise
            try:
                if isinstance(args, dict):
                    return msg.format(**args)
                if not isinstance(args, tuple):
                    args = (args,)
                return msg.format(*args)
            except Exception:
                return f"{msg} | args={record.args!r}"

    logging.LogRecord.getMessage = _compat_get_message
    logging._neko_brace_compat_installed = True


_install_logging_brace_compat()


def _show_error_dialog(message: str):
    """在 Windows 打包场景显示错误弹窗。"""
    if sys.platform != 'win32':
        return
    try:
        ctypes.windll.user32.MessageBoxW(None, message, f"{APP_NAME} 启动失败", 0x10)
    except Exception:
        pass


def emit_frontend_event(event_type: str, payload: dict | None = None):
    """向 Electron stdout 发送机器可读事件。

    每个事件都带有 *launch_id*，前端可据此忽略历史（僵尸）进程事件。
    """
    envelope = {
        "source": "neko_launcher",
        "event": event_type,
        "ts": datetime.now(timezone.utc).isoformat(),
        "launch_id": LAUNCH_ID,
        "payload": payload or {},
    }
    print(f"NEKO_EVENT {json.dumps(envelope, ensure_ascii=True, separators=(',', ':'))}", flush=True)


def report_startup_failure(message: str, show_dialog: bool = True):
    """统一报告启动失败信息：终端 + （可选）弹窗。"""
    print(message, flush=True)
    emit_frontend_event("startup_failure", {"message": message})
    if show_dialog and getattr(sys, 'frozen', False):
        _show_error_dialog(message)


def _get_last_error() -> int:
    """获取最近一次 Win32 错误码。"""
    if sys.platform != 'win32':
        return 0
    return ctypes.windll.kernel32.GetLastError()


def setup_job_object():
    """
    创建 Windows Job Object 并将当前进程加入其中。
    设置 JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE 标志，
    这样当主进程被 kill 时，OS 会自动终止所有子进程，
    防止孤儿进程悬挂。
    """
    global JOB_HANDLE
    if sys.platform != 'win32':
        return None

    try:
        kernel32 = ctypes.windll.kernel32

        # Job Object 常量
        JOB_OBJECT_EXTENDED_LIMIT_INFORMATION = 9
        JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE = 0x2000

        # 先检查当前进程是否已在某个 Job 中（Steam 场景常见）
        is_in_job = ctypes.c_int(0)
        current_process = kernel32.GetCurrentProcess()
        if not kernel32.IsProcessInJob(current_process, None, ctypes.byref(is_in_job)):
            print(f"[Launcher] Warning: IsProcessInJob failed (err={_get_last_error()})", flush=True)
            is_in_job.value = 0

        # 创建 Job Object
        job = kernel32.CreateJobObjectW(None, None)
        if not job:
            print(f"[Launcher] Warning: Failed to create Job Object (err={_get_last_error()})", flush=True)
            return None

        # 设置 Job Object 信息
        # JOBOBJECT_EXTENDED_LIMIT_INFORMATION 结构体
        # 我们只需要设置 BasicLimitInformation.LimitFlags
        class JOBOBJECT_BASIC_LIMIT_INFORMATION(ctypes.Structure):
            _fields_ = [
                ('PerProcessUserTimeLimit', ctypes.c_int64),
                ('PerJobUserTimeLimit', ctypes.c_int64),
                ('LimitFlags', ctypes.c_uint32),
                ('MinimumWorkingSetSize', ctypes.c_size_t),
                ('MaximumWorkingSetSize', ctypes.c_size_t),
                ('ActiveProcessLimit', ctypes.c_uint32),
                ('Affinity', ctypes.c_size_t),
                ('PriorityClass', ctypes.c_uint32),
                ('SchedulingClass', ctypes.c_uint32),
            ]

        class IO_COUNTERS(ctypes.Structure):
            _fields_ = [
                ('ReadOperationCount', ctypes.c_uint64),
                ('WriteOperationCount', ctypes.c_uint64),
                ('OtherOperationCount', ctypes.c_uint64),
                ('ReadTransferCount', ctypes.c_uint64),
                ('WriteTransferCount', ctypes.c_uint64),
                ('OtherTransferCount', ctypes.c_uint64),
            ]

        class JOBOBJECT_EXTENDED_LIMIT_INFORMATION(ctypes.Structure):
            _fields_ = [
                ('BasicLimitInformation', JOBOBJECT_BASIC_LIMIT_INFORMATION),
                ('IoInfo', IO_COUNTERS),
                ('ProcessMemoryLimit', ctypes.c_size_t),
                ('JobMemoryLimit', ctypes.c_size_t),
                ('PeakProcessMemoryUsed', ctypes.c_size_t),
                ('PeakJobMemoryUsed', ctypes.c_size_t),
            ]

        info = JOBOBJECT_EXTENDED_LIMIT_INFORMATION()
        info.BasicLimitInformation.LimitFlags = JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE

        result = kernel32.SetInformationJobObject(
            job,
            JOB_OBJECT_EXTENDED_LIMIT_INFORMATION,
            ctypes.byref(info),
            ctypes.sizeof(info)
        )
        if not result:
            print(f"[Launcher] Warning: Failed to set Job Object info (err={_get_last_error()})", flush=True)
            kernel32.CloseHandle(job)
            return None

        # 将当前进程加入 Job Object
        result = kernel32.AssignProcessToJobObject(job, current_process)
        if not result:
            err = _get_last_error()
            if is_in_job.value:
                print(
                    f"[Launcher] Warning: Process is already inside another Job; "
                    f"nested Job assignment failed (err={err}). "
                    "Will rely on explicit process-tree cleanup fallback.",
                    flush=True
                )
            else:
                print(f"[Launcher] Warning: Failed to assign process to Job Object (err={err})", flush=True)
            kernel32.CloseHandle(job)
            return None

        # 保持 handle 在进程生命周期内有效（模块级引用）
        # 进程退出时句柄会关闭，触发 KILL_ON_JOB_CLOSE
        JOB_HANDLE = job
        print("[Launcher] Job Object created - child processes will auto-terminate on exit", flush=True)
        return job

    except Exception as e:
        print(f"[Launcher] Warning: Job Object setup failed: {e}", flush=True)
        return None

# 服务器配置（按内存占用从轻到重排列，用于分步启动以降低峰值内存）
SERVERS = [
    {
        'name': 'Memory Server',
        'module': 'memory_server',
        'port': MEMORY_SERVER_PORT,
        'process': None,
        'ready_event': None,
    },
    {
        'name': 'Main Server',
        'module': 'main_server',
        'port': MAIN_SERVER_PORT,
        'process': None,
        'ready_event': None,
    },
    {
        'name': 'Agent Server',
        'module': 'agent_server',
        'port': TOOL_SERVER_PORT,
        'process': None,
        'ready_event': None,
    },
]

# 不再启动主程序，用户自己启动 lanlan_frd.exe

def run_memory_server(
    ready_event: Event,
    import_event: Event | None = None,
    shutdown_event: Event | None = None,
):
    """运行 Memory Server"""
    try:
        _reload_runtime_config_from_env()
        # 确保工作目录正确
        if getattr(sys, 'frozen', False):
            if hasattr(sys, '_MEIPASS'):
                # PyInstaller
                os.chdir(sys._MEIPASS)
            else:
                # Nuitka
                os.chdir(os.path.dirname(os.path.abspath(__file__)))
            # 禁用 typeguard（子进程需要重新禁用）
            try:
                import typeguard
                def dummy_typechecked(func=None, **kwargs):
                    return func if func else (lambda f: f)
                typeguard.typechecked = dummy_typechecked
                if hasattr(typeguard, '_decorators'):
                    typeguard._decorators.typechecked = dummy_typechecked
            except: # noqa
                pass

        import memory_server
        import uvicorn
        if import_event:
            import_event.set()

        print(f"[Memory Server] Starting on port {MEMORY_SERVER_PORT}")

        _behind_proxy = os.environ.get("NEKO_BEHIND_PROXY", "").strip().lower() in ("1", "true", "yes")
        # 使用 Server 对象，在启动后通知父进程
        config = uvicorn.Config(
            app=memory_server.app,
            host="127.0.0.1",
            port=MEMORY_SERVER_PORT,
            log_level="error",
            proxy_headers=_behind_proxy,
            forwarded_allow_ips="*" if _behind_proxy else None,
        )
        server = uvicorn.Server(config)

        if shutdown_event is not None:
            def _watch_shutdown() -> None:
                shutdown_event.wait()
                print("[Memory Server] Shutdown requested by launcher", flush=True)
                server.should_exit = True

            threading.Thread(target=_watch_shutdown, name="memory-shutdown-watch", daemon=True).start()

        # 在后台线程中运行服务器
        import asyncio

        async def run_with_notify():
            # 启动服务器
            await server.serve()

        # 启动线程来运行服务器，并在启动后通知
        def run_server():
            # 创建事件循环
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)

            # 添加启动完成的回调
            async def startup():
                print(f"[Memory Server] Running on port {MEMORY_SERVER_PORT}")
                ready_event.set()

            # 将 startup 添加到服务器的启动事件
            server.config.app.add_event_handler("startup", startup)

            # 运行服务器
            loop.run_until_complete(server.serve())

        run_server()

    except Exception as e:
        print(f"Memory Server error: {e}")
        import traceback
        traceback.print_exc()

def run_agent_server(
    ready_event: Event,
    import_event: Event | None = None,
    shutdown_event: Event | None = None,
):
    """运行 Agent Server (不需要等待初始化)"""
    try:
        _reload_runtime_config_from_env()
        # 确保工作目录正确
        if getattr(sys, 'frozen', False):
            if hasattr(sys, '_MEIPASS'):
                # PyInstaller
                os.chdir(sys._MEIPASS)
            else:
                # Nuitka
                os.chdir(os.path.dirname(os.path.abspath(__file__)))
            # 禁用 typeguard（子进程需要重新禁用）
            try:
                import typeguard
                def dummy_typechecked(func=None, **kwargs):
                    return func if func else (lambda f: f)
                typeguard.typechecked = dummy_typechecked
                if hasattr(typeguard, '_decorators'):
                    typeguard._decorators.typechecked = dummy_typechecked
            except: # noqa
                pass

        import agent_server
        import uvicorn
        if import_event:
            import_event.set()

        print(f"[Agent Server] Starting on port {TOOL_SERVER_PORT}")

        # Agent Server 不需要等待，立即通知就绪
        ready_event.set()

        _behind_proxy = os.environ.get("NEKO_BEHIND_PROXY", "").strip().lower() in ("1", "true", "yes")
        config = uvicorn.Config(
            app=agent_server.app,
            host="127.0.0.1",
            port=TOOL_SERVER_PORT,
            log_level="error",
            proxy_headers=_behind_proxy,
            forwarded_allow_ips="*" if _behind_proxy else None,
        )
        server = uvicorn.Server(config)

        if shutdown_event is not None:
            def _watch_shutdown() -> None:
                shutdown_event.wait()
                print("[Agent Server] Shutdown requested by launcher", flush=True)
                server.should_exit = True

            threading.Thread(target=_watch_shutdown, name="agent-shutdown-watch", daemon=True).start()

        server.run()
    except Exception as e:
        print(f"Agent Server error: {e}")
        import traceback
        traceback.print_exc()

def run_main_server(
    ready_event: Event,
    import_event: Event | None = None,
    shutdown_event: Event | None = None,
):
    """运行 Main Server"""
    try:
        _reload_runtime_config_from_env()
        # 确保工作目录正确
        if getattr(sys, 'frozen', False):
            if hasattr(sys, '_MEIPASS'):
                # PyInstaller
                os.chdir(sys._MEIPASS)
            else:
                # Nuitka
                os.chdir(os.path.dirname(os.path.abspath(__file__)))

        print("[Main Server] Importing main_server module...")
        import main_server
        import uvicorn
        if import_event:
            import_event.set()

        print(f"[Main Server] Starting on port {MAIN_SERVER_PORT}")

        _behind_proxy = os.environ.get("NEKO_BEHIND_PROXY", "").strip().lower() in ("1", "true", "yes")
        # 直接运行 FastAPI app，不依赖 main_server 的 __main__ 块
        config = uvicorn.Config(
            app=main_server.app,
            host="127.0.0.1",
            port=MAIN_SERVER_PORT,
            log_level="error",
            loop="asyncio",
            reload=False,
            proxy_headers=_behind_proxy,
            forwarded_allow_ips="*" if _behind_proxy else None,
        )
        server = uvicorn.Server(config)

        if shutdown_event is not None:
            def _watch_shutdown() -> None:
                shutdown_event.wait()
                print("[Main Server] Shutdown requested by launcher", flush=True)
                server.should_exit = True

            threading.Thread(target=_watch_shutdown, name="main-shutdown-watch", daemon=True).start()

        # 添加启动完成的回调
        async def startup():
            print(f"[Main Server] Running on port {MAIN_SERVER_PORT}")
            ready_event.set()

        # 将 startup 添加到服务器的启动事件
        main_server.app.add_event_handler("startup", startup)

        # 运行服务器
        server.run()
    except Exception as e:
        # 兜底崩溃日志：即使主日志系统未初始化，也能保留首个异常原因
        try:
            import traceback
            crash_file = os.path.join(os.getcwd(), "main_server_bootstrap_crash.log")
            with open(crash_file, "a", encoding="utf-8") as f:
                f.write("\n" + "=" * 80 + "\n")
                f.write(f"[{datetime.now().isoformat()}] Main Server bootstrap error: {e}\n")
                f.write(traceback.format_exc())
                f.write("\n")
        except Exception:
            pass
        print(f"Main Server error: {e}")
        import traceback
        traceback.print_exc()

def check_port(port: int, timeout: float = 0.5) -> bool:
    """检查端口是否已开放"""
    try:
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        sock.settimeout(timeout)
        result = sock.connect_ex(('127.0.0.1', port))
        sock.close()
        return result == 0
    except: # noqa
        return False


def get_port_owners(port: int) -> list[int]:
    """查询监听指定端口的进程 PID 列表（尽力而为）。"""
    pids: set[int] = set()
    try:
        if sys.platform == 'win32':
            result = subprocess.run(
                ["netstat", "-ano", "-p", "tcp"],
                capture_output=True,
                text=True,
                timeout=3,
                check=False,
            )
            needle = f":{port}"
            for raw in result.stdout.splitlines():
                line = raw.strip()
                if "LISTENING" not in line or needle not in line:
                    continue
                parts = line.split()
                if not parts:
                    continue
                pid_str = parts[-1]
                if pid_str.isdigit():
                    pids.add(int(pid_str))
        else:
            result = subprocess.run(
                ["lsof", "-nP", f"-iTCP:{port}", "-sTCP:LISTEN", "-t"],
                capture_output=True,
                text=True,
                timeout=3,
                check=False,
            )
            for line in result.stdout.splitlines():
                s = line.strip()
                if s.isdigit():
                    pids.add(int(s))
    except Exception:
        pass
    return sorted(pids)


def _is_port_bindable(port: int) -> bool:
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        set_port_probe_reuse(sock)
        sock.bind(("127.0.0.1", port))
        return True
    except OSError:
        return False
    finally:
        sock.close()


def _pick_fallback_port(preferred_port: int, reserved: set[int]) -> int | None:
    # 1) Prefer nearby ports first
    for port in range(preferred_port + 1, min(preferred_port + 101, 65535)):
        if port in reserved or port in AVOID_FALLBACK_PORTS:
            continue
        if _is_port_bindable(port):
            return port
    # 2) Fallback to any OS-assigned free port
    try:
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        set_port_probe_reuse(sock)
        sock.bind(("127.0.0.1", 0))
        port = int(sock.getsockname()[1])
        sock.close()
        if port not in reserved and port not in AVOID_FALLBACK_PORTS:
            return port
    except Exception:
        pass
    return None


def _classify_port_conflict(
    port: int,
    excluded_ranges: list[tuple[int, int]] | None = None,
) -> tuple[str, list]:
    """对端口不可用原因进行分类。

    返回 ``(reason, owners)``，其中 reason 为以下之一：
    - ``"neko"``            已有 N.E.K.O 服务占用
    - ``"hyperv_excluded"`` 位于 Hyper-V / WSL 保留端口范围
    - ``"other_process"``   被非 N.E.K.O 进程监听
    - ``"unknown"``         无法绑定但原因不明确
    owners 为监听该端口的进程 ID 列表。
    """
    health = probe_neko_health(port)
    if health is not None:
        return "neko", get_port_owners(port)
    # 将 excluded_ranges 解析一次，避免重复 netsh 子进程调用
    ranges = excluded_ranges if excluded_ranges is not None else get_hyperv_excluded_ranges()
    if is_port_in_excluded_range(port, ranges):
        return "hyperv_excluded", []
    owners = get_port_owners(port)
    if owners:
        return "other_process", owners
    return "unknown", []


def apply_port_strategy() -> bool | str:
    """优先使用默认端口，必要时自动规避冲突。

    返回值：
        ``True``      端口规划完成，可继续启动服务。
        ``False``     发生致命错误，需中止启动。
        ``"attach"`` 默认端口已由现有 N.E.K.O 后端完整占用。

    策略：
    1. 默认端口若已是 N.E.K.O 服务，则视为可复用。
    2. 若被 Hyper-V/WSL 保留或其他进程占用，则选择 fallback 端口。
    """
    global MAIN_SERVER_PORT, MEMORY_SERVER_PORT, TOOL_SERVER_PORT
    chosen: dict[str, int] = {}
    chosen_internal: dict[str, int] = {}
    fallback_details: list[dict] = []
    internal_fallback_details: list[dict] = []
    reserved: set[int] = set()

    # 预先查询 Hyper-V 保留端口范围，避免重复子进程调用
    excluded_ranges = get_hyperv_excluded_ranges()
    if excluded_ranges:
        print(f"[Launcher] Detected {len(excluded_ranges)} Hyper-V/WSL excluded port range(s)", flush=True)

    for key in ("MEMORY_SERVER_PORT", "TOOL_SERVER_PORT", "MAIN_SERVER_PORT"):
        preferred = int(DEFAULT_PORTS[key])
        if preferred not in reserved and _is_port_bindable(preferred):
            chosen[key] = preferred
            reserved.add(preferred)
            continue

        # 端口不可绑定，识别具体原因（同时获取 owners 避免重复查询）
        reason, owners = _classify_port_conflict(preferred, excluded_ranges)

        if reason == "neko":
            # 已有 N.E.K.O 实例占用该端口。
            # 仍记录为 chosen，并打标记供前端决定“附加复用”而非“重复拉起”。
            chosen[key] = preferred
            reserved.add(preferred)
            fallback_details.append(
                {
                    "port_key": key,
                    "preferred": preferred,
                    "selected": preferred,
                    "reason": "existing_neko",
                    "owners": owners,
                }
            )
            continue

        # 需要选择回退端口
        fallback = _pick_fallback_port(preferred, reserved)
        if fallback is None:
            report_startup_failure(
                f"Startup failed: no fallback port available for {key} "
                f"(preferred={preferred}, reason={reason}, owners={owners})"
            )
            return False

        chosen[key] = fallback
        reserved.add(fallback)
        fallback_details.append(
            {
                "port_key": key,
                "preferred": preferred,
                "selected": fallback,
                "reason": reason,
                "owners": owners,
            }
        )

    MAIN_SERVER_PORT = chosen["MAIN_SERVER_PORT"]
    MEMORY_SERVER_PORT = chosen["MEMORY_SERVER_PORT"]
    TOOL_SERVER_PORT = chosen["TOOL_SERVER_PORT"]

    for key, preferred in INTERNAL_DEFAULT_PORTS.items():
        if preferred not in reserved and _is_port_bindable(preferred):
            chosen_internal[key] = preferred
            reserved.add(preferred)
            continue

        owners = get_port_owners(preferred)
        fallback = _pick_fallback_port(preferred, reserved)
        if fallback is None:
            report_startup_failure(
                f"Startup failed: no fallback port available for {key} (preferred={preferred}, owners={owners})"
            )
            return False

        chosen_internal[key] = fallback
        reserved.add(fallback)
        internal_fallback_details.append(
            {
                "port_key": key,
                "preferred": preferred,
                "selected": fallback,
                "owners": owners,
            }
        )

    for key, value in chosen.items():
        os.environ[f"NEKO_{key}"] = str(value)
    for key, value in chosen_internal.items():
        os.environ[f"NEKO_{key}"] = str(value)

    _sync_runtime_config_globals(chosen, chosen_internal)

    for server in SERVERS:
        if server["module"] == "memory_server":
            server["port"] = MEMORY_SERVER_PORT
        elif server["module"] == "agent_server":
            server["port"] = TOOL_SERVER_PORT
        elif server["module"] == "main_server":
            server["port"] = MAIN_SERVER_PORT

    emit_frontend_event(
        "port_plan",
        {
            "instance_id": INSTANCE_ID,
            "defaults": DEFAULT_PORTS,
            "selected": chosen,
            "internal_defaults": INTERNAL_DEFAULT_PORTS,
            "internal_selected": chosen_internal,
            "fallbacks": fallback_details,
            "internal_fallbacks": internal_fallback_details,
            "fallback_applied": bool(fallback_details or internal_fallback_details),
        },
    )

    # 检查默认端口是否全部由既有 N.E.K.O 占用（existing_neko）。
    # 若是，则 launcher 不应继续拉起新服务。
    existing_neko_keys = {
        d["port_key"]
        for d in fallback_details
        if d.get("reason") == "existing_neko"
    }

    # 记录已存在实例的服务端口键，供 start_server() 跳过重复启动。
    global _existing_neko_services
    _existing_neko_services = existing_neko_keys

    if existing_neko_keys == set(DEFAULT_PORTS.keys()):
        # 默认端口上的完整 N.E.K.O 后端已在运行。
        emit_frontend_event(
            "attach_existing",
            {
                "selected": chosen,
                "message": "All default ports occupied by an existing N.E.K.O backend",
            },
        )
        print("[Launcher] Existing N.E.K.O backend detected on all default ports; attaching.", flush=True)
        return "attach"

    # 区分“复用已有实例”与“真正端口回退”的日志
    real_fallbacks = [d for d in fallback_details if d.get("reason") != "existing_neko"]
    if real_fallbacks or internal_fallback_details:
        print(
            f"[Launcher] Port fallback applied: public={real_fallbacks}, internal={internal_fallback_details}",
            flush=True,
        )
    elif existing_neko_keys:
        print(
            f"[Launcher] Ports reused from existing N.E.K.O instance: {sorted(existing_neko_keys)}",
            flush=True,
        )
    else:
        print("[Launcher] Preferred ports available; no fallback needed.", flush=True)
    return True

def show_spinner(stop_event: threading.Event, message: str = "正在启动服务器"):
    """显示转圈圈动画"""
    spinner = itertools.cycle(['⠋', '⠙', '⠹', '⠸', '⠼', '⠴', '⠦', '⠧', '⠇', '⠏'])
    while not stop_event.is_set():
        sys.stdout.write(f'\r{message}... {next(spinner)} ')
        sys.stdout.flush()
        time.sleep(0.1)
    sys.stdout.write('\r' + ' ' * 60 + '\r')  # 清除动画行
    sys.stdout.write('\n')  # 换行，确保后续输出在新行
    sys.stdout.flush()

def start_server(server: Dict) -> bool:
    """启动单个服务器"""
    try:
        port = server.get('port')

        port_key = MODULE_TO_PORT_KEY.get(server['module'])

        # If this service's port already has a running N.E.K.O instance,
        # skip launching (the existing process will serve requests).
        if port_key and port_key in _existing_neko_services:
            print(f"✓ {server['name']} already running on port {port} (existing N.E.K.O instance)", flush=True)
            server['ready_event'] = Event()
            server['ready_event'].set()  # Mark as ready immediately
            return True

        if isinstance(port, int) and check_port(port):
            owner_pids = get_port_owners(port)
            owner_suffix = f", owner_pids={owner_pids}" if owner_pids else ""
            report_startup_failure(f"Start failed: {server['name']} port {port} already in use{owner_suffix}")
            return False

        # 根据模块名选择启动函数
        if server['module'] == 'memory_server':
            target_func = run_memory_server
        elif server['module'] == 'agent_server':
            target_func = run_agent_server
        elif server['module'] == 'main_server':
            target_func = run_main_server
        else:
            report_startup_failure(f"Start failed: {server['name']} has unknown module")
            return False

        # 创建进程间同步事件
        server['ready_event'] = Event()
        server['import_event'] = Event()
        server['shutdown_event'] = Event()

        # 使用 multiprocessing 启动服务器
        # 注意：不能设置 daemon=True，因为 main_server 自己会创建子进程
        server['process'] = Process(
            target=target_func,
            args=(server['ready_event'], server['import_event'], server['shutdown_event']),
            daemon=False,
        )
        server['process'].start()

        print(f"✓ {server['name']} 已启动 (PID: {server['process'].pid})", flush=True)
        return True
    except Exception as e:
        report_startup_failure(f"Start failed: {server['name']} exception: {e}")
        return False

def wait_for_servers(timeout: int = 60) -> bool:
    """等待所有服务器启动完成"""
    print("\n等待服务器准备就绪...", flush=True)

    # 启动动画线程
    stop_spinner = threading.Event()
    spinner_thread = threading.Thread(target=show_spinner, args=(stop_spinner, "检查服务器状态"))
    spinner_thread.daemon = True
    spinner_thread.start()

    start_time = time.time()
    all_ready = False

    # 第一步：等待所有端口就绪
    while time.time() - start_time < timeout:
        # 若某个子进程提前退出，立即报错而不是等到超时
        for server in SERVERS:
            proc = server.get('process')
            if proc is not None and not proc.is_alive() and not check_port(server['port']):
                report_startup_failure(
                    f"Startup failed: {server['name']} exited early (exitcode={proc.exitcode})"
                )
                stop_spinner.set()
                spinner_thread.join()
                return False

        ready_count = 0
        for server in SERVERS:
            if check_port(server['port']):
                ready_count += 1

        if ready_count == len(SERVERS):
            break

        time.sleep(0.5)

    # 第二步：等待所有服务器的 ready_event（同步初始化完成）
    if ready_count == len(SERVERS):
        for server in SERVERS:
            remaining_time = timeout - (time.time() - start_time)
            if remaining_time > 0:
                if server['ready_event'].wait(timeout=remaining_time):
                    continue
                else:
                    # 超时
                    break
        else:
            # 所有服务器都就绪了
            all_ready = True

    # 停止动画
    stop_spinner.set()
    spinner_thread.join()

    if all_ready:
        print("\n", flush=True)
        print("=" * 60, flush=True)
        print("✓✓✓  所有服务器已准备就绪！  ✓✓✓", flush=True)
        print("=" * 60, flush=True)
        print("\n", flush=True)
        return True
    else:
        print("\n", flush=True)
        print("=" * 60, flush=True)
        print("✗ 服务器启动超时，请检查日志文件", flush=True)
        print("=" * 60, flush=True)
        print("\n", flush=True)
        report_startup_failure("Startup timeout: at least one service did not become ready")
        # 显示未就绪的服务器
        for server in SERVERS:
            if not server['ready_event'].is_set():
                print(f"  - {server['name']} 初始化未完成", flush=True)
            elif not check_port(server['port']):
                print(f"  - {server['name']} 端口 {server['port']} 未就绪", flush=True)
        return False