N.E.K.O/launcher.py at main · MingTianSang/N.E.K.O · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# -*- coding: utf-8 -*-
"""
N.E.K.O. 统一启动器
启动所有服务器，等待它们准备就绪后启动主程序，并监控主程序状态
"""
from __future__ import annotations

import sys
import os
import io
import signal

def _configure_stdio_utf8() -> None:
    """Switch ``sys.stdout`` / ``sys.stderr`` to UTF-8 when running on Windows.

    The existing stream objects are reconfigured in place and never replaced,
    so pytest capture, IDE consoles and other embedding hosts that already
    redirected them keep working. A stream that is missing, has no callable
    ``reconfigure`` attribute, or fails to reconfigure is left untouched.
    On every other platform this function does nothing.
    """
    if sys.platform == 'win32':
        streams = (getattr(sys, attr, None) for attr in ('stdout', 'stderr'))
        for target in streams:
            if target is None:
                continue
            try:
                method = getattr(target, 'reconfigure', None)
                if callable(method):
                    method(encoding='utf-8', errors='replace')
            except Exception:
                # Best effort only: an unusual host stream must not abort startup.
                pass


# Reconfigure once at module import time: even when launcher is imported as a
# module (for example tests/unit/test_cloudsave_startup_flow.py imports it in
# 8 places), Chinese log output must not crash on Windows. stream.reconfigure
# is idempotent, so the second call made from _bootstrap_launcher_runtime is
# effectively a no-op.
_configure_stdio_utf8()


# Packaged-build detection: PyInstaller sets ``sys.frozen``; Nuitka defines
# ``__compiled__`` in the module globals.
IS_FROZEN = getattr(sys, 'frozen', False) or '__compiled__' in globals()

# Directory that holds bundled resources. PyInstaller exposes its unpack
# directory as ``sys._MEIPASS``; Nuitka (or any other packager) and a plain
# source checkout use the directory containing this file.
if IS_FROZEN and hasattr(sys, '_MEIPASS'):
    bundle_dir = sys._MEIPASS
else:
    bundle_dir = os.path.dirname(os.path.abspath(__file__))

if IS_FROZEN:
    # tiktoken encodings (e.g. o200k_base) load merge tables from TIKTOKEN_CACHE_DIR;
    # build_nuitka.bat pre-fetches into data/tiktoken_cache for offline use.
    _tiktoken_cache = os.path.join(bundle_dir, "data", "tiktoken_cache")
    if os.path.isdir(_tiktoken_cache):
        os.environ.setdefault("TIKTOKEN_CACHE_DIR", _tiktoken_cache)


def _configure_ssl_cert_bundle() -> None:
    """Repair CA-bundle environment variables, but only inside frozen builds.

    Handles ``SSL_CERT_FILE``, ``REQUESTS_CA_BUNDLE`` and ``CURL_CA_BUNDLE``.

    * If all three already point at something usable, nothing is changed.
    * In source mode (``IS_FROZEN`` false) nothing is changed either: the
      system/venv trust chain may carry private enterprise CAs that the static
      certifi bundle lacks, and a deliberately broken value cannot be told
      apart from an inherited one.
    * In a frozen build every missing or stale variable is filled in. The
      rationale given by the original author is that the bundled ``libssl``
      has a compile-time OPENSSLDIR pointing at the build machine, so without
      these variables no root certificates are found.

    The fallback bundle is ``certifi.where()`` when it is importable and
    exists, otherwise ``<bundle_dir>/certifi/cacert.pem``. If neither exists a
    warning is printed and the environment is left as it is.
    """
    env_names = ("SSL_CERT_FILE", "REQUESTS_CA_BUNDLE", "CURL_CA_BUNDLE")

    def _is_usable(env_name: str) -> bool:
        current = os.environ.get(env_name)
        if not current:
            return False
        if os.path.isfile(current):
            return True
        # Only REQUESTS_CA_BUNDLE may legitimately be a (c_rehash'ed) CA
        # directory; SSL_CERT_FILE and CURL_CA_BUNDLE must be PEM files, their
        # directory forms live in SSL_CERT_DIR / CURL_CA_PATH.
        return env_name == "REQUESTS_CA_BUNDLE" and os.path.isdir(current)

    # Everything already valid: leave the environment completely alone.
    if all(map(_is_usable, env_names)):
        return

    # Source mode: keep the system default trust chain.
    if not IS_FROZEN:
        return

    bundle_path: str | None = None
    try:
        import certifi  # noqa: WPS433 - imported lazily to keep module import cheap
        located = certifi.where()
        bundle_path = located if located and os.path.isfile(located) else None
    except Exception:
        bundle_path = None

    if bundle_path is None:
        # Frozen fallback: the build places certifi/cacert.pem under bundle_dir
        # (which equals sys._MEIPASS for PyInstaller).
        shipped = os.path.join(bundle_dir, "certifi", "cacert.pem")
        if os.path.isfile(shipped):
            bundle_path = shipped

    if bundle_path is None:
        # No CA bundle at all: report the root cause instead of letting callers
        # see only a second-hand "certificate verify failed".
        print(
            "[Launcher] Warning: failed to locate CA bundle in frozen build "
            f"(certifi.where() unavailable, no certifi/cacert.pem under {bundle_dir}); "
            "external HTTPS / WSS will fail with certificate verify failed.",
            flush=True,
        )
        return

    # For each broken variable, borrow from the sibling whose semantics are
    # closest before falling back to certifi:
    # - SSL_CERT_FILE: REQUESTS first (more likely the application's trust
    #   bundle), CURL second.
    # - REQUESTS_CA_BUNDLE: CURL first, because requests documents that
    #   fallback; SSL last.
    # - CURL_CA_BUNDLE: SSL first (system-wide trust), REQUESTS last.
    # Only files are propagated: a REQUESTS_CA_BUNDLE directory cannot be fed
    # to OpenSSL / curl through these variables.
    donor_order = {
        "SSL_CERT_FILE": ("REQUESTS_CA_BUNDLE", "CURL_CA_BUNDLE"),
        "REQUESTS_CA_BUNDLE": ("CURL_CA_BUNDLE", "SSL_CERT_FILE"),
        "CURL_CA_BUNDLE": ("SSL_CERT_FILE", "REQUESTS_CA_BUNDLE"),
    }

    # Plain assignment rather than setdefault: a stale inherited value must be
    # overwritten, otherwise the verify failures this function exists to
    # prevent would persist.
    for env_name in env_names:
        if _is_usable(env_name):
            continue
        replacement = bundle_path
        for donor in donor_order[env_name]:
            donor_value = os.environ.get(donor)
            if donor_value and os.path.isfile(donor_value):
                replacement = donor_value
                break
        os.environ[env_name] = replacement


# Must run before any module that triggers `import ssl`. Per the original
# author's note, Python's ssl module locks in the default verify paths through
# OpenSSL on first import, so environment variables set afterwards do not
# affect an already-created SSLContext. The `from utils.* import ...` lines
# further down already pull in the httpx / openai SDK chain, hence this runs
# ahead of them.
#
# An explicit check is used instead of `assert`: `python -O` strips asserts and
# would turn the check into a silent pass. Any regression that imports ssl
# before this point should be directly visible to operators.
if "ssl" in sys.modules:
    print(
        "[Launcher] Warning: `ssl` was imported before _configure_ssl_cert_bundle() ran; "
        "SSL_CERT_FILE override won't affect the already-initialized default SSLContext. "
        "Move SSL bootstrap higher in launcher.py.",
        flush=True,
    )
_configure_ssl_cert_bundle()


def _get_project_venv_python(project_dir: str) -> str | None:
    """Return the interpreter path inside ``<project_dir>/.venv``, or ``None``.

    The expected location is ``.venv/Scripts/python.exe`` on Windows and
    ``.venv/bin/python`` everywhere else. Only existence of the path is
    checked.
    """
    relative = ('Scripts', 'python.exe') if sys.platform == 'win32' else ('bin', 'python')
    interpreter = os.path.join(project_dir, '.venv', *relative)
    if os.path.exists(interpreter):
        return interpreter
    return None


def _maybe_reexec_into_project_venv(project_dir: str) -> None:
    """Prefer the repo-local virtualenv when launching from source.

    Users often invoke ``python launcher.py`` with the system interpreter.
    When that interpreter differs from the project's managed ``.venv``,
    imports fail even though the dependency is already installed locally.

    Returns without doing anything when running a frozen build, when
    ``sys.prefix`` already is ``<project_dir>/.venv``, when ``sys.executable``
    is unknown, when the project has no ``.venv`` interpreter, or when the
    current interpreter already is that interpreter. Otherwise the current
    process is replaced through ``os.execv`` and this function never returns.

    Args:
        project_dir: Directory expected to contain the ``.venv`` folder.
    """
    if IS_FROZEN:
        return

    # Compare the active environment root with the project's .venv, ignoring
    # case differences. This accepts `uv run` while still rejecting unrelated
    # virtual environments.
    expected_venv_dir = os.path.abspath(os.path.join(project_dir, ".venv"))
    current_venv_dir = os.path.abspath(sys.prefix)
    if os.path.normcase(current_venv_dir) == os.path.normcase(expected_venv_dir):
        return

    # Test the raw value: os.path.abspath("") yields the current working
    # directory, so checking the abspath result could never detect a missing
    # sys.executable.
    if not sys.executable:
        return
    current_executable = os.path.abspath(sys.executable)

    candidate = _get_project_venv_python(project_dir)
    if not candidate:
        return

    target_executable = os.path.abspath(candidate)
    # normcase keeps this comparison consistent with the prefix check above.
    if os.path.normcase(current_executable) == os.path.normcase(target_executable):
        return

    # exec* replaces the process without flushing Python-level buffers, so the
    # message has to be flushed explicitly or it may never be shown.
    print(f"[Launcher] 当前解释器不是项目虚拟环境，正在切换到: {candidate}", flush=True)
    os.execv(target_executable, [target_executable] + sys.argv)

import subprocess
import socket
import time
import threading
import itertools
import ctypes
import atexit
import signal
import json
import logging
import uuid
import importlib
import multiprocessing
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict
from multiprocessing import Process, freeze_support, Event
import config as config_module
from config import APP_NAME, MAIN_SERVER_PORT, MEMORY_SERVER_PORT, TOOL_SERVER_PORT
from utils.port_utils import (
    probe_neko_health,
    acquire_startup_lock,
    release_startup_lock,
    get_hyperv_excluded_ranges,
    is_port_in_excluded_range,
    set_port_probe_reuse,
)
from utils.cloudsave_runtime import (
    ROOT_MODE_BOOTSTRAP_IMPORTING,
    ROOT_MODE_MAINTENANCE_READONLY,
    ROOT_MODE_NORMAL,
    bootstrap_local_cloudsave_environment,
    cloud_apply_fence,
    set_root_mode,
    should_write_root_mode_normal_after_startup,
)
from utils.cloudsave_autocloud import get_cloudsave_manager
from utils.config_manager import get_config_manager, reset_config_manager_cache
from utils.storage_layout import clear_storage_layout_env, export_storage_layout_to_env, resolve_storage_layout
from utils.storage_migration import run_pending_storage_migration
from utils.storage_policy import paths_equal


def _configure_multiprocessing_executable(project_dir: str) -> None:
    """Make ``multiprocessing`` start children with the project's ``.venv`` Python.

    Intended for the spawn start method used on macOS/Windows. Nothing happens
    in a frozen build or when the project has no ``.venv`` interpreter. A
    failure to set the executable is reported as a warning, never raised.
    """
    venv_python = None if IS_FROZEN else _get_project_venv_python(project_dir)
    if not venv_python:
        return

    try:
        multiprocessing.set_executable(os.path.abspath(venv_python))
    except Exception as exc:
        print(f"[Launcher] Warning: failed to pin multiprocessing executable: {exc}", flush=True)


# Unique identifier of this launcher start; filled in by
# _initialize_launcher_context().
LAUNCH_ID = ""
# Instance id: initialised only on the explicit startup path so that merely
# importing this module does not modify the process environment.
INSTANCE_ID = ""

# Windows Job Object handle kept alive for the process lifetime; assigned by
# setup_job_object().
JOB_HANDLE = None
_cleanup_lock = threading.Lock()
_cleanup_done = False
# Set to True by _mark_expected_launcher_shutdown().
_expected_launcher_shutdown = False
_existing_neko_services: set[str] = set()  # port keys already occupied by an existing N.E.K.O instance
# Public service ports, taken from the imported ``config`` values.
DEFAULT_PORTS = {
    "MAIN_SERVER_PORT": MAIN_SERVER_PORT,
    "MEMORY_SERVER_PORT": MEMORY_SERVER_PORT,
    "TOOL_SERVER_PORT": TOOL_SERVER_PORT,
}
# Default ports for internal (service-to-service) channels.
INTERNAL_DEFAULT_PORTS = {
    "USER_PLUGIN_SERVER_PORT": 48916,
    "AGENT_MQ_PORT": 48917,
    "MAIN_AGENT_EVENT_PORT": 48918,
    "ZMQ_SESSION_PUB_PORT": 48961,
    "ZMQ_AGENT_PUSH_PORT": 48962,
    "ZMQ_ANALYZE_PUSH_PORT": 48963,
}
# This range is reserved for N.E.K.O's known default ports so that fallback
# port selection does not collide with companion services.
AVOID_FALLBACK_PORTS = set(range(48911, 48919)) | {48961, 48962, 48963}

# Module name -> port key (used to decide whether an existing N.E.K.O instance
# already occupies the corresponding port).
MODULE_TO_PORT_KEY: dict[str, str] = {
    "memory_server": "MEMORY_SERVER_PORT",
    "agent_server": "TOOL_SERVER_PORT",
    "main_server": "MAIN_SERVER_PORT",
}
# Module order used by _iter_servers_for_shutdown() to rank servers.
SHUTDOWN_MODULE_ORDER = (
    "main_server",
    "memory_server",
    "agent_server",
)


def _sync_runtime_config_globals(
    selected_public: dict[str, int] | None = None,
    selected_internal: dict[str, int] | None = None,
) -> None:
    """Copy launcher-selected runtime values onto the imported ``config`` module.

    Writing ``os.environ`` alone is not sufficient: with ``fork`` (common on
    Linux) children inherit the parent's already-imported ``config`` module
    object, and with ``spawn`` (common on macOS/Windows) stale globals can
    still be observed when imports happen before the launcher's overrides are
    reloaded. Setting the module attributes directly keeps forked children and
    later imports aligned with the negotiated ports and shared instance id.

    Args:
        selected_public: Optional mapping of public port names to values.
        selected_internal: Optional mapping of internal port names to values;
            applied after ``selected_public`` and therefore wins on key clashes.
    """
    pending: dict[str, int | str] = {"INSTANCE_ID": INSTANCE_ID}
    for overrides in (selected_public, selected_internal):
        if overrides:
            pending.update(overrides)

    for attribute, value in pending.items():
        setattr(config_module, attribute, value)


def _reload_runtime_config_from_env() -> None:
    """Reload ``config`` in the current process and refresh launcher globals.

    A forked child may still carry module state from earlier imports even
    after the parent updated ``config``. Reloading ``config`` (which, per the
    original author, reads the negotiated ``NEKO_*`` environment variables)
    gives a server process a fresh source of truth before it imports its
    heavy application modules. The instance id and the three public ports are
    rebound as module globals here, then written back onto ``config``
    together with three internal ports.
    """
    global INSTANCE_ID, MAIN_SERVER_PORT, MEMORY_SERVER_PORT, TOOL_SERVER_PORT

    fresh = importlib.reload(config_module)
    INSTANCE_ID = str(fresh.INSTANCE_ID)
    MAIN_SERVER_PORT = int(fresh.MAIN_SERVER_PORT)
    MEMORY_SERVER_PORT = int(fresh.MEMORY_SERVER_PORT)
    TOOL_SERVER_PORT = int(fresh.TOOL_SERVER_PORT)

    public_ports = {
        "MAIN_SERVER_PORT": MAIN_SERVER_PORT,
        "MEMORY_SERVER_PORT": MEMORY_SERVER_PORT,
        "TOOL_SERVER_PORT": TOOL_SERVER_PORT,
    }
    internal_ports = {
        key: int(getattr(fresh, key))
        for key in ("USER_PLUGIN_SERVER_PORT", "AGENT_MQ_PORT", "MAIN_AGENT_EVENT_PORT")
    }
    _sync_runtime_config_globals(public_ports, internal_ports)


def _install_logging_brace_compat() -> None:
    """Teach ``logging.LogRecord.getMessage`` to tolerate ``{}``-style templates.

    The stock implementation is tried first. Only when it raises ``TypeError``
    and the message has arguments, contains both ``{`` and ``}`` and no ``%``
    is ``str.format`` attempted instead. If that fails as well, the raw
    message is returned with the arguments appended. Installation is recorded
    on the ``logging`` module, so repeated calls patch only once.
    """
    if getattr(logging, "_neko_brace_compat_installed", False):
        return

    stock_get_message = logging.LogRecord.getMessage

    def _compat_get_message(record: logging.LogRecord) -> str:
        try:
            return stock_get_message(record)
        except TypeError:
            template = str(record.msg)
            fmt_args = record.args
            brace_style = (
                bool(fmt_args)
                and "%" not in template
                and "{" in template
                and "}" in template
            )
            if not brace_style:
                # Not a brace template: surface the original formatting error.
                raise
            try:
                if isinstance(fmt_args, dict):
                    return template.format(**fmt_args)
                positional = fmt_args if isinstance(fmt_args, tuple) else (fmt_args,)
                return template.format(*positional)
            except Exception:
                return f"{template} | args={record.args!r}"

    logging.LogRecord.getMessage = _compat_get_message
    logging._neko_brace_compat_installed = True


def _initialize_launcher_context() -> None:
    """Populate per-launch ids and env only during explicit launcher startup.

    * ``LAUNCH_ID`` receives a fresh uuid when it is still empty.
    * ``INSTANCE_ID`` is taken from ``NEKO_INSTANCE_ID`` when that is set and
      non-empty, otherwise generated; the chosen value is written back to the
      environment and synced onto ``config``.
    * ``NO_PROXY`` / ``no_proxy`` are extended with the loopback hosts so local
      service-to-service calls bypass system proxies (Clash, Surge, ...).
      httpx prefers the lowercase name, so both spellings are written.
    """
    global LAUNCH_ID, INSTANCE_ID

    if not LAUNCH_ID:
        LAUNCH_ID = uuid.uuid4().hex

    if not INSTANCE_ID:
        INSTANCE_ID = os.environ.get("NEKO_INSTANCE_ID") or uuid.uuid4().hex
        # Plain assignment instead of setdefault: an existing but *empty*
        # NEKO_INSTANCE_ID would otherwise stay empty in the environment while
        # INSTANCE_ID holds a generated id. A non-empty value is unchanged
        # because INSTANCE_ID was just read from it.
        os.environ["NEKO_INSTANCE_ID"] = INSTANCE_ID
        _sync_runtime_config_globals()

    # Whole-token matching avoids substring false positives such as
    # "127.0.0.1" in "127.0.0.10". An insertion-ordered dict removes
    # duplicates while keeping the user's ordering, so the result is
    # deterministic across runs (set iteration order is not).
    loopback_hosts = ("127.0.0.1", "localhost")
    for env_key in ("NO_PROXY", "no_proxy"):
        raw_value = os.environ.get(env_key, "")
        # Strip before filtering so whitespace-only entries are dropped
        # instead of surviving as empty tokens.
        stripped = (part.strip() for part in raw_value.split(","))
        ordered = dict.fromkeys(token for token in stripped if token)
        ordered.update(dict.fromkeys(loopback_hosts))
        os.environ[env_key] = ",".join(ordered)


def _bootstrap_launcher_runtime(project_dir: str) -> None:
    """Run launcher bootstrap only from the explicit startup path.

    The steps run in a fixed order: stdio encoding, optional re-exec into the
    project virtualenv, ``sys.path`` / working-directory setup, multiprocessing
    interpreter pinning, logging compatibility patch, and finally the
    per-launch ids and proxy environment.

    Args:
        project_dir: Project root; it is put first on ``sys.path`` and becomes
            the current working directory.
    """
    _configure_stdio_utf8()
    # May replace the current process through os.execv and never return.
    _maybe_reexec_into_project_venv(project_dir)
    if project_dir not in sys.path:
        sys.path.insert(0, project_dir)
    os.chdir(project_dir)
    _configure_multiprocessing_executable(project_dir)
    _install_logging_brace_compat()
    _initialize_launcher_context()


def _show_error_dialog(message: str):
    """Show *message* in a native Windows message box; do nothing elsewhere.

    The window title is ``"<APP_NAME> 启动失败"``. Any failure while showing
    the dialog is ignored, because the dialog is only a convenience on top of
    the console output.
    """
    if sys.platform == 'win32':
        try:
            ctypes.windll.user32.MessageBoxW(None, message, f"{APP_NAME} 启动失败", 0x10)
        except Exception:
            pass


def emit_frontend_event(event_type: str, payload: dict | None = None):
    """Print one machine-readable event line on stdout for the Electron frontend.

    The line has the form ``NEKO_EVENT <compact ASCII JSON>``. Every event
    carries the current ``LAUNCH_ID`` so the frontend can ignore events from
    earlier (zombie) launcher processes.

    Args:
        event_type: Event name stored under ``"event"``.
        payload: Optional event data; an empty dict is sent when falsy.
    """
    timestamp = datetime.now(timezone.utc).isoformat()
    envelope = dict(
        source="neko_launcher",
        event=event_type,
        ts=timestamp,
        launch_id=LAUNCH_ID,
        payload=payload or {},
    )
    encoded = json.dumps(envelope, ensure_ascii=True, separators=(',', ':'))
    print(f"NEKO_EVENT {encoded}", flush=True)


def _resolve_storage_layout_for_launch() -> dict:
    """Run any pending storage migration, then resolve and export the layout.

    The storage-layout environment and the config-manager cache are cleared
    first. A migration failure is reported as a warning and recorded as a
    not-attempted result; it does not abort startup. The layout is then
    resolved with a freshly created config manager and exported to the
    environment, and the cache is cleared once more.

    Returns:
        ``{"layout": <resolved layout>, "migration_result": <migration result>}``.
    """
    clear_storage_layout_env()
    reset_config_manager_cache()
    pre_migration_manager = get_config_manager(APP_NAME, migrate=False)

    try:
        outcome = run_pending_storage_migration(pre_migration_manager)
    except Exception as exc:
        print(f"[Launcher] Warning: pending storage migration processing failed: {exc}", flush=True)
        outcome = dict(attempted=False, completed=False, error_message=str(exc))

    # Drop the cached manager so the layout is resolved against the state the
    # migration step left behind.
    reset_config_manager_cache()
    resolved_layout = resolve_storage_layout(get_config_manager(APP_NAME, migrate=False))
    export_storage_layout_to_env(resolved_layout)
    reset_config_manager_cache()
    return {"layout": resolved_layout, "migration_result": outcome}


def _build_launcher_relaunch_command() -> list[str]:
    """Build the argv used to start a new launcher with the same arguments.

    A frozen build is its own executable, so only the original arguments are
    forwarded. From source, the absolute path of this file is inserted after
    the interpreter.
    """
    command = [sys.executable]
    if not IS_FROZEN:
        command.append(os.path.abspath(__file__))
    command.extend(sys.argv[1:])
    return command


def _should_detach_stdio_for_relaunch() -> bool:
    """Return ``True`` when any of stdin/stdout/stderr reports being a TTY.

    Streams that are missing, have no callable ``isatty``, or raise from it
    count as "not a TTY".
    """
    def _reports_tty(handle) -> bool:
        probe = getattr(handle, "isatty", None)
        if not callable(probe):
            return False
        try:
            return bool(probe())
        except Exception:
            return False

    return any(
        _reports_tty(getattr(sys, attr, None))
        for attr in ("stdin", "stdout", "stderr")
    )


def _spawn_restarted_launcher() -> None:
    """Start a detached copy of the launcher with the current arguments.

    The child inherits the current working directory and a copy of the
    environment. Its stdio is sent to the null device when the current process
    is attached to a terminal. On Windows it is created detached in a new
    process group; elsewhere it is started in a new session.
    """
    argv = _build_launcher_relaunch_command()
    child_env = os.environ.copy()
    # ``main_server`` uses this marker only to suppress duplicate module-level
    # init within the *current* Python process tree (mainly Windows spawn).
    # A storage-location relaunch is a brand-new launcher instance and must
    # re-run full startup initialization, so the marker must not be inherited.
    child_env.pop("_NEKO_MAIN_SERVER_INITIALIZED", None)

    popen_options: dict[str, object] = dict(cwd=os.getcwd(), env=child_env, close_fds=True)
    if _should_detach_stdio_for_relaunch():
        for channel in ("stdin", "stdout", "stderr"):
            popen_options[channel] = subprocess.DEVNULL

    if sys.platform != "win32":
        popen_options["start_new_session"] = True
    else:
        # getattr with a 0 default tolerates builds where a flag is missing.
        detach_flags = (
            int(getattr(subprocess, "CREATE_NEW_PROCESS_GROUP", 0))
            | int(getattr(subprocess, "DETACHED_PROCESS", 0))
        )
        if detach_flags:
            popen_options["creationflags"] = detach_flags

    subprocess.Popen(argv, **popen_options)


def _mark_expected_launcher_shutdown() -> None:
    """Record that the launcher is shutting down on purpose."""
    global _expected_launcher_shutdown
    _expected_launcher_shutdown = True


def _is_expected_launcher_shutdown() -> bool:
    """Tell whether an intentional launcher shutdown has been recorded."""
    return True if _expected_launcher_shutdown else False


# Sentinel string naming a startup-wait outcome in which a storage restart was
# requested. NOTE(review): the code that returns/compares this value is outside
# the visible part of the file — confirm its exact use there.
STARTUP_WAIT_RESULT_STORAGE_RESTART = "storage_restart_requested"


def _is_pending_storage_restart_request() -> bool:
    """Report whether the persisted root state asks for a storage restart.

    That is the case when the root mode is ``ROOT_MODE_MAINTENANCE_READONLY``
    and ``last_migration_result`` starts with ``restart_pending:`` or
    ``restart_rebind:``. Any problem while reading the state is logged as a
    warning and treated as "no restart pending".
    """
    restart_markers = ("restart_pending:", "restart_rebind:")
    try:
        manager = get_config_manager(APP_NAME, migrate=False)
        loader = getattr(manager, "load_root_state", None)
        state = loader() if callable(loader) else None
        if not isinstance(state, dict):
            return False

        mode = str(state.get("mode") or "").strip()
        outcome = str(state.get("last_migration_result") or "").strip()
        return mode == ROOT_MODE_MAINTENANCE_READONLY and outcome.startswith(restart_markers)
    except Exception as exc:
        print(f"[Launcher] Warning: failed to inspect storage restart intent: {exc}", flush=True)
        return False


def _maybe_schedule_storage_restart() -> bool:
    """Relaunch the launcher when storage migration or rebinding requires it.

    The root state is captured *before* the storage layout is resolved,
    because resolving may run a pending migration. A restart is scheduled when

    * a migration was attempted (reason ``"migration"``), or
    * the earlier root state is maintenance-readonly with a ``restart_rebind:``
      result, or the resolved selected root equals the last migration source
      while differing from the previously recorded current root (reason
      ``"rebind_only"``).

    When a restart is scheduled, a ``storage_migration_restart`` frontend
    event is emitted, the startup lock is released and a new launcher process
    is spawned.

    Returns:
        ``True`` when a relaunch was started, ``False`` otherwise.
    """
    prior_state: dict[str, object] = {}
    try:
        manager = get_config_manager(APP_NAME, migrate=False)
        state_loader = getattr(manager, "load_root_state", None)
        if callable(state_loader):
            snapshot = state_loader()
            if isinstance(snapshot, dict):
                prior_state = snapshot
    except Exception as exc:
        print(f"[Launcher] Warning: failed to inspect root_state before restart scheduling: {exc}", flush=True)

    storage_bootstrap = _resolve_storage_layout_for_launch()
    migration_result = storage_bootstrap.get("migration_result") or {}

    def _prior(key: str) -> str:
        # Normalised string view of one field of the pre-restart root state.
        return str(prior_state.get(key) or "").strip()

    restart_reason = ""
    if bool(migration_result.get("attempted")):
        restart_reason = "migration"
    else:
        root_mode = _prior("mode")
        last_result = _prior("last_migration_result")
        migration_source = _prior("last_migration_source")
        previous_root = _prior("current_root")
        raw_layout = storage_bootstrap.get("layout")
        layout = raw_layout if isinstance(raw_layout, dict) else {}
        selected_root = str(layout.get("selected_root") or "").strip()

        flagged_for_rebind = (
            root_mode == ROOT_MODE_MAINTENANCE_READONLY
            and last_result.startswith("restart_rebind:")
        )
        # ``or`` short-circuits, so the path comparisons only run when the
        # explicit rebind flag is absent.
        if flagged_for_rebind or (
            selected_root
            and previous_root
            and migration_source
            and paths_equal(migration_source, selected_root)
            and not paths_equal(previous_root, selected_root)
        ):
            restart_reason = "rebind_only"

    if not restart_reason:
        return False

    event_payload = {
        "completed": bool(migration_result.get("completed")) or restart_reason == "rebind_only",
        "error_code": str(migration_result.get("error_code") or ""),
        "error_message": str(migration_result.get("error_message") or ""),
        "layout": storage_bootstrap.get("layout") or {},
        "restart_reason": restart_reason,
    }
    emit_frontend_event("storage_migration_restart", event_payload)
    release_startup_lock()
    _spawn_restarted_launcher()
    return True


def _persist_post_startup_root_state(config_manager) -> None:
    """Mark the storage root as normal after a successful startup, if allowed.

    When ``should_write_root_mode_normal_after_startup`` rejects the current
    root state, that state is kept and only a log line is printed. Otherwise
    the root mode becomes ``ROOT_MODE_NORMAL``, with the manager's
    ``app_docs_dir`` recorded as both the current and the last-known-good root
    and the current UTC time (``Z`` suffix) as the last successful boot.

    Args:
        config_manager: Object providing ``load_root_state()`` and
            ``app_docs_dir``.
    """
    state = config_manager.load_root_state()
    if not should_write_root_mode_normal_after_startup(state):
        print(
            "[Launcher] Preserving non-normal root_state after startup: "
            f"{state.get('mode') or ROOT_MODE_NORMAL}",
            flush=True,
        )
        return

    docs_root = str(config_manager.app_docs_dir)
    booted_at = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
    set_root_mode(
        config_manager,
        ROOT_MODE_NORMAL,
        current_root=docs_root,
        last_known_good_root=docs_root,
        last_successful_boot_at=booted_at,
    )


def report_startup_failure(message: str, show_dialog: bool = True):
    """Report a startup failure on the console, to the frontend, and via dialog.

    While an expected launcher shutdown is in progress, messages that start
    (case-insensitively) with one of the known startup-failure prefixes are
    only logged as suppressed. The error dialog is shown only in frozen builds
    and only when *show_dialog* is true.

    Args:
        message: Human-readable failure description.
        show_dialog: Whether a dialog may be shown in addition to the output.
    """
    suppressible_prefixes = ("start failed", "startup failed", "startup timeout", "startup aborted")
    lowered = str(message or "").strip().lower()
    if _is_expected_launcher_shutdown() and lowered.startswith(suppressible_prefixes):
        print(f"[Launcher] Suppressed startup failure during expected shutdown: {message}", flush=True)
        return

    print(message, flush=True)
    emit_frontend_event("startup_failure", {"message": message})
    if IS_FROZEN and show_dialog:
        _show_error_dialog(message)


def _get_last_error() -> int:
    """Return the result of Win32 ``GetLastError()``, or ``0`` off Windows.

    NOTE(review): the value is read through a separate ctypes call, so it may
    not reflect the API call the caller is interested in — confirm whether
    ``use_last_error=True`` with ``ctypes.get_last_error()`` is preferable.
    """
    return ctypes.windll.kernel32.GetLastError() if sys.platform == 'win32' else 0


def _detach_child_process_session() -> None:
    """Move the calling process into its own session on POSIX systems.

    Launcher-managed child servers use this to stay out of the launcher's
    Ctrl+C process group. Otherwise a terminal SIGINT on macOS/Linux reaches
    the launcher and every child at once, letting ``memory_server`` exit
    before ``main_server`` has finished its shutdown release/cleanup sequence
    and defeating the cloudsave cleanup order. A failure is reported as a
    warning only; on non-POSIX systems nothing happens.
    """
    if os.name == "posix":
        try:
            os.setsid()
        except Exception as e:
            print(f"[Launcher] Warning: failed to detach child process session: {e}", flush=True)


def _iter_servers_for_shutdown():
    """Return the ``SERVERS`` entries as a new list in shutdown order.

    Entries are ranked by the position of their ``module`` in
    ``SHUTDOWN_MODULE_ORDER``; modules not listed there come last. Ties are
    broken by the entry's ``name``.
    """
    rank_by_module = {name: position for position, name in enumerate(SHUTDOWN_MODULE_ORDER)}
    unranked = len(rank_by_module)

    def _shutdown_key(entry):
        return rank_by_module.get(entry.get("module", ""), unranked), entry.get("name", "")

    return sorted(SERVERS, key=_shutdown_key)


def setup_job_object():
    """Create a Windows Job Object and put the current process into it.

    The job is configured with JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE. The intent
    stated by the original author is that when the main process is killed the
    OS terminates all child processes automatically, so no orphans are left
    hanging.

    Returns:
        The job handle on success (also stored in the module-level
        ``JOB_HANDLE``). ``None`` on non-Windows platforms or when any step
        fails; failures are printed as warnings and never raised.
    """
    global JOB_HANDLE
    if sys.platform != 'win32':
        return None

    try:
        kernel32 = ctypes.windll.kernel32

        # Job Object constants
        JOB_OBJECT_EXTENDED_LIMIT_INFORMATION = 9
        JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE = 0x2000

        # Check first whether the current process already belongs to a Job
        # (common when launched through Steam). The result is only used to
        # choose the warning text if the assignment below fails.
        is_in_job = ctypes.c_int(0)
        current_process = kernel32.GetCurrentProcess()
        if not kernel32.IsProcessInJob(current_process, None, ctypes.byref(is_in_job)):
            print(f"[Launcher] Warning: IsProcessInJob failed (err={_get_last_error()})", flush=True)
            is_in_job.value = 0

        # Create the Job Object
        job = kernel32.CreateJobObjectW(None, None)
        if not job:
            print(f"[Launcher] Warning: Failed to create Job Object (err={_get_last_error()})", flush=True)
            return None

        # Configure the Job Object through a JOBOBJECT_EXTENDED_LIMIT_INFORMATION
        # structure; only BasicLimitInformation.LimitFlags is set, every other
        # field keeps its zero default.
        class JOBOBJECT_BASIC_LIMIT_INFORMATION(ctypes.Structure):
            _fields_ = [
                ('PerProcessUserTimeLimit', ctypes.c_int64),
                ('PerJobUserTimeLimit', ctypes.c_int64),
                ('LimitFlags', ctypes.c_uint32),
                ('MinimumWorkingSetSize', ctypes.c_size_t),
                ('MaximumWorkingSetSize', ctypes.c_size_t),
                ('ActiveProcessLimit', ctypes.c_uint32),
                ('Affinity', ctypes.c_size_t),
                ('PriorityClass', ctypes.c_uint32),
                ('SchedulingClass', ctypes.c_uint32),
            ]

        class IO_COUNTERS(ctypes.Structure):
            _fields_ = [
                ('ReadOperationCount', ctypes.c_uint64),
                ('WriteOperationCount', ctypes.c_uint64),
                ('OtherOperationCount', ctypes.c_uint64),
                ('ReadTransferCount', ctypes.c_uint64),
                ('WriteTransferCount', ctypes.c_uint64),
                ('OtherTransferCount', ctypes.c_uint64),
            ]

        class JOBOBJECT_EXTENDED_LIMIT_INFORMATION(ctypes.Structure):
            _fields_ = [
                ('BasicLimitInformation', JOBOBJECT_BASIC_LIMIT_INFORMATION),
                ('IoInfo', IO_COUNTERS),
                ('ProcessMemoryLimit', ctypes.c_size_t),
                ('JobMemoryLimit', ctypes.c_size_t),
                ('PeakProcessMemoryUsed', ctypes.c_size_t),
                ('PeakJobMemoryUsed', ctypes.c_size_t),
            ]

        # NOTE: the class statement above rebinds JOB_OBJECT_EXTENDED_LIMIT_INFORMATION?
        # No - the class is named JOBOBJECT_... (no underscore after JOB), so the
        # integer constant JOB_OBJECT_EXTENDED_LIMIT_INFORMATION (9) stays intact.
        info = JOBOBJECT_EXTENDED_LIMIT_INFORMATION()
        info.BasicLimitInformation.LimitFlags = JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE

        result = kernel32.SetInformationJobObject(
            job,
            JOB_OBJECT_EXTENDED_LIMIT_INFORMATION,
            ctypes.byref(info),
            ctypes.sizeof(info)
        )
        if not result:
            print(f"[Launcher] Warning: Failed to set Job Object info (err={_get_last_error()})", flush=True)
            kernel32.CloseHandle(job)
            return None

        # Add the current process to the Job Object
        result = kernel32.AssignProcessToJobObject(job, current_process)
        if not result:
            err = _get_last_error()
            if is_in_job.value:
                print(
                    f"[Launcher] Warning: Process is already inside another Job; "
                    f"nested Job assignment failed (err={err}). "
                    "Will rely on explicit process-tree cleanup fallback.",
                    flush=True
                )
            else:
                print(f"[Launcher] Warning: Failed to assign process to Job Object (err={err})", flush=True)
            kernel32.CloseHandle(job)
            return None

        # Keep the handle valid for the whole process lifetime (module-level
        # reference). Per the original author, the handle is closed when the
        # process exits, which triggers KILL_ON_JOB_CLOSE.
        JOB_HANDLE = job
        print("[Launcher] Job Object created - child processes will auto-terminate on exit", flush=True)
        return job

    except Exception as e:
        print(f"[Launcher] Warning: Job Object setup failed: {e}", flush=True)
        return None

# Server table (ordered from lightest to heaviest memory footprint, used for
# staged startup to lower peak memory). Each entry starts with its runtime
# slots ('process', 'ready_event', 'shutdown_complete_event') set to None.
# NOTE(review): 'graceful_shutdown_timeout' is presumably in seconds — confirm
# against the shutdown code, which is outside the visible part of the file.
SERVERS = [
    {
        'name': 'Memory Server',
        'module': 'memory_server',
        'port': MEMORY_SERVER_PORT,
        'process': None,
        'ready_event': None,
        'shutdown_complete_event': None,
        'graceful_shutdown_timeout': 12,
    },
    {
        'name': 'Main Server',
        'module': 'main_server',
        'port': MAIN_SERVER_PORT,
        'process': None,
        'ready_event': None,
        'shutdown_complete_event': None,
        'graceful_shutdown_timeout': 20,
    },
    {
        'name': 'Agent Server',
        'module': 'agent_server',
        'port': TOOL_SERVER_PORT,
        'process': None,
        'ready_event': None,
        'shutdown_complete_event': None,
        'graceful_shutdown_timeout': 8,
    },
]

# 不再启动主程序，用户自己启动 lanlan_frd.exe


# ===== 合并进程模式 =====
# 打包时三个 FastAPI 服务跑在同一个进程里，共享 Python 运行时，
# 省掉 2 份 CPython + uvicorn + 共享库的重复加载（约 150-200 MB）。
# 每个服务仍然监听自己的端口，前端 / 服务间 HTTP 调用零改动。

def run_merged_servers() -> int:
    """Run the three FastAPI apps in one process on a shared asyncio event loop.

    One ``uvicorn.Server`` is created per app (memory, agent, main), each bound
    to 127.0.0.1 on its own port. A single process-wide SIGINT/SIGTERM handler
    asks all servers to exit; a second signal, or a watchdog timeout, forces
    the process to terminate via ``os._exit(1)``.

    Returns:
        0 once the event loop has finished (including after a
        ``KeyboardInterrupt``). Other exceptions raised by a server task
        propagate to the caller.
    """
    import asyncio
    import contextlib
    import uvicorn

    # Readiness polling: 120 attempts x 0.25 s = 30 s upper bound.
    ready_poll_attempts = 120
    ready_poll_interval = 0.25

    _reload_runtime_config_from_env()

    # Common setup for frozen (packaged) builds.
    if IS_FROZEN:
        if hasattr(sys, '_MEIPASS'):
            os.chdir(sys._MEIPASS)
        else:
            os.chdir(os.path.dirname(os.path.abspath(__file__)))
        try:
            import typeguard

            def _dummy(func=None, **kw):
                # Usable both as ``@typechecked`` and ``@typechecked(...)``.
                return func if func else (lambda f: f)

            typeguard.typechecked = _dummy
            if hasattr(typeguard, '_decorators'):
                typeguard._decorators.typechecked = _dummy
        except Exception:
            # Deliberate best effort: failing to patch typeguard must not
            # prevent the servers from starting.
            pass

    _behind_proxy = os.environ.get("NEKO_BEHIND_PROXY", "").strip().lower() in ("1", "true", "yes")
    _proxy_kw: dict = {}
    if _behind_proxy:
        _proxy_kw = {"proxy_headers": True, "forwarded_allow_ips": "*"}

    # Import step by step (limits peak memory and gives progress feedback).
    print("[Merged] Importing memory_server...", flush=True)
    from app import memory_server
    print("[Merged] Importing agent_server...", flush=True)
    from app import agent_server
    print("[Merged] Importing main_server...", flush=True)
    from app import main_server

    _apps = [
        (memory_server.app, MEMORY_SERVER_PORT, "Memory"),
        (agent_server.app,  TOOL_SERVER_PORT,   "Agent"),
        (main_server.app,   MAIN_SERVER_PORT,   "Main"),
    ]

    servers: list[uvicorn.Server] = []
    for _app, _port, _name in _apps:
        cfg = uvicorn.Config(
            app=_app, host="127.0.0.1", port=_port,
            log_level="error", **_proxy_kw,
        )
        servers.append(uvicorn.Server(cfg))

    # -- Signal handling --
    # If every uvicorn.Server installed its own handlers they would overwrite
    # each other (last one wins), so Ctrl+C would only notify one server and
    # the others would hang. Disable the per-server handlers and install one
    # global handler instead.
    for s in servers:
        # Hook used by older uvicorn releases.
        s.install_signal_handlers = lambda: None
        # Newer uvicorn releases install handlers through the
        # ``capture_signals()`` context manager inside ``serve()``; replace it
        # with a no-op context manager so the global handler stays in place.
        if hasattr(s, "capture_signals"):
            s.capture_signals = contextlib.nullcontext

    _exiting = False
    _shutdown_watchdog_started = False

    def _force_exit_after(timeout: float) -> None:
        # Watchdog body: hard-kill the process if graceful shutdown stalls.
        time.sleep(timeout)
        os._exit(1)

    def _begin_merged_shutdown(*, reason: str = "signal") -> bool:
        """Ask every server to exit; return False if shutdown already began."""
        nonlocal _exiting, _shutdown_watchdog_started
        if _exiting:
            return False
        _exiting = True
        _mark_expected_launcher_shutdown()
        watchdog_timeout = 30 if reason == "storage_location_restart" else 10
        print(
            f"\n[Merged] Shutting down... (reason={reason}, watchdog={watchdog_timeout}s)",
            flush=True,
        )
        for s in servers:
            s.should_exit = True
        if not _shutdown_watchdog_started:
            threading.Thread(
                target=_force_exit_after,
                args=(watchdog_timeout,),
                daemon=True,
                name="merged-shutdown-watchdog",
            ).start()
            _shutdown_watchdog_started = True
        return True

    def _on_exit_signal(_sig, _frame):
        if _exiting:
            # Second Ctrl+C -> force exit (same behaviour as multi-process mode).
            print("\n[Merged] Force exit!", flush=True)
            os._exit(1)
        _begin_merged_shutdown(reason=f"signal:{_sig}")

    def _restore_signal_handler(signum, previous) -> None:
        # signal.getsignal() returns None when the previous handler was not
        # installed from Python; signal.signal() rejects None with TypeError.
        if previous is not None:
            signal.signal(signum, previous)

    try:
        main_server.set_start_config(
            {
                "browser_mode_enabled": False,
                "browser_page": "",
                "shutdown_memory_server_on_exit": False,
                "request_runtime_shutdown": lambda: _begin_merged_shutdown(
                    reason="storage_location_restart"
                ),
                "server": None,
            }
        )
    except Exception as exc:
        print(f"[Merged] Warning: failed to install merged shutdown bridge: {exc}", flush=True)

    _prev_sigint = signal.getsignal(signal.SIGINT)
    _prev_sigterm = signal.getsignal(signal.SIGTERM)
    signal.signal(signal.SIGINT, _on_exit_signal)
    signal.signal(signal.SIGTERM, _on_exit_signal)

    async def _serve_all() -> None:
        # Start all uvicorn servers concurrently.
        tasks = [asyncio.create_task(s.serve()) for s in servers]

        # Wait until every port is reachable before notifying the frontend.
        # Stop polling early when shutdown was requested or a server task has
        # already ended: readiness can no longer be reached in that case.
        all_ready = False
        startup_aborted = False
        for _ in range(ready_poll_attempts):
            if _exiting or any(t.done() for t in tasks):
                startup_aborted = True
                break
            if all(check_port(p) for _, p, _ in _apps):
                all_ready = True
                break
            await asyncio.sleep(ready_poll_interval)

        if startup_aborted:
            print(
                "[Merged] Warning: startup interrupted before all servers became ready; "
                "skipping startup_ready notification",
                flush=True,
            )
        else:
            if all_ready:
                print(f"[Merged] All servers ready "
                      f"(ports {MEMORY_SERVER_PORT}/{TOOL_SERVER_PORT}/{MAIN_SERVER_PORT})",
                      flush=True)
            else:
                # Polling timed out. The frontend is still notified below (as
                # before), but the log no longer claims the ports are ready.
                print(
                    f"[Merged] Warning: not all server ports became reachable within "
                    f"{ready_poll_attempts * ready_poll_interval:g}s "
                    f"(ports {MEMORY_SERVER_PORT}/{TOOL_SERVER_PORT}/{MAIN_SERVER_PORT})",
                    flush=True,
                )
            try:
                _config_manager = get_config_manager(APP_NAME)
                _persist_post_startup_root_state(_config_manager)
            except Exception as e:
                print(f"[Merged] Warning: failed to persist root_state boot success: {e}", flush=True)
            emit_frontend_event("startup_ready", {
                "instance_id": INSTANCE_ID,
                "selected": {
                    "MAIN_SERVER_PORT": MAIN_SERVER_PORT,
                    "MEMORY_SERVER_PORT": MEMORY_SERVER_PORT,
                    "TOOL_SERVER_PORT": TOOL_SERVER_PORT,
                },
            })

        # Wait for every server to exit (after should_exit is set, each one
        # runs its own FastAPI shutdown events).
        await asyncio.gather(*tasks)

    try:
        asyncio.run(_serve_all())
    except KeyboardInterrupt:
        # Fallback path in case the custom signal handler did not intercept
        # the interrupt (should not normally be reached).
        if not _exiting:
            for s in servers:
                s.should_exit = True
    finally:
        _restore_signal_handler(signal.SIGINT, _prev_sigint)
        _restore_signal_handler(signal.SIGTERM, _prev_sigterm)

    return 0