Skip to content

Commit 94cbf47

Browse files
wehosHongzhi Wenclaude
authored
fix(docker): embedding 模型下载加镜像源 fallback,根治 HF 429 (Project-N-E-K-O#1640)
Project-N-E-K-O#1630 的 30s 退避扛不住 huggingface.co 按小时的 per-IP 限流,真正挡住 429 的是 GHA layer cache;cache 被驱逐后退回直连 HF,于是间歇性 429(full 比 standard 更频繁——多下 849MB fp32,下载层更大更易被驱逐)。

prepare_embedding_model.py 加镜像源 fallback:
- 抽出 _download_one(单 URL 退避重试,逻辑不变;urlopen 改带稳定 UA 的 Request,防 CDN 拒默认 Python-urllib UA)。
- _download 外包一层,对每个文件按 huggingface.co -> hf-mirror.com 顺序尝试,HF 持续 429 / 404 / 不可达就切下一镜像,每源各跑完整退避重试,全源失败才抛。

顺序可由 HF_ENDPOINTS / HF_ENDPOINT 覆盖,默认内置,两个 Dockerfile + 两个 desktop workflow 共享脚本,无需改动。

新增 tests/unit/test_prepare_embedding_model.py 覆盖 fallback / env 覆盖 / 404 穿透 / 全源失败 / 首源命中不打镜像 / UA。

Co-authored-by: Hongzhi Wen <cartabio.coder1@gmail.com>
Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent 8e77535 commit 94cbf47

2 files changed

Lines changed: 259 additions & 13 deletions

File tree

scripts/prepare_embedding_model.py

Lines changed: 93 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -25,15 +25,32 @@
2525
DEFAULT_OUTPUT_ROOT = Path("data") / "embedding_models"
2626
PREPARED_MARKER = ".prepared.json"
2727

28-
# Download resilience. huggingface.co rate-limits by source IP, and the Docker
29-
# build runs several arch/variant jobs from a shared proxy egress, so a single
30-
# urlopen routinely hit HTTP 429 and killed the whole build with no recovery.
31-
# Retry transient failures (429 / 5xx / connection errors) with exponential
32-
# backoff, honoring a numeric Retry-After when the server sends one.
28+
# Download resilience. huggingface.co rate-limits anonymous requests per source
29+
# IP by the hour, and CI runs from shared runner / proxy egress IPs that are
30+
# chronically throttled, so a direct fetch routinely hit HTTP 429 and killed the
31+
# whole build with no recovery. Two defenses stack here:
32+
# 1. Mirror fallback: each file is tried against every endpoint in order
33+
# (huggingface.co first, then the hf-mirror.com reverse proxy). A source
34+
# that 429s or is unreachable falls through to the next instead of failing
35+
# the build. Override the list/order via HF_ENDPOINTS (comma-separated) or a
36+
# single HF_ENDPOINT (the huggingface_hub convention).
37+
# 2. Per-endpoint backoff: within one endpoint, retry transient failures
38+
# (429 / 5xx / connection errors) with exponential backoff, honoring a
39+
# numeric Retry-After when the server sends one.
40+
# A bounded ~30s backoff alone can't outwait an hourly per-IP limit; the mirror
41+
# fallback is what actually breaks the deadlock when the runner IP is throttled.
3342
_RETRYABLE_STATUS = frozenset({429, 500, 502, 503, 504})
3443
_MAX_ATTEMPTS = 5
3544
_BACKOFF_BASE_SECONDS = 2.0
3645
_BACKOFF_CAP_SECONDS = 60.0
46+
# Mirror endpoints tried in order. hf-mirror.com mirrors the full
47+
# /{repo}/resolve/{revision}/{file} layout and is not under the same per-IP
48+
# throttle, so it recovers builds when huggingface.co rate-limits the shared CI
49+
# egress. Kept as a plain default so neither Dockerfile nor the desktop
50+
# workflows need to pass anything.
51+
DEFAULT_ENDPOINTS = ("https://huggingface.co", "https://hf-mirror.com")
52+
# Some CDNs reject the default "Python-urllib/x.y" agent; send a stable one.
53+
_USER_AGENT = "neko-embedding-prepare/1.0 (+https://github.com/Project-N-E-K-O/N.E.K.O)"
3754
# 40-char lowercase hex git SHA. Tags / branch refs / short SHAs are rejected
3855
# so the profile id stays a strict compatibility contract — anything that can
3956
# move under our feet, even tags (which can be force-pushed), is excluded.
@@ -85,18 +102,36 @@ def _backoff_seconds(attempt: int) -> float:
85102
return min(_BACKOFF_BASE_SECONDS * (2 ** (attempt - 1)), _BACKOFF_CAP_SECONDS)
86103

87104

88-
def _download(url: str, dest: Path, *, force: bool) -> None:
89-
if dest.exists() and dest.stat().st_size > 0 and not force:
90-
print(f"[embedding-model] keep existing {dest}")
91-
return
105+
def _endpoints() -> list[str]:
    """Return the ordered HF-compatible base URLs to try for each file.

    Resolution order:

    1. ``HF_ENDPOINTS`` (comma-separated) fully replaces the default order.
    2. ``HF_ENDPOINT`` (the huggingface_hub convention) is consulted next and
       pins to that one mirror.
    3. Otherwise the built-in ``DEFAULT_ENDPOINTS`` order is used.

    Every entry has surrounding whitespace and trailing slashes removed, and
    entries that end up empty are dropped. A variable that is set but yields
    no usable entry (for example ``" , "``) is treated as unset, so resolution
    continues with the next source instead of jumping straight to the
    defaults.

    Returns:
        A fresh, non-empty list of base URLs without trailing slashes.
    """
    for name in ("HF_ENDPOINTS", "HF_ENDPOINT"):
        raw = os.environ.get(name, "")
        cleaned = (item.strip().rstrip("/") for item in raw.split(","))
        # Filter after cleaning so an entry such as "/" cannot survive as "".
        eps = [item for item in cleaned if item]
        if eps:
            return eps
    return list(DEFAULT_ENDPOINTS)
92119

120+
121+
def _download_one(url: str, dest: Path) -> None:
122+
"""Fetch one URL into ``dest`` with bounded exponential-backoff retry.
123+
124+
Raises ``RuntimeError`` when retries are exhausted or the server returns a
125+
non-retryable status (e.g. 404), so the caller can fall back to the next
126+
mirror.
127+
"""
93128
dest.parent.mkdir(parents=True, exist_ok=True)
94129
tmp = dest.with_suffix(dest.suffix + ".tmp")
95-
print(f"[embedding-model] download {url}")
130+
request = urllib.request.Request(url, headers={"User-Agent": _USER_AGENT})
96131

97132
for attempt in range(1, _MAX_ATTEMPTS + 1):
98133
try:
99-
with urllib.request.urlopen(url, timeout=120) as response:
134+
with urllib.request.urlopen(request, timeout=120) as response:
100135
with tmp.open("wb") as f:
101136
while True:
102137
chunk = response.read(1024 * 1024)
@@ -145,6 +180,44 @@ def _download(url: str, dest: Path, *, force: bool) -> None:
145180
time.sleep(delay)
146181

147182

183+
def _download(
    rel: str,
    dest: Path,
    *,
    repo: str,
    revision: str,
    endpoints: list[str],
    force: bool,
) -> None:
    """Download one repo file into ``dest``, trying each mirror in order.

    The URL for each source is ``{base}/{repo}/resolve/{revision}/{rel}``.
    Every endpoint is handed to ``_download_one``; a source whose attempt
    raises ``RuntimeError`` falls through to the next mirror.

    Args:
        rel: Path of the file inside the repository.
        dest: Local destination path.
        repo: Repository identifier used in the URL.
        revision: Revision used in the URL.
        endpoints: Base URLs to try, in order.
        force: When true, re-download even if ``dest`` already has content.

    Raises:
        RuntimeError: If ``endpoints`` is empty, or if every endpoint failed.
            In the latter case the message joins each source's failure text
            and the last underlying error is chained as ``__cause__``.
    """
    # A non-empty file from an earlier run is reused unless a refresh is forced.
    if dest.exists() and dest.stat().st_size > 0 and not force:
        print(f"[embedding-model] keep existing {dest}")
        return

    if not endpoints:
        raise RuntimeError(f"failed to download {rel}: no endpoints configured")

    total = len(endpoints)
    failures: list[str] = []
    last_error = None
    for index, base in enumerate(endpoints, 1):
        url = f"{base}/{repo}/resolve/{revision}/{rel}"
        suffix = f" (source {index}/{total})" if total > 1 else ""
        print(f"[embedding-model] download {url}{suffix}")
        try:
            _download_one(url, dest)
        except RuntimeError as exc:
            failures.append(str(exc))
            last_error = exc
            if index < total:
                print(f"[embedding-model] source {base} failed; falling back to next mirror")
        else:
            return

    # Chain the last failure so the traceback keeps the underlying cause.
    raise RuntimeError(
        f"failed to download {rel} from all {total} source(s): "
        + " | ".join(failures)
    ) from last_error
219+
220+
148221
def _verify(profile_dir: Path, files: list[str]) -> None:
149222
missing = [
150223
str(profile_dir / rel)
@@ -234,9 +307,16 @@ def main(argv: list[str] | None = None) -> int:
234307
f"forcing re-download for {args.repo}@{args.revision}",
235308
)
236309

310+
endpoints = _endpoints()
237311
for rel in files:
238-
url = f"https://huggingface.co/{args.repo}/resolve/{args.revision}/{rel}"
239-
_download(url, profile_dir / rel, force=args.force or revision_changed)
312+
_download(
313+
rel,
314+
profile_dir / rel,
315+
repo=args.repo,
316+
revision=args.revision,
317+
endpoints=endpoints,
318+
force=args.force or revision_changed,
319+
)
240320
_verify(profile_dir, files)
241321
_write_marker(profile_dir, args.repo, args.revision)
242322
print(f"[embedding-model] profile ready: {profile_dir}")
Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,166 @@
1+
# -*- coding: utf-8 -*-
2+
"""Unit tests for scripts/prepare_embedding_model.py — the build-time embedding
3+
model downloader.
4+
5+
Focus is the mirror-fallback layer added to survive huggingface.co's per-IP
6+
HTTP 429 throttling of shared CI egress IPs: each file is tried against every
7+
endpoint in order (huggingface.co -> hf-mirror.com by default), each endpoint
8+
getting its own bounded backoff retry, and only an all-source failure raises.
9+
10+
Pure stdlib + mock: no network, no numpy/onnxruntime, so this runs on every
11+
workstation regardless of the embedding bundle state.
12+
"""
13+
from __future__ import annotations
14+
15+
import importlib.util
16+
import io
17+
import urllib.error
18+
import urllib.request
19+
from pathlib import Path
20+
21+
import pytest
22+
23+
24+
_SCRIPT = Path(__file__).resolve().parents[2] / "scripts" / "prepare_embedding_model.py"
25+
26+
27+
@pytest.fixture
def prep(monkeypatch):
    """Load the script as a module and neutralize real backoff sleeps.

    The script is imported from ``_SCRIPT`` by file path. ``time.sleep`` as
    seen through the loaded module is replaced with a no-op, and
    ``HF_ENDPOINTS`` / ``HF_ENDPOINT`` are removed from the environment so
    each test opts into them explicitly.
    """
    spec = importlib.util.spec_from_file_location("prepare_embedding_model", _SCRIPT)
    # Fail with a clear message rather than an AttributeError on None.
    assert spec is not None and spec.loader is not None, f"cannot load {_SCRIPT}"
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    monkeypatch.setattr(module.time, "sleep", lambda *_: None)
    # Default env: tests opt into HF_ENDPOINT(S) explicitly where relevant.
    monkeypatch.delenv("HF_ENDPOINTS", raising=False)
    monkeypatch.delenv("HF_ENDPOINT", raising=False)
    return module
38+
39+
40+
class _FakeResp(io.BytesIO):
    """In-memory stand-in for a urlopen response, usable in a ``with`` block."""

    # Status reported by the fake response.
    status = 200

    def __enter__(self):
        """Return the response itself as the context value."""
        return self

    def __exit__(self, *exc):
        """Close the buffer on exit; exceptions are not suppressed."""
        self.close()
48+
49+
50+
def _http_error(url, code):
    """Build an ``HTTPError`` for ``url`` with status ``code``.

    The message is ``"err"``, the headers are an empty dict and no body file
    object is attached.
    """
    return urllib.error.HTTPError(url, code, "err", {}, None)
52+
53+
54+
def _install_urlopen(monkeypatch, prep, behavior, *, record=None):
    """Patch the module's urlopen with a callable mapping url -> bytes | Exception.

    ``behavior`` is called with the requested URL. If it returns an exception
    instance, that exception is raised; otherwise the returned bytes are
    wrapped in a ``_FakeResp``. When ``record`` is given, every requested URL
    is appended to it before ``behavior`` runs.
    """

    def fake_urlopen(request, timeout=120):
        # Accept either a Request object or a plain URL string.
        url = getattr(request, "full_url", request)
        if record is not None:
            record.append(url)
        result = behavior(url)
        if isinstance(result, Exception):
            raise result
        return _FakeResp(result)

    monkeypatch.setattr(prep.urllib.request, "urlopen", fake_urlopen)
67+
68+
69+
# --- endpoint resolution ---------------------------------------------------
70+
71+
def test_endpoints_default(prep):
    """With no env override, the built-in two-source order is returned."""
    assert prep._endpoints() == ["https://huggingface.co", "https://hf-mirror.com"]
73+
74+
75+
def test_hf_endpoint_pins_single_mirror(prep, monkeypatch):
    """A lone ``HF_ENDPOINT`` replaces the default list with that one URL."""
    monkeypatch.setenv("HF_ENDPOINT", "https://hf-mirror.com")
    assert prep._endpoints() == ["https://hf-mirror.com"]
78+
79+
80+
def test_hf_endpoints_list_takes_precedence_and_is_cleaned(prep, monkeypatch):
    """``HF_ENDPOINTS`` wins over ``HF_ENDPOINT`` and its entries are tidied.

    The value carries a trailing slash, padding spaces and empty items, none
    of which may appear in the result.
    """
    monkeypatch.setenv("HF_ENDPOINT", "https://ignored.example")
    monkeypatch.setenv("HF_ENDPOINTS", "https://a.example/ , https://b.example ,, ")
    assert prep._endpoints() == ["https://a.example", "https://b.example"]
84+
85+
86+
# --- download fallback -----------------------------------------------------
87+
88+
def test_falls_back_to_mirror_on_persistent_429(prep, monkeypatch, tmp_path):
    """When huggingface.co always answers 429, the mirror's payload is written."""

    def behavior(url):
        if "huggingface.co" in url:
            return _http_error(url, 429)
        return b"FROM-MIRROR"

    _install_urlopen(monkeypatch, prep, behavior)
    dest = tmp_path / "tokenizer.json"
    prep._download(
        "tokenizer.json",
        dest,
        repo="r/m",
        revision="abc",
        endpoints=["https://huggingface.co", "https://hf-mirror.com"],
        force=False,
    )
    assert dest.read_bytes() == b"FROM-MIRROR"
101+
102+
103+
def test_non_retryable_404_on_first_source_still_falls_through(prep, monkeypatch, tmp_path):
    """A 404 from the first source must not abort; the second source is used."""

    def behavior(url):
        if "huggingface.co" in url:
            return _http_error(url, 404)
        return b"OK2"

    _install_urlopen(monkeypatch, prep, behavior)
    dest = tmp_path / "x"
    prep._download(
        "x",
        dest,
        repo="r/m",
        revision="abc",
        endpoints=["https://huggingface.co", "https://hf-mirror.com"],
        force=False,
    )
    assert dest.read_bytes() == b"OK2"
116+
117+
118+
def test_all_sources_failing_raises_listing_every_source(prep, monkeypatch, tmp_path):
    """If every source answers 429, the error names the count and both hosts."""
    _install_urlopen(monkeypatch, prep, lambda url: _http_error(url, 429))
    with pytest.raises(RuntimeError) as excinfo:
        prep._download(
            "f.bin",
            tmp_path / "f.bin",
            repo="r/m",
            revision="abc",
            endpoints=["https://huggingface.co", "https://hf-mirror.com"],
            force=False,
        )
    message = str(excinfo.value)
    assert "all 2 source(s)" in message
    assert "huggingface.co" in message and "hf-mirror.com" in message
128+
129+
130+
def test_primary_success_does_not_contact_mirror(prep, monkeypatch, tmp_path):
    """A successful first source yields exactly one request, to huggingface.co."""
    seen: list[str] = []
    _install_urlopen(monkeypatch, prep, lambda url: b"PRIMARY", record=seen)
    dest = tmp_path / "x"
    prep._download(
        "x",
        dest,
        repo="r/m",
        revision="abc",
        endpoints=["https://huggingface.co", "https://hf-mirror.com"],
        force=False,
    )
    assert dest.read_bytes() == b"PRIMARY"
    assert len(seen) == 1 and "huggingface.co" in seen[0]
140+
141+
142+
def test_existing_nonempty_file_is_kept_without_any_request(prep, monkeypatch, tmp_path):
    """With ``force=False`` a non-empty destination is left untouched.

    The patched urlopen fails the test if it is ever called.
    """
    dest = tmp_path / "x"
    dest.write_bytes(b"already-here")

    def explode(url):
        raise AssertionError("should not hit the network when file already exists")

    _install_urlopen(monkeypatch, prep, explode)
    prep._download(
        "x",
        dest,
        repo="r/m",
        revision="abc",
        endpoints=["https://huggingface.co", "https://hf-mirror.com"],
        force=False,
    )
    assert dest.read_bytes() == b"already-here"
155+
156+
157+
def test_sends_stable_user_agent(prep, monkeypatch, tmp_path):
    """``_download_one`` must send the module's ``_USER_AGENT`` header."""
    captured: dict[str, str] = {}

    def fake_urlopen(request, timeout=120):
        # urllib stores header names capitalized, hence "User-agent".
        captured["ua"] = request.get_header("User-agent")
        return _FakeResp(b"data")

    monkeypatch.setattr(prep.urllib.request, "urlopen", fake_urlopen)
    prep._download_one("https://huggingface.co/r/m/resolve/abc/x", tmp_path / "x")
    assert captured["ua"] == prep._USER_AGENT

0 commit comments

Comments
 (0)