1717 description : " 抓取模式:auto/standard/skims(auto 按 fetch-days 阈值决定,standard/ skims 强制)"
1818 required : false
1919 default : " auto"
profile_tag:
  # Optional workflow input, read by the run script as
  # `github.event.inputs.profile_tag`.
  # The default must be a truly empty string: the script only appends
  # `--profile-tag` when the value is non-empty, so a whitespace-only default
  # would be forwarded as a tag instead of meaning "run everything".
  description: " 仅运行指定词条 tag;留空表示全量运行"
  required: false
  default: ""
2024
2125permissions :
2226 contents : write
@@ -31,10 +35,16 @@ jobs:
3135 timeout-minutes : 120
# Environment mapping declared next to `timeout-minutes` and `steps`, i.e. at job level.
# Lines carrying a `-` marker are the previous hard-coded model names; the
# matching `+` lines read the same settings from repository secrets instead.
# NOTE(review): BLT_API_BASE is fed from the same secret as BLT_PRIMARY_BASE_URL —
# presumably an alias kept for code that still reads the older name; confirm.
3236 env :
3337 BLT_API_KEY : ${{ secrets.BLT_API_KEY }}
34- BLT_REWRITE_MODEL : gemini-3-flash-preview
38+ BLT_REWRITE_MODEL : ${{ secrets.BLT_REWRITE_MODEL }}
3539 BLT_RERANK_MODEL : qwen3-reranker-4b
36- BLT_FILTER_MODEL : gemini-3-flash-preview-nothinking
40+ BLT_FILTER_MODEL : ${{ secrets.BLT_FILTER_MODEL }}
3741 DPR_EMBED_API_TIMEOUT : " 60"
42+ LLM_PRIMARY_BASE_URL : ${{ secrets.LLM_PRIMARY_BASE_URL }}
43+ BLT_PRIMARY_BASE_URL : ${{ secrets.BLT_PRIMARY_BASE_URL }}
44+ BLT_API_BASE : ${{ secrets.BLT_PRIMARY_BASE_URL }}
45+ SUMMARY_API_KEY : ${{ secrets.SUMMARY_API_KEY }}
46+ SUMMARY_BASE_URL : ${{ secrets.SUMMARY_BASE_URL }}
47+ SUMMARY_MODEL : ${{ secrets.SUMMARY_MODEL }}
3848 PYTHONUNBUFFERED : " 1"
3949
4050 steps :
@@ -55,13 +65,23 @@ jobs:
5565 with :
5666 python-version : " 3.11"
5767
# Install a Temurin JDK; the sbt build in the `Prepare pdffigures2` step runs on it.
- name: Setup Java
  uses: actions/setup-java@v5
  with:
    distribution: temurin
    # Exact version string: quoted so YAML keeps it a string, and with no
    # padding inside the quotes so the value is "17" rather than " 17".
    java-version: "17"
73+
# Persist the pip/uv/torch caches, the pdffigures2 tool directory and the
# coursier/ivy/sbt directories between workflow runs.
- name: Cache Python deps
  uses: actions/cache@v5
  with:
    # Keyed on the runner OS and the content hash of requirements.txt.
    key: ${{ runner.os }}-dpr-embed-deps-v1-${{ hashFiles('requirements.txt') }}
    path: |
      ~/.cache/pip
      ~/.cache/uv
      ~/.cache/torch
      ~/.cache/dpr-tools/pdffigures2
      ~/.cache/coursier
      ~/.ivy2/cache
      ~/.sbt
6686
6787 - name : Install deps (skip sqlite3)
7696 python -m pip install uv
7797 uv pip install --system -r /tmp/req.txt
7898
# Build the pdffigures2 jar once and keep only the jar in the cached tool
# directory; a run that already has the jar skips the build. The jar location
# is exported to later steps as PDFFIGURES2_JAR.
- name: Prepare pdffigures2
  run: |
    set -euo pipefail

    TOOL_DIR="$HOME/.cache/dpr-tools/pdffigures2"
    SRC_DIR="$TOOL_DIR/src"
    JAR_PATH="$TOOL_DIR/pdffigures2.jar"

    mkdir -p "$TOOL_DIR"
    # -s rather than -f: an empty file left behind by an interrupted run
    # counts as "missing" and triggers a rebuild.
    if [ ! -s "$JAR_PATH" ]; then
      rm -rf "$SRC_DIR"
      git clone --depth 1 https://github.com/allenai/pdffigures2 "$SRC_DIR"
      # Build inside a subshell so the step's working directory is untouched.
      (
        cd "$SRC_DIR"
        # Retry transient download failures instead of failing the whole job.
        curl -fsSL --retry 3 -o sbt.tgz https://github.com/sbt/sbt/releases/download/v1.10.1/sbt-1.10.1.tgz
        tar -xzf sbt.tgz
        ./sbt/bin/sbt assembly
      )
      # Publish atomically: copy under a temporary name, then rename, so a
      # partially written jar can never satisfy the check above.
      cp "$SRC_DIR/pdffigures2.jar" "$JAR_PATH.tmp"
      mv -f "$JAR_PATH.tmp" "$JAR_PATH"
      # Remove the checkout, the sbt distribution and the build output so they
      # are not stored in the Actions cache alongside the jar.
      rm -rf "$SRC_DIR"
    fi

    echo "PDFFIGURES2_JAR=$JAR_PATH" >> "$GITHUB_ENV"
119+
79120 - name : Run Retrieval and Embedding
80121 env :
81122 HF_HOME : ${{ github.workspace }}/hf_cache
@@ -138,17 +179,21 @@ jobs:
138179 RUN_ENRICH="${{ github.event.inputs.run_enrich }}"
139180 FETCH_DAYS="${{ github.event.inputs.fetch_days }}"
140181 FETCH_MODE="${{ github.event.inputs.fetch_mode }}"
141- ARGS=""
182+ PROFILE_TAG="${{ github.event.inputs.profile_tag }}"
183+ ARGS=()
142184 if [ "$RUN_ENRICH" = "true" ]; then
143- ARGS=" --run-enrich"
185+ ARGS+=( --run-enrich)
144186 fi
145187 if [ -n "$FETCH_DAYS" ]; then
146- ARGS="$ARGS --fetch-days $FETCH_DAYS"
188+ ARGS+=( --fetch-days " $FETCH_DAYS")
147189 fi
148190 if [ -n "$FETCH_MODE" ]; then
149- ARGS="$ARGS --fetch-mode $FETCH_MODE"
191+ ARGS+=(--fetch-mode "$FETCH_MODE")
192+ fi
193+ if [ -n "$PROFILE_TAG" ]; then
194+ ARGS+=(--profile-tag "$PROFILE_TAG")
150195 fi
151- python src/main.py $ ARGS --embedding-device cpu --embedding-batch-size 8
196+ python src/main.py "${ ARGS[@]}" --embedding-device cpu --embedding-batch-size 8
152197
153198 # 让前端可以在不配置 GitHub Token 的情况下直接读取配置(只读快照)
154199 cp -f config.yaml docs/config.yaml
0 commit comments