Skip to content

Commit a98bf44

Browse files
authored
Merge pull request #25 from 5-xj/upstream-pr-safe-all
[update] 更新了多源,现在暴露了bioRxiv,更新了图中插图
2 parents 5623798 + 2ab3c6b commit a98bf44

112 files changed

Lines changed: 12405 additions & 838 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/workflows/daily-paper-reader.yml

Lines changed: 52 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,10 @@ on:
1717
description: "抓取模式:auto/standard/skims(auto 按 fetch-days 阈值决定,standard/ skims 强制)"
1818
required: false
1919
default: "auto"
20+
profile_tag:
21+
description: "仅运行指定词条 tag;留空表示全量运行"
22+
required: false
23+
default: ""
2024

2125
permissions:
2226
contents: write
@@ -31,10 +35,16 @@ jobs:
3135
timeout-minutes: 120
3236
env:
3337
BLT_API_KEY: ${{ secrets.BLT_API_KEY }}
34-
BLT_REWRITE_MODEL: gemini-3-flash-preview
38+
BLT_REWRITE_MODEL: ${{ secrets.BLT_REWRITE_MODEL }}
3539
BLT_RERANK_MODEL: qwen3-reranker-4b
36-
BLT_FILTER_MODEL: gemini-3-flash-preview-nothinking
40+
BLT_FILTER_MODEL: ${{ secrets.BLT_FILTER_MODEL }}
3741
DPR_EMBED_API_TIMEOUT: "60"
42+
LLM_PRIMARY_BASE_URL: ${{ secrets.LLM_PRIMARY_BASE_URL }}
43+
BLT_PRIMARY_BASE_URL: ${{ secrets.BLT_PRIMARY_BASE_URL }}
44+
BLT_API_BASE: ${{ secrets.BLT_PRIMARY_BASE_URL }}
45+
SUMMARY_API_KEY: ${{ secrets.SUMMARY_API_KEY }}
46+
SUMMARY_BASE_URL: ${{ secrets.SUMMARY_BASE_URL }}
47+
SUMMARY_MODEL: ${{ secrets.SUMMARY_MODEL }}
3848
PYTHONUNBUFFERED: "1"
3949

4050
steps:
@@ -55,13 +65,23 @@ jobs:
5565
with:
5666
python-version: "3.11"
5767

68+
# Install a Temurin JDK 17; a later step in this workflow runs `sbt assembly`
# to build the pdffigures2 jar.
- name: Setup Java
  uses: actions/setup-java@v5
  with:
    java-version: '17'
    distribution: 'temurin'
73+
5874
# Restore/save the package download caches together with the pdffigures2 tool
# directory and the sbt/coursier/ivy caches used to build it.
- name: Cache Python deps
  uses: actions/cache@v5
  with:
    path: |
      ~/.cache/pip
      ~/.cache/uv
      ~/.cache/torch
      ~/.cache/dpr-tools/pdffigures2
      ~/.cache/coursier
      ~/.ivy2/cache
      ~/.sbt
    # Key bumped to v2 because the path list changed: on an exact key hit the
    # cache action does not save again, so entries stored under the v1 key
    # would never pick up the pdffigures2/sbt directories.
    key: ${{ runner.os }}-dpr-embed-deps-v2-${{ hashFiles('requirements.txt') }}
6686

6787
- name: Install deps (skip sqlite3)
@@ -76,6 +96,27 @@ jobs:
7696
python -m pip install uv
7797
uv pip install --system -r /tmp/req.txt
7898
99+
# Build pdffigures2.jar once into the cached tool directory and export its
# location to later steps as PDFFIGURES2_JAR.
- name: Prepare pdffigures2
  run: |
    set -euo pipefail

    TOOL_DIR="$HOME/.cache/dpr-tools/pdffigures2"
    SRC_DIR="$TOOL_DIR/src"
    JAR_PATH="$TOOL_DIR/pdffigures2.jar"

    mkdir -p "$TOOL_DIR"
    # Only build when the jar is not already present in the tool directory.
    if [ ! -f "$JAR_PATH" ]; then
      rm -rf "$SRC_DIR"
      git clone --depth 1 https://github.com/allenai/pdffigures2 "$SRC_DIR"
      # Build in a subshell so the directory change does not leak into the
      # rest of this script.
      (
        cd "$SRC_DIR"
        curl -fsSL -o sbt.tgz https://github.com/sbt/sbt/releases/download/v1.10.1/sbt-1.10.1.tgz
        tar -xzf sbt.tgz
        ./sbt/bin/sbt assembly
        cp pdffigures2.jar "$JAR_PATH"
      )
      # TOOL_DIR is a cached path; drop the checkout, the sbt distribution and
      # the build output so that only the jar is kept there.
      rm -rf "$SRC_DIR"
    fi

    echo "PDFFIGURES2_JAR=$JAR_PATH" >> "$GITHUB_ENV"
119+
79120
- name: Run Retrieval and Embedding
80121
env:
81122
HF_HOME: ${{ github.workspace }}/hf_cache
@@ -138,17 +179,21 @@ jobs:
138179
RUN_ENRICH="${{ github.event.inputs.run_enrich }}"
139180
FETCH_DAYS="${{ github.event.inputs.fetch_days }}"
140181
FETCH_MODE="${{ github.event.inputs.fetch_mode }}"
141-
ARGS=""
182+
PROFILE_TAG="${{ github.event.inputs.profile_tag }}"
183+
ARGS=()
142184
if [ "$RUN_ENRICH" = "true" ]; then
143-
ARGS="--run-enrich"
185+
ARGS+=(--run-enrich)
144186
fi
145187
if [ -n "$FETCH_DAYS" ]; then
146-
ARGS="$ARGS --fetch-days $FETCH_DAYS"
188+
ARGS+=(--fetch-days "$FETCH_DAYS")
147189
fi
148190
if [ -n "$FETCH_MODE" ]; then
149-
ARGS="$ARGS --fetch-mode $FETCH_MODE"
191+
ARGS+=(--fetch-mode "$FETCH_MODE")
192+
fi
193+
if [ -n "$PROFILE_TAG" ]; then
194+
ARGS+=(--profile-tag "$PROFILE_TAG")
150195
fi
151-
python src/main.py $ARGS --embedding-device cpu --embedding-batch-size 8
196+
python src/main.py "${ARGS[@]}" --embedding-device cpu --embedding-batch-size 8
152197
153198
# 让前端可以在不配置 GitHub Token 的情况下直接读取配置(只读快照)
154199
cp -f config.yaml docs/config.yaml
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
# Runs src/maintain/biorxiv.py three times a day and on manual dispatch.
# The final step exits early (successfully) when SUPABASE_SERVICE_KEY is unset.
name: maintain-biorxiv

on:
  schedule:
    - cron: "45 0 * * *"  # 08:45 Beijing time (UTC 00:45)
    - cron: "45 8 * * *"  # 16:45 Beijing time (UTC 08:45)
    - cron: "45 16 * * *"  # 00:45 Beijing time (UTC 16:45)
  workflow_dispatch:
    inputs:
      fetch_days:
        # Number of days to look back when fetching (default 30).
        description: "回溯抓取天数(默认 30)"
        required: false
        default: "30"
      force_full_window:
        # Whether to ignore the "seen" state and backfill the whole window.
        description: "是否忽略 seen 状态并全量回补当前窗口(true/false)"
        required: false
        default: "false"

permissions:
  contents: read

concurrency:
  # One run at a time; an in-progress run is not cancelled by a newer one.
  group: daily-paper-reader-maintain-biorxiv
  cancel-in-progress: false

jobs:
  maintain_biorxiv:
    # Restrict the job to the two repositories named in the condition.
    if: github.repository == 'ziwenhahaha/daily-paper-reader' || github.repository == '5-xj/daily-paper-reader'
    runs-on: ubuntu-latest
    timeout-minutes: 240
    env:
      SUPABASE_SERVICE_KEY: ${{ secrets.SUPABASE_SERVICE_KEY }}
      SUPABASE_BACKEND_KEY: biorxiv
      SUPABASE_PAPERS_TABLE: biorxiv_papers
      SUPABASE_RETENTION_DAYS: "45"
      DPR_ENABLE_BIORXIV_BACKEND: "1"
      DPR_BIORXIV_ENABLED: "1"
      DPR_BIORXIV_PAPERS_TABLE: biorxiv_papers
      DPR_BIORXIV_VECTOR_RPC_EXACT: match_biorxiv_papers_exact
      DPR_BIORXIV_BM25_RPC: match_biorxiv_papers_bm25
      PYTHONUNBUFFERED: "1"

    steps:
      - name: Checkout
        uses: actions/checkout@v5

      - name: Setup Python
        uses: actions/setup-python@v6
        with:
          python-version: "3.11"

      - name: Cache pip + torch
        uses: actions/cache@v5
        with:
          path: |
            ~/.cache/pip
            ~/.cache/torch
          key: ${{ runner.os }}-dpr-biorxiv-hf-v1-${{ hashFiles('requirements.txt') }}

      - name: Install deps (skip sqlite3)
        run: |
          # Write /tmp/req.txt: requirements.txt minus blank lines and any
          # sqlite3 requirement, then install from that filtered copy.
          python - <<'PY'
          import re
          lines = open("requirements.txt", "r", encoding="utf-8").read().splitlines()
          # The heredoc is quoted, so this pattern reaches Python verbatim; a
          # single backslash is required for the \b word boundary (a doubled
          # one would match a literal backslash and never filter anything).
          lines = [l for l in lines if l.strip() and not re.match(r"^sqlite3\b", l)]
          open("/tmp/req.txt", "w", encoding="utf-8").write("\n".join(lines))
          PY
          python -m pip install --upgrade pip
          python -m pip install uv
          uv pip install --system -r /tmp/req.txt

      - name: Run bioRxiv Maintain Pipeline
        env:
          # Dispatch inputs are passed through the environment rather than
          # being interpolated into the script text, so their content can
          # never be parsed as shell code. Both are empty on scheduled runs.
          DISPATCH_FETCH_DAYS: ${{ github.event.inputs.fetch_days }}
          DISPATCH_FORCE_FULL_WINDOW: ${{ github.event.inputs.force_full_window }}
        run: |
          set -euo pipefail

          if [ -z "${SUPABASE_SERVICE_KEY}" ]; then
            echo "[WARN] 未配置 SUPABASE_SERVICE_KEY,已跳过 bioRxiv 维护同步。"
            exit 0
          fi

          # Tracing starts only after the secret check so that xtrace never
          # expands the service key into a log line.
          set -x

          FETCH_DAYS="${DISPATCH_FETCH_DAYS:-30}"
          FORCE_FULL_WINDOW="${DISPATCH_FORCE_FULL_WINDOW:-false}"

          ARGS=(--fetch-days "$FETCH_DAYS")
          if [ "$FORCE_FULL_WINDOW" = "true" ]; then
            ARGS+=(--force-full-window)
          fi
          python src/maintain/biorxiv.py "${ARGS[@]}"
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
# Runs src/maintain/chemrxiv.py once a day and on manual dispatch.
# The final step exits early (successfully) when SUPABASE_SERVICE_KEY is unset.
name: maintain-chemrxiv

on:
  schedule:
    - cron: "30 1 * * *"  # 09:30 Beijing time (UTC 01:30)
  workflow_dispatch:
    inputs:
      fetch_days:
        # Number of days to look back when fetching (default 400).
        description: "回溯抓取天数(默认 400;镜像源)"
        required: false
        default: "400"
      force_full_window:
        # Whether to ignore the "seen" state and backfill the whole window.
        description: "是否忽略 seen 状态并全量回补当前窗口(true/false)"
        required: false
        default: "false"

permissions:
  contents: read

concurrency:
  # One run at a time; an in-progress run is not cancelled by a newer one.
  group: daily-paper-reader-maintain-chemrxiv
  cancel-in-progress: false

jobs:
  maintain_chemrxiv:
    # Restrict the job to the two repositories named in the condition.
    if: github.repository == 'ziwenhahaha/daily-paper-reader' || github.repository == '5-xj/daily-paper-reader'
    runs-on: ubuntu-latest
    timeout-minutes: 240
    env:
      SUPABASE_SERVICE_KEY: ${{ secrets.SUPABASE_SERVICE_KEY }}
      SUPABASE_BACKEND_KEY: chemrxiv
      SUPABASE_PAPERS_TABLE: chemrxiv_papers
      SUPABASE_RETENTION_DAYS: "45"
      DPR_ENABLE_CHEMRXIV_BACKEND: "1"
      DPR_CHEMRXIV_ENABLED: "1"
      DPR_CHEMRXIV_PAPERS_TABLE: chemrxiv_papers
      DPR_CHEMRXIV_VECTOR_RPC_EXACT: match_chemrxiv_papers_exact
      DPR_CHEMRXIV_BM25_RPC: match_chemrxiv_papers_bm25
      PYTHONUNBUFFERED: "1"

    steps:
      - name: Checkout
        uses: actions/checkout@v5

      - name: Setup Python
        uses: actions/setup-python@v6
        with:
          python-version: "3.11"

      - name: Cache pip + torch
        uses: actions/cache@v5
        with:
          path: |
            ~/.cache/pip
            ~/.cache/torch
          key: ${{ runner.os }}-dpr-chemrxiv-hf-v1-${{ hashFiles('requirements.txt') }}

      - name: Install deps (skip sqlite3)
        run: |
          # Write /tmp/req.txt: requirements.txt minus blank lines and any
          # sqlite3 requirement, then install from that filtered copy.
          python - <<'PY'
          import re
          lines = open("requirements.txt", "r", encoding="utf-8").read().splitlines()
          # The heredoc is quoted, so this pattern reaches Python verbatim; a
          # single backslash is required for the \b word boundary (a doubled
          # one would match a literal backslash and never filter anything).
          lines = [l for l in lines if l.strip() and not re.match(r"^sqlite3\b", l)]
          open("/tmp/req.txt", "w", encoding="utf-8").write("\n".join(lines))
          PY
          python -m pip install --upgrade pip
          python -m pip install uv
          uv pip install --system -r /tmp/req.txt

      - name: Run ChemRxiv Maintain Pipeline
        env:
          # Dispatch inputs are passed through the environment rather than
          # being interpolated into the script text, so their content can
          # never be parsed as shell code. Both are empty on scheduled runs.
          DISPATCH_FETCH_DAYS: ${{ github.event.inputs.fetch_days }}
          DISPATCH_FORCE_FULL_WINDOW: ${{ github.event.inputs.force_full_window }}
        run: |
          set -euo pipefail

          if [ -z "${SUPABASE_SERVICE_KEY}" ]; then
            echo "[WARN] 未配置 SUPABASE_SERVICE_KEY,已跳过 ChemRxiv 维护同步。"
            exit 0
          fi

          # Tracing starts only after the secret check so that xtrace never
          # expands the service key into a log line.
          set -x

          FETCH_DAYS="${DISPATCH_FETCH_DAYS:-400}"
          FORCE_FULL_WINDOW="${DISPATCH_FORCE_FULL_WINDOW:-false}"

          ARGS=(--fetch-days "$FETCH_DAYS")
          if [ "$FORCE_FULL_WINDOW" = "true" ]; then
            ARGS+=(--force-full-window)
          fi
          python src/maintain/chemrxiv.py "${ARGS[@]}"
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
# Runs src/maintain/medrxiv.py once a day and on manual dispatch.
# The final step exits early (successfully) when SUPABASE_SERVICE_KEY is unset.
name: maintain-medrxiv

on:
  schedule:
    - cron: "15 1 * * *"  # 09:15 Beijing time (UTC 01:15)
  workflow_dispatch:
    inputs:
      fetch_days:
        # Number of days to look back when fetching (default 30).
        description: "回溯抓取天数(默认 30)"
        required: false
        default: "30"
      force_full_window:
        # Whether to ignore the "seen" state and backfill the whole window.
        description: "是否忽略 seen 状态并全量回补当前窗口(true/false)"
        required: false
        default: "false"

permissions:
  contents: read

concurrency:
  # One run at a time; an in-progress run is not cancelled by a newer one.
  group: daily-paper-reader-maintain-medrxiv
  cancel-in-progress: false

jobs:
  maintain_medrxiv:
    # Restrict the job to the two repositories named in the condition.
    if: github.repository == 'ziwenhahaha/daily-paper-reader' || github.repository == '5-xj/daily-paper-reader'
    runs-on: ubuntu-latest
    timeout-minutes: 240
    env:
      SUPABASE_SERVICE_KEY: ${{ secrets.SUPABASE_SERVICE_KEY }}
      SUPABASE_BACKEND_KEY: medrxiv
      SUPABASE_PAPERS_TABLE: medrxiv_papers
      SUPABASE_RETENTION_DAYS: "45"
      DPR_ENABLE_MEDRXIV_BACKEND: "1"
      DPR_MEDRXIV_ENABLED: "1"
      DPR_MEDRXIV_PAPERS_TABLE: medrxiv_papers
      DPR_MEDRXIV_VECTOR_RPC_EXACT: match_medrxiv_papers_exact
      DPR_MEDRXIV_BM25_RPC: match_medrxiv_papers_bm25
      PYTHONUNBUFFERED: "1"

    steps:
      - name: Checkout
        uses: actions/checkout@v5

      - name: Setup Python
        uses: actions/setup-python@v6
        with:
          python-version: "3.11"

      - name: Cache pip + torch
        uses: actions/cache@v5
        with:
          path: |
            ~/.cache/pip
            ~/.cache/torch
          key: ${{ runner.os }}-dpr-medrxiv-hf-v1-${{ hashFiles('requirements.txt') }}

      - name: Install deps (skip sqlite3)
        run: |
          # Write /tmp/req.txt: requirements.txt minus blank lines and any
          # sqlite3 requirement, then install from that filtered copy.
          python - <<'PY'
          import re
          lines = open("requirements.txt", "r", encoding="utf-8").read().splitlines()
          # The heredoc is quoted, so this pattern reaches Python verbatim; a
          # single backslash is required for the \b word boundary (a doubled
          # one would match a literal backslash and never filter anything).
          lines = [l for l in lines if l.strip() and not re.match(r"^sqlite3\b", l)]
          open("/tmp/req.txt", "w", encoding="utf-8").write("\n".join(lines))
          PY
          python -m pip install --upgrade pip
          python -m pip install uv
          uv pip install --system -r /tmp/req.txt

      - name: Run medRxiv Maintain Pipeline
        env:
          # Dispatch inputs are passed through the environment rather than
          # being interpolated into the script text, so their content can
          # never be parsed as shell code. Both are empty on scheduled runs.
          DISPATCH_FETCH_DAYS: ${{ github.event.inputs.fetch_days }}
          DISPATCH_FORCE_FULL_WINDOW: ${{ github.event.inputs.force_full_window }}
        run: |
          set -euo pipefail

          if [ -z "${SUPABASE_SERVICE_KEY}" ]; then
            echo "[WARN] 未配置 SUPABASE_SERVICE_KEY,已跳过 medRxiv 维护同步。"
            exit 0
          fi

          # Tracing starts only after the secret check so that xtrace never
          # expands the service key into a log line.
          set -x

          FETCH_DAYS="${DISPATCH_FETCH_DAYS:-30}"
          FORCE_FULL_WINDOW="${DISPATCH_FORCE_FULL_WINDOW:-false}"

          ARGS=(--fetch-days "$FETCH_DAYS")
          if [ "$FORCE_FULL_WINDOW" = "true" ]; then
            ARGS+=(--force-full-window)
          fi
          python src/maintain/medrxiv.py "${ARGS[@]}"

0 commit comments

Comments
 (0)