Skip to content

Commit 85a5462

Browse files
committed
Modularize scoring and agent search
1 parent d264705 commit 85a5462

16 files changed

Lines changed: 715 additions & 375 deletions

.env.example

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,4 @@ CEREBRAS_API_BASE=https://api.cerebras.ai/v1
7878

7979
GEMINI_MAX_PAPERS=5
8080
LLM_RELEVANCE_THRESHOLD=6
81-
# Legacy knob from pre-0.7 batch reranking. Listwise reranking now uses LLM_MAX_CANDIDATES.
82-
LLM_BATCH_SIZE=5
8381
LLM_MAX_CANDIDATES=30

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,9 +199,11 @@ Fallback:
199199
- Beginner (English): [docs/manuals/MANUAL_FIRSTTIME_EN.md](docs/manuals/MANUAL_FIRSTTIME_EN.md)
200200
- Full operations (English): [docs/manuals/MANUAL_EN.md](docs/manuals/MANUAL_EN.md)
201201
- Agent/tool integration (English): [docs/manuals/MANUAL_AGENT_EN.md](docs/manuals/MANUAL_AGENT_EN.md)
202+
- Scoring policy (English): [docs/manuals/SCORING_POLICY_EN.md](docs/manuals/SCORING_POLICY_EN.md)
202203
- Beginner (Korean): [docs/manuals/MANUAL_FIRSTTIME_KR.md](docs/manuals/MANUAL_FIRSTTIME_KR.md)
203204
- Full operations (Korean): [docs/manuals/MANUAL_KR.md](docs/manuals/MANUAL_KR.md)
204205
- Agent/tool integration (Korean): [docs/manuals/MANUAL_AGENT_KR.md](docs/manuals/MANUAL_AGENT_KR.md)
206+
- Scoring policy (Korean): [docs/manuals/SCORING_POLICY_KR.md](docs/manuals/SCORING_POLICY_KR.md)
205207
- Korean README: [docs/manuals/README_KR.md](docs/manuals/README_KR.md)
206208

207209
## Contact

app/agent_search.py

Lines changed: 300 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,300 @@
1+
from __future__ import annotations
2+
3+
from dataclasses import replace
4+
from datetime import datetime, timezone
5+
from typing import Any, Dict, List
6+
7+
from paper_digest_app import (
8+
AppConfig,
9+
DigestStats,
10+
Paper,
11+
ResearchProject,
12+
TopicProfile,
13+
build_diagnostics_lines,
14+
can_use_cerebras_fallback,
15+
can_use_openai_compat_provider,
16+
clean_text,
17+
coerce_bool,
18+
coerce_keyword_weights,
19+
collect_and_rank_papers,
20+
dedupe_list,
21+
generate_topics_from_projects,
22+
normalize_output_language,
23+
normalize_relevance_mode,
24+
resolve_search_request,
25+
LLM_RELEVANCE_MODE_DEFAULT,
26+
)
27+
28+
29+
def build_agent_projects_input(
    project_name: str,
    research_context: str,
    keywords: List[str],
) -> List[Dict[str, str]]:
    """Build the one-element project list handed to topic generation.

    The cleaned research context and the non-empty cleaned keywords are
    folded into a single ``context`` string; keywords are appended as a
    ``Keywords: a, b`` clause, separated from the context by `` | `` when
    both are present.

    Args:
        project_name: Display name; falls back to ``"Untitled project"``
            when it cleans to an empty value.
        research_context: Free-text description of the research need.
        keywords: Extra terms; each is stringified and cleaned, and
            entries that clean to an empty value are dropped.

    Returns:
        A list holding one ``{"name": ..., "context": ...}`` dict.
    """
    context = clean_text(research_context)
    keyword_terms = [
        term for term in (clean_text(str(raw)) for raw in keywords) if term
    ]
    if keyword_terms:
        keyword_clause = f"Keywords: {', '.join(keyword_terms)}"
        context = f"{context} | {keyword_clause}" if context else keyword_clause
    project = {
        "name": clean_text(project_name) or "Untitled project",
        "context": context,
    }
    return [project]
43+
44+
45+
def build_topic_profiles_from_generated_topics(topics: List[Dict[str, Any]]) -> List[TopicProfile]:
    """Convert generated topic dicts into ``TopicProfile`` objects.

    Entries that are not dicts, or that end up without a name or without
    any keyword weights, are skipped.

    Args:
        topics: Topic dicts; the ``name``, ``keywords`` and
            ``relevance_mode`` keys are read.

    Returns:
        One ``TopicProfile`` per usable topic, in input order.
    """
    result: List[TopicProfile] = []
    for entry in topics:
        if not isinstance(entry, dict):
            continue
        topic_name = clean_text(str(entry.get("name", "")))
        weights = coerce_keyword_weights(entry.get("keywords", []))
        mode = normalize_relevance_mode(
            entry.get("relevance_mode", LLM_RELEVANCE_MODE_DEFAULT)
        )
        if topic_name and weights:
            result.append(
                TopicProfile(name=topic_name, keywords=weights, relevance_mode=mode)
            )
    return result
65+
66+
67+
def _collect_topic_queries(topics: List[Dict[str, Any]], field: str) -> List[str]:
    """Return the cleaned, de-duplicated values of ``field`` across topics."""
    queries: List[str] = []
    for topic in topics:
        # Skip malformed entries, matching how topic profiles are built.
        if not isinstance(topic, dict):
            continue
        raw_value = topic.get(field)
        if raw_value is None:
            # A missing or null field must not turn into the query "None".
            continue
        cleaned = clean_text(str(raw_value))
        if cleaned:
            queries.append(cleaned)
    return dedupe_list(queries)


def clone_config_for_agent_request(
    base_config: AppConfig,
    project_name: str,
    research_context: str,
    keywords: List[str],
    generated_topics: List[Dict[str, Any]],
    top_k: int,
    output_language: str | None = None,
    model: str | None = None,
    source_policy: Dict[str, Any] | None = None,
) -> AppConfig:
    """Derive a per-request config from ``base_config`` for an agent search.

    ``base_config`` is not mutated; a copy is produced with
    ``dataclasses.replace``.

    Args:
        base_config: Configuration to copy.
        project_name: Name of the single research project in the copy.
        research_context: Project description, merged with ``keywords``.
        keywords: Extra terms appended to the project context.
        generated_topics: Topic dicts supplying topic profiles and the
            per-source query strings. Non-dict entries are ignored.
        top_k: Requested result count; clamped to the range 1..50.
        output_language: Overrides ``base_config.output_language`` when
            truthy.
        model: When it cleans to a non-empty value, overrides the Gemini,
            OpenAI-compatible and Cerebras model names alike.
        source_policy: Optional per-source switches keyed ``arxiv``,
            ``pubmed``, ``semantic_scholar`` and ``google_scholar``.

    Returns:
        The adjusted ``AppConfig`` copy.
    """
    topic_profiles = build_topic_profiles_from_generated_topics(generated_topics)
    arxiv_queries = _collect_topic_queries(generated_topics, "arxiv_query")
    pubmed_queries = _collect_topic_queries(generated_topics, "pubmed_query")
    semantic_queries = _collect_topic_queries(generated_topics, "semantic_scholar_query")
    google_queries = _collect_topic_queries(generated_topics, "google_scholar_query")

    requested_output_language = normalize_output_language(
        output_language or base_config.output_language
    )
    normalized_keywords = [
        term for term in (clean_text(str(item)) for item in keywords) if term
    ]

    policy = source_policy or {}
    use_arxiv = coerce_bool(policy.get("arxiv"), True)
    use_pubmed = coerce_bool(policy.get("pubmed"), True)
    # Scholar sources default to whatever the base config enables.
    use_semantic_scholar = coerce_bool(
        policy.get("semantic_scholar"),
        base_config.enable_semantic_scholar,
    )
    use_google_scholar = coerce_bool(
        policy.get("google_scholar"),
        base_config.enable_google_scholar,
    )

    # Avoid handing None to clean_text; an absent model means "keep defaults".
    requested_model = clean_text(model) if model else ""

    project_input = build_agent_projects_input(
        project_name,
        research_context,
        normalized_keywords,
    )[0]

    return replace(
        base_config,
        research_projects=[
            ResearchProject(
                name=project_input["name"],
                context=project_input["context"],
                send_frequency="daily",
                send_interval_days=1,
            )
        ],
        topic_profiles=topic_profiles,
        arxiv_queries=arxiv_queries if use_arxiv else [],
        pubmed_queries=pubmed_queries if use_pubmed else [],
        semantic_scholar_queries=semantic_queries,
        google_scholar_queries=google_queries,
        enable_semantic_scholar=use_semantic_scholar,
        enable_google_scholar=use_google_scholar,
        max_papers=max(1, min(50, int(top_k))),
        output_language=requested_output_language,
        gemini_model=requested_model or base_config.gemini_model,
        openai_compat_model=requested_model or base_config.openai_compat_model,
        cerebras_model=requested_model or base_config.cerebras_model,
    )
150+
151+
152+
def map_agent_status(stats: DigestStats, papers: List[Paper]) -> str:
    """Translate a search outcome into the agent-facing status string.

    Args:
        stats: Run statistics; only ``no_results_reason`` is read, and
            only when ``papers`` is empty.
        papers: The papers selected for the response.

    Returns:
        ``"ok"`` when any paper was selected; otherwise
        ``"outside_horizon"``, ``"below_threshold"`` or ``"no_candidates"``
        according to ``stats.no_results_reason``, and ``"error"`` for any
        other reason.
    """
    if papers:
        return "ok"
    status_by_reason = {
        "outside_horizon": "outside_horizon",
        "below_threshold": "below_threshold",
        "none_retrieved": "no_candidates",
        "no_candidates": "no_candidates",
    }
    return status_by_reason.get(stats.no_results_reason, "error")
162+
163+
164+
def describe_agent_llm_backend(config: AppConfig) -> Dict[str, str]:
    """Report which LLM provider and model the config selects.

    Providers are checked in priority order: Gemini (when an API key is
    set), then the OpenAI-compatible provider, then the Cerebras fallback.

    Args:
        config: Configuration to inspect.

    Returns:
        A dict with ``provider`` and ``model`` keys; ``provider`` is
        ``"none"`` and ``model`` is empty when no backend is usable.
    """
    provider, model_name = "none", ""
    if config.gemini_api_key:
        provider, model_name = "gemini", config.gemini_model
    elif can_use_openai_compat_provider(config):
        provider, model_name = "openai_compatible", config.openai_compat_model
    elif can_use_cerebras_fallback(config):
        provider, model_name = "cerebras", config.cerebras_model
    return {"provider": provider, "model": model_name}
172+
173+
174+
def _list_queried_sources(config: AppConfig) -> List[str]:
    """Return display labels of the sources the request config will query."""
    candidates = [
        (bool(config.arxiv_queries), "arXiv"),
        (bool(config.pubmed_queries), "PubMed"),
        (
            config.enable_semantic_scholar and bool(config.semantic_scholar_queries),
            "Semantic Scholar",
        ),
        (
            config.enable_google_scholar and bool(config.google_scholar_queries),
            "Google Scholar",
        ),
    ]
    return [label for enabled, label in candidates if enabled]


def _describe_primary_topic(topic: Dict[str, Any]) -> Dict[str, Any]:
    """Build the ``topic`` section of the response from one topic dict."""
    return {
        "name": clean_text(str(topic.get("name", ""))),
        "keywords": [
            clean_text(str(item))
            # A null keywords field is treated as "no keywords".
            for item in (topic.get("keywords") or [])
            if clean_text(str(item))
        ],
        "relevance_mode": normalize_relevance_mode(
            topic.get("relevance_mode", LLM_RELEVANCE_MODE_DEFAULT)
        ),
        "arxiv_query": clean_text(str(topic.get("arxiv_query", ""))),
        "pubmed_query": clean_text(str(topic.get("pubmed_query", ""))),
        "semantic_scholar_query": clean_text(
            str(topic.get("semantic_scholar_query", ""))
        ),
        "google_scholar_query": clean_text(
            str(topic.get("google_scholar_query", ""))
        ),
    }


def _serialize_ranked_paper(rank: int, paper: Paper) -> Dict[str, Any]:
    """Flatten one ranked paper into its response dict."""
    return {
        "rank": rank,
        "id": paper.paper_id,
        "title": paper.title,
        "authors": ", ".join(paper.authors),
        "source": paper.source,
        "url": paper.url,
        "published_at": paper.published_at_utc.isoformat(),
        "relevance_score": paper.score,
        "relevance_reason": paper.llm_relevance_text,
        "core_point": paper.llm_core_point_text,
        "usefulness": paper.llm_usefulness_text,
        "evidence_spans": list(paper.llm_evidence_spans or []),
        "topic": paper.topic,
        "project_name": paper.project_name,
        "relevance_mode": paper.relevance_mode,
    }


def search_papers_for_agent(
    config: AppConfig,
    project_name: str,
    research_context: str,
    keywords: List[str] | None = None,
    search_intent: str = "best_match",
    time_horizon_key: str = "1y",
    top_k: int = 10,
    output_language: str | None = None,
    model: str | None = None,
    include_diagnostics: bool = False,
    source_policy: Dict[str, Any] | None = None,
) -> Dict[str, Any]:
    """Run a one-off paper search for an agent and return a response dict.

    Topics are generated from the project description, a per-request
    config is derived from ``config``, papers are collected and ranked,
    and the top ``max_papers`` results are serialized.

    Args:
        config: Base application configuration.
        project_name: Name of the research project.
        research_context: Description of the research need; required.
        keywords: Optional extra terms merged into the project context.
        search_intent: Intent key passed to ``resolve_search_request``.
        time_horizon_key: Horizon key passed to ``resolve_search_request``.
        top_k: Requested number of papers (clamped by the request config).
        output_language: Optional output-language override.
        model: Optional model-name override.
        include_diagnostics: When true, diagnostics lines are included.
        source_policy: Optional per-source enable switches.

    Returns:
        A dict with ``status``, ``request``, ``meta``, ``topic``,
        ``papers`` and ``diagnostics`` keys.

    Raises:
        ValueError: If ``research_context`` cleans to an empty value.
    """
    normalized_context = clean_text(research_context)
    if not normalized_context:
        raise ValueError("research_context is required.")

    normalized_keywords = [
        term for term in (clean_text(str(item)) for item in (keywords or [])) if term
    ]
    llm_projects = build_agent_projects_input(
        project_name,
        normalized_context,
        normalized_keywords,
    )
    generated_topics = generate_topics_from_projects(config, llm_projects)
    request_config = clone_config_for_agent_request(
        config,
        project_name=project_name,
        research_context=normalized_context,
        keywords=normalized_keywords,
        generated_topics=generated_topics,
        top_k=top_k,
        output_language=output_language,
        model=model,
        source_policy=source_policy,
    )
    search_request = resolve_search_request(
        request_config,
        search_intent=search_intent,
        time_horizon_key=time_horizon_key,
    )
    now_utc = datetime.now(timezone.utc)
    ranked, stats = collect_and_rank_papers(request_config, now_utc, search_request)
    papers = ranked[: request_config.max_papers]

    # Describe the first well-formed topic; a malformed leading entry must
    # not crash the response assembly.
    primary_topic = next(
        (topic for topic in generated_topics if isinstance(topic, dict)),
        {},
    )
    backend = describe_agent_llm_backend(request_config)

    return {
        "status": map_agent_status(stats, papers),
        "request": {
            "project_name": clean_text(project_name) or llm_projects[0]["name"],
            "search_intent": search_request.intent,
            "time_horizon": search_request.time_horizon_key,
            "top_k": request_config.max_papers,
            "output_language": request_config.output_language,
        },
        "meta": {
            "intent_label": search_request.intent_label,
            "requested_horizon_label": search_request.time_horizon_label,
            "window_used_label": stats.window_used_label or search_request.time_horizon_label,
            "query_plan_label": stats.query_plan_label or "generated topic queries",
            "used_provider": backend["provider"],
            "used_model": backend["model"],
            "sources_queried": _list_queried_sources(request_config),
            "scanned_count": stats.post_time_filter_candidates or stats.total_candidates,
            "selected_count": len(papers),
            "threshold_used": stats.ranking_threshold,
            "notice": stats.search_notice,
        },
        "topic": _describe_primary_topic(primary_topic),
        "papers": [
            _serialize_ranked_paper(rank, paper)
            for rank, paper in enumerate(papers, start=1)
        ],
        "diagnostics": build_diagnostics_lines(stats) if include_diagnostics else [],
    }

app/onboarding_wizard.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -263,7 +263,6 @@ def write_env_file(path: Path, values: Dict[str, str]) -> None:
263263
f"CEREBRAS_API_BASE={values_to_write['CEREBRAS_API_BASE']}",
264264
f"GEMINI_MAX_PAPERS={values_to_write['GEMINI_MAX_PAPERS']}",
265265
f"LLM_RELEVANCE_THRESHOLD={values_to_write['LLM_RELEVANCE_THRESHOLD']}",
266-
f"LLM_BATCH_SIZE={values_to_write['LLM_BATCH_SIZE']}",
267266
f"LLM_MAX_CANDIDATES={values_to_write['LLM_MAX_CANDIDATES']}",
268267
]
269268
path.write_text("\n".join(lines) + "\n", encoding="utf-8")
@@ -428,7 +427,6 @@ def main() -> int:
428427
"CEREBRAS_API_BASE": cerebras_api_base,
429428
"GEMINI_MAX_PAPERS": "5",
430429
"LLM_RELEVANCE_THRESHOLD": "6",
431-
"LLM_BATCH_SIZE": "5",
432430
"LLM_MAX_CANDIDATES": "30",
433431
}
434432

0 commit comments

Comments
 (0)