Skip to content

Enhance search pipeline with advanced configuration and API support #6

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
results/*
*.pyc
*.md
*.txt
!*.py
87 changes: 76 additions & 11 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,24 +7,85 @@

from search_session import SearchSession


def load_config(config_path):
    """Load a YAML configuration file into a dict.

    Args:
        config_path: Filesystem path to the YAML configuration file.

    Returns:
        The parsed configuration as a dict. Returns {} when the file
        does not exist OR when it parses to an empty document —
        yaml.safe_load yields None for empty/comment-only files, which
        would otherwise leak None to dict-expecting callers.
    """
    if not os.path.isfile(config_path):
        return {}
    with open(config_path, "r") as f:
        # safe_load (never yaml.load) avoids arbitrary object
        # construction from untrusted YAML; `or {}` normalizes the
        # None result of an empty document to an empty dict.
        return yaml.safe_load(f) or {}


def main():
parser = argparse.ArgumentParser(description="Multi-step RAG pipeline with depth-limited searching.")
parser = argparse.ArgumentParser(
description="Multi-step RAG pipeline with depth-limited searching."
)
parser.add_argument("--query", type=str, required=True, help="Initial user query")
parser.add_argument("--config", type=str, default="config.yaml", help="Path to YAML configuration file")
parser.add_argument("--corpus_dir", type=str, default=None, help="Path to local corpus folder")
parser.add_argument("--device", type=str, default="cpu", help="Device for retrieval model (cpu or cuda)")
parser.add_argument("--retrieval_model", type=str, choices=["colpali", "all-minilm"], default="colpali")
parser.add_argument("--top_k", type=int, default=3, help="Number of local docs to retrieve")
parser.add_argument("--web_search", action="store_true", default=False, help="Enable web search")
parser.add_argument("--personality", type=str, default=None, help="Optional personality for Gemma (e.g. cheerful)")
parser.add_argument("--rag_model", type=str, default="gemma", help="Which model to use for final RAG steps")
parser.add_argument("--max_depth", type=int, default=1, help="Depth limit for subquery expansions")
parser.add_argument(
"--config",
type=str,
default="config.yaml",
help="Path to YAML configuration file",
)
parser.add_argument(
"--corpus_dir", type=str, default=None, help="Path to local corpus folder"
)
parser.add_argument(
"--device",
type=str,
default="cpu",
help="Device for retrieval model (cpu or cuda)",
)
parser.add_argument(
"--retrieval_model",
type=str,
choices=["colpali", "all-minilm"],
default="colpali",
)
parser.add_argument(
"--top_k", type=int, default=3, help="Number of local docs to retrieve"
)
parser.add_argument(
"--web_search", action="store_true", default=False, help="Enable web search"
)
parser.add_argument(
"--ddg_proxy",
type=str,
default=None,
help="Proxy for DuckDuckGo searches (format: http://user:pass@host:port)",
)
parser.add_argument(
"--personality",
type=str,
default=None,
help="Optional personality for Gemma (e.g. cheerful)",
)
parser.add_argument(
"--base_url",
type=str,
default="https://api.openai.com/v1",
help="Base URL for API (default: OpenAI official)",
)
parser.add_argument(
"--rag_model",
type=str,
default="gemma",
help="Model name (e.g. 'gemma', 'gpt-4-turbo', 'google/gemini-2.0-flash-001')",
)
parser.add_argument(
"--max_depth", type=int, default=1, help="Depth limit for subquery expansions"
)
parser.add_argument(
"--ollama_model",
type=str,
default="gemma2:2b",
help="Ollama model for non-final tasks (query enhancement, summarization)",
)
parser.add_argument(
"--max_context",
type=int,
default=24000, # ~16k tokens
help="Max context size in characters for final aggregation (default: 24000 ~16k tokens)",
)
args = parser.parse_args()

config = load_config(args.config)
Expand All @@ -39,7 +100,11 @@ def main():
web_search_enabled=args.web_search,
personality=args.personality,
rag_model=args.rag_model,
max_depth=args.max_depth
max_depth=args.max_depth,
base_url=args.base_url,
ddg_proxy=args.ddg_proxy,
ollama_model=args.ollama_model,
max_context=args.max_context,
)

loop = asyncio.get_event_loop()
Expand Down
29 changes: 19 additions & 10 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,17 +1,26 @@
# Core Dependencies
torch
transformers
sentence-transformers
numpy
torch>=2.3.0
transformers>=4.40.0
sentence-transformers>=3.0.0
numpy>=1.26.4
pyyaml

# Web and Parsing
aiohttp
duckduckgo_search
beautifulsoup4
pymupdf # PyMuPDF for PDF handling
aiohttp>=3.9.3
duckduckgo-search>=3.9.2
beautifulsoup4>=4.12.3
pymupdf>=1.24.0 # PyMuPDF (fitz)
pytesseract
Pillow # for image handling
Pillow>=10.3.0

# Optional LLM Integration
ollama
ollama>=0.1.14
openai>=1.30.1
tiktoken>=0.7.0 # For OpenAI token counting

# Environment configuration
python-dotenv>=1.0.1

# Browser-based page rendering and HTML-to-text conversion
playwright>=1.46.0
html2text>=2020.1.16
Loading