akd/agents/search/deep_search.py (13 changes: 10 additions & 3 deletions)
@@ -30,6 +30,7 @@
     LinkRelevancyAssessorInputSchema,
 )
 from akd.tools.scrapers import (
+    PDFScraperInputSchema,
     ScraperToolInputSchema,
     SimplePDFScraper,
     SimpleWebScraper,
@@ -464,13 +465,19 @@ async def _execute_searches(
                 logger.debug(
                     f"Relevancy assessment summary: {assessment_output.assessment_summary}",
                 )
+                if self.web_scraper and assessment_output.filtered_results:
+                    assessment_output.filtered_results = (
+                        await self._fetch_full_content_for_high_relevancy(
+                            assessment_output.filtered_results,
+                        )
+                    )
                 return assessment_output.filtered_results
             except Exception as e:
                 logger.warning(f"Error in relevancy assessment: {e}")
 
         # Fetch full content for high-relevancy results if enabled
-        if self.web_scraper and all_results:
-            all_results = await self._fetch_full_content_for_high_relevancy(all_results)
+        # if self.web_scraper and all_results:
+        #     all_results = await self._fetch_full_content_for_high_relevancy(all_results)
 
         return all_results

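Note on the behavior change in this hunk: full-content fetching now runs inside the successful-assessment path, so only results that survive relevancy filtering get scraped; the old unconditional fetch over `all_results` is left commented out, and the fallback path returns raw results untouched. A condensed sketch of the resulting flow (the method signature, result-gathering step, and schema fields are assumed for illustration, not shown in the diff):

```python
async def _execute_searches(self, params):
    all_results = await self._gather_search_results(params)  # hypothetical helper
    try:
        assessment_output = await self.relevancy_assessor.arun(
            LinkRelevancyAssessorInputSchema(results=all_results),  # fields assumed
        )
        if self.web_scraper and assessment_output.filtered_results:
            # Scrape full content only for the high-relevancy subset,
            # not for every raw search hit.
            assessment_output.filtered_results = (
                await self._fetch_full_content_for_high_relevancy(
                    assessment_output.filtered_results,
                )
            )
        return assessment_output.filtered_results
    except Exception as e:
        logger.warning(f"Error in relevancy assessment: {e}")
    # If assessment fails, return the raw results without fetching
    # full content (the previous unconditional fetch is commented out).
    return all_results
```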
@@ -496,7 +503,7 @@ async def _fetch_full_content_for_high_relevancy(
             # Try PDF first if available
             if hasattr(result, "pdf_url") and result.pdf_url and self.pdf_scraper:
                 try:
-                    pdf_input = ScraperToolInputSchema(url=str(result.pdf_url))
+                    pdf_input = PDFScraperInputSchema(url=str(result.pdf_url))
                     pdf_content = await self.pdf_scraper.arun(pdf_input)
                     if pdf_content.content and len(pdf_content.content) > 500:
                         result.content = pdf_content.content
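Note on the schema fix: the PDF scraper was previously handed a `ScraperToolInputSchema`, the generic web-scraper input, which explains the new `PDFScraperInputSchema` import above. If each tool validates its input against a declared `input_schema`, as is common in this style of tool framework, the mismatched type fails at the tool boundary. A minimal sketch of that validation pattern (the field names and the `isinstance` check are illustrative assumptions, not akd's actual code):

```python
from pydantic import BaseModel

class ScraperToolInputSchema(BaseModel):
    url: str  # generic web page to scrape

class PDFScraperInputSchema(BaseModel):
    url: str  # direct link to a PDF document

class SimplePDFScraper:
    input_schema = PDFScraperInputSchema

    async def arun(self, params):
        # A strict tool rejects inputs built from the wrong schema,
        # which is what the one-line fix in the hunk above avoids.
        if not isinstance(params, self.input_schema):
            raise TypeError(
                f"Expected {self.input_schema.__name__}, "
                f"got {type(params).__name__}",
            )
        ...  # download and extract the PDF
```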
akd/tools/scrapers/_base.py (5 changes: 4 additions & 1 deletion)
@@ -395,7 +395,10 @@ async def _download_pdf_from_url(
         temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
         headers = self.headers
         try:
-            async with httpx.AsyncClient(timeout=self.timeout) as client:
+            async with httpx.AsyncClient(
+                timeout=self.timeout,
+                follow_redirects=True,
+            ) as client:
                 async with client.stream("GET", url, headers=headers) as response:
                     response.raise_for_status()
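Why `follow_redirects=True` matters here: unlike `requests`, httpx does not follow redirects by default, and its `Response.raise_for_status()` raises on any non-2xx status, redirects included. PDF links behind DOI resolvers or publisher mirrors commonly answer with a 301/302, so the streaming download would fail before this change. A small standalone demonstration (the echo-service URL is just an example of any endpoint that redirects):

```python
import asyncio
import httpx

async def main() -> None:
    url = "https://httpbin.org/redirect/1"  # any URL that 302-redirects

    # Default client: the 302 response itself comes back, and calling
    # resp.raise_for_status() would raise, since httpx treats every
    # non-2xx response as an error.
    async with httpx.AsyncClient() as client:
        resp = await client.get(url)
        print(resp.status_code)  # 302

    # With follow_redirects=True the client chases the Location header
    # and hands back the final 200 response, so streaming the PDF works.
    async with httpx.AsyncClient(follow_redirects=True) as client:
        resp = await client.get(url)
        print(resp.status_code)  # 200

asyncio.run(main())
```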