Skip to content

Add text extraction alternatives (newspaper3k, goose3) #13

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 28 additions & 5 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
from telegram import InlineKeyboardButton, InlineKeyboardMarkup
from telegram.ext import CommandHandler, MessageHandler, CallbackQueryHandler, filters, ApplicationBuilder
from youtube_transcript_api import YouTubeTranscriptApi
from newspaper import Article
from goose3 import Goose

telegram_token = os.environ.get("TELEGRAM_TOKEN", "xxx")
model = os.environ.get("LLM_MODEL", "gpt-3.5-turbo-16k")
Expand All @@ -35,12 +37,33 @@ def scrape_text_from_url(url):
downloaded = trafilatura.fetch_url(url)
text = trafilatura.extract(downloaded, include_formatting=True)
if text is None:
return []
text_chunks = text.split("\n")
article_content = [text for text in text_chunks if text]
return article_content
raise ValueError("Trafilatura returned None")
except Exception as e:
print(f"Error: {e}")
print(f"Trafilatura failed: {e}")
try:
print("Trying newspaper3k...")
article = Article(url)
article.download()
article.parse()
text = article.text
if not text:
raise ValueError("Newspaper3k returned empty text")
except Exception as e:
print(f"Newspaper3k failed: {e}")
try:
print("Trying Goose3...")
g = Goose()
article = g.extract(url=url)
text = article.cleaned_text
if not text:
raise ValueError("Goose3 returned empty text")
except Exception as e:
print(f"Goose3 failed: {e}")
return []

text_chunks = text.split("\n")
article_content = [text for text in text_chunks if text]
return article_content

async def search_results(keywords):
print(keywords, ddg_region)
Expand Down
6 changes: 6 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,12 @@ litellm==1.37.9
# text extraction
trafilatura==1.9.0

# text extraction fallback (tried first when trafilatura fails)
newspaper3k==0.2.8

# text extraction fallback (tried last, when newspaper3k also fails)
goose3==3.1.8

# duckduckgo
duckduckgo_search==5.3.0b4

Expand Down