diff --git a/main.py b/main.py index 732edb0..d083b2b 100644 --- a/main.py +++ b/main.py @@ -10,6 +10,8 @@ from telegram import InlineKeyboardButton, InlineKeyboardMarkup from telegram.ext import CommandHandler, MessageHandler, CallbackQueryHandler, filters, ApplicationBuilder from youtube_transcript_api import YouTubeTranscriptApi +from newspaper import Article +from goose3 import Goose telegram_token = os.environ.get("TELEGRAM_TOKEN", "xxx") model = os.environ.get("LLM_MODEL", "gpt-3.5-turbo-16k") @@ -35,12 +37,33 @@ def scrape_text_from_url(url): downloaded = trafilatura.fetch_url(url) text = trafilatura.extract(downloaded, include_formatting=True) if text is None: - return [] - text_chunks = text.split("\n") - article_content = [text for text in text_chunks if text] - return article_content + raise ValueError("Trafilatura returned None") except Exception as e: - print(f"Error: {e}") + print(f"Trafilatura failed: {e}") + try: + print("Trying newspaper3k...") + article = Article(url) + article.download() + article.parse() + text = article.text + if not text: + raise ValueError("Newspaper3k returned empty text") + except Exception as e: + print(f"Newspaper3k failed: {e}") + try: + print("Trying Goose3...") + g = Goose() + article = g.extract(url=url) + text = article.cleaned_text + if not text: + raise ValueError("Goose3 returned empty text") + except Exception as e: + print(f"Goose3 failed: {e}") + return [] + + text_chunks = text.split("\n") + article_content = [text for text in text_chunks if text] + return article_content async def search_results(keywords): print(keywords, ddg_region) diff --git a/requirements.txt b/requirements.txt index fc63e13..0a3cf71 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,6 +10,12 @@ litellm==1.37.9 # text extraction trafilatura==1.9.0 +# text extraction alternative +newspaper3k==0.2.8 + +# text extraction alternative +goose3==3.1.8 + # duckduckgo duckduckgo_search==5.3.0b4