#!/usr/bin/env python3
"""Ad-hoc probes for finding scrapeable financial-news sources (Yahoo/Google Finance)."""
import requests
from bs4 import BeautifulSoup
import logging
# Surface INFO-level records from any library that logs via the stdlib logging module.
logging.basicConfig(level=logging.INFO)
def test_yahoo_news_search():
    """Test different Yahoo Finance URLs for news content"""
    session = requests.Session()
    # Present a desktop-browser UA so Yahoo serves the regular HTML page.
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    })
    # Candidate URL patterns that might expose article listings.
    candidate_urls = [
        'https://finance.yahoo.com/news/',
        'https://finance.yahoo.com/topic/stock-market-news/',
        'https://finance.yahoo.com/search?p=AAPL',
        'https://finance.yahoo.com/quote/AAPL/news/',
        'https://finance.yahoo.com/u/yahoo-finance/watchlists/most-watched/',
    ]
    banner = '=' * 60
    for url in candidate_urls:
        print(f"\n{banner}")
        print(f"Testing URL: {url}")
        print(banner)
        try:
            response = session.get(url, timeout=15)
            print(f"Status code: {response.status_code}")
            if response.status_code != 200:
                continue
            soup = BeautifulSoup(response.content, 'html.parser')
            page_title = soup.title
            print(f"Page title: {page_title.text if page_title else 'No title'}")
            # Collect headline-looking h2/h3 elements that carry a link,
            # either as a child anchor or as an enclosing anchor parent.
            potential_articles = []
            for heading in soup.find_all(['h2', 'h3'])[:10]:
                anchor = heading.find('a')
                if not anchor:
                    parent = heading.parent
                    anchor = parent if parent and parent.name == 'a' else None
                if anchor and hasattr(anchor, 'get'):
                    href = anchor.get('href', '')
                    text = heading.get_text(strip=True)
                    # Skip very short strings — unlikely to be real headlines.
                    if text and len(text) > 10:
                        potential_articles.append((href, text[:100]))
            print(f"Found {len(potential_articles)} potential articles from headers")
            for i, (href, text) in enumerate(potential_articles[:5]):
                print(f" {i}: {href} - {text}")
            # Also count container divs whose class name hints at article content.
            container_keywords = ('story', 'article', 'item')
            article_containers = soup.find_all(
                'div',
                class_=lambda cls: cls and any(kw in str(cls).lower() for kw in container_keywords),
            )
            print(f"Found {len(article_containers)} article containers")
        except Exception as e:
            # Best-effort probe: report the failure and move on to the next URL.
            print(f"Error: {e}")
def test_alternative_sources():
    """Test alternative news sources"""
    session = requests.Session()
    # Desktop-browser UA so Google serves the regular HTML page.
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    })
    banner = '=' * 60
    print(f"\n{banner}")
    print("Testing Google Finance for AAPL news")
    print(banner)
    try:
        url = 'https://www.google.com/finance/quote/AAPL:NASDAQ'
        response = session.get(url, timeout=15)
        print(f"Status code: {response.status_code}")
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            # Keep anchors whose visible text is long enough to be a headline
            # and mentions Apple/AAPL.
            keywords = ['apple', 'aapl']
            news_articles = []
            for link in soup.find_all('a', href=True):
                text = link.get_text(strip=True)
                if text and len(text) > 20 and any(kw in text.lower() for kw in keywords):
                    news_articles.append((link.get('href', ''), text[:100]))
            print(f"Found {len(news_articles)} potential AAPL articles from Google Finance")
            for i, (href, text) in enumerate(news_articles[:5]):
                print(f" {i}: {href} - {text}")
    except Exception as e:
        # Best-effort probe: report and return rather than crash the script.
        print(f"Error with Google Finance: {e}")
if __name__ == "__main__":
    # Run the Yahoo Finance probes first, then the Google Finance fallback probe.
    test_yahoo_news_search()
    test_alternative_sources()