#!/usr/bin/env python3
"""Ad-hoc probes for finding scrapeable financial-news sources (Yahoo/Google Finance)."""
import requests
from bs4 import BeautifulSoup
import logging
# Surface INFO-level records from any library that logs via the stdlib logging module.
logging.basicConfig(level=logging.INFO)
def test_yahoo_news_search():
    """Test different Yahoo Finance URLs for news content"""
    session = requests.Session()
    # Present a desktop-browser UA so Yahoo serves the regular HTML page.
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    })
    # Candidate URL patterns that might expose article listings.
    candidate_urls = [
        'https://finance.yahoo.com/news/',
        'https://finance.yahoo.com/topic/stock-market-news/',
        'https://finance.yahoo.com/search?p=AAPL',
        'https://finance.yahoo.com/quote/AAPL/news/',
        'https://finance.yahoo.com/u/yahoo-finance/watchlists/most-watched/',
    ]
    banner = '=' * 60
    for url in candidate_urls:
        print(f"\n{banner}")
        print(f"Testing URL: {url}")
        print(banner)
        try:
            response = session.get(url, timeout=15)
            print(f"Status code: {response.status_code}")
            if response.status_code != 200:
                continue
            soup = BeautifulSoup(response.content, 'html.parser')
            page_title = soup.title
            print(f"Page title: {page_title.text if page_title else 'No title'}")
            # Collect headline-looking h2/h3 elements that carry a link,
            # either as a child anchor or as an enclosing anchor parent.
            potential_articles = []
            for heading in soup.find_all(['h2', 'h3'])[:10]:
                anchor = heading.find('a')
                if not anchor:
                    parent = heading.parent
                    anchor = parent if parent and parent.name == 'a' else None
                if anchor and hasattr(anchor, 'get'):
                    href = anchor.get('href', '')
                    text = heading.get_text(strip=True)
                    # Skip very short strings — unlikely to be real headlines.
                    if text and len(text) > 10:
                        potential_articles.append((href, text[:100]))
            print(f"Found {len(potential_articles)} potential articles from headers")
            for i, (href, text) in enumerate(potential_articles[:5]):
                print(f" {i}: {href} - {text}")
            # Also count container divs whose class name hints at article content.
            container_keywords = ('story', 'article', 'item')
            article_containers = soup.find_all(
                'div',
                class_=lambda cls: cls and any(kw in str(cls).lower() for kw in container_keywords),
            )
            print(f"Found {len(article_containers)} article containers")
        except Exception as e:
            # Best-effort probe: report the failure and move on to the next URL.
            print(f"Error: {e}")
def test_alternative_sources():
    """Test alternative news sources"""
    session = requests.Session()
    # Desktop-browser UA so Google serves the regular HTML page.
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    })
    banner = '=' * 60
    print(f"\n{banner}")
    print("Testing Google Finance for AAPL news")
    print(banner)
    try:
        url = 'https://www.google.com/finance/quote/AAPL:NASDAQ'
        response = session.get(url, timeout=15)
        print(f"Status code: {response.status_code}")
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            # Keep anchors whose visible text is long enough to be a headline
            # and mentions Apple/AAPL.
            keywords = ['apple', 'aapl']
            news_articles = []
            for link in soup.find_all('a', href=True):
                text = link.get_text(strip=True)
                if text and len(text) > 20 and any(kw in text.lower() for kw in keywords):
                    news_articles.append((link.get('href', ''), text[:100]))
            print(f"Found {len(news_articles)} potential AAPL articles from Google Finance")
            for i, (href, text) in enumerate(news_articles[:5]):
                print(f" {i}: {href} - {text}")
    except Exception as e:
        # Best-effort probe: report and return rather than crash the script.
        print(f"Error with Google Finance: {e}")
if __name__ == "__main__":
    # Run the Yahoo Finance probes first, then the Google Finance fallback probe.
    test_yahoo_news_search()
    test_alternative_sources()