school-strategist/scrape_pta.py at main · snehsach19/school-strategist · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import json
from datetime import datetime, timedelta
from pathlib import Path

import requests
from bs4 import BeautifulSoup

PTA_URL = "https://losalamitospta.membershiptoolkit.com/home"
DATA_DIR = Path("data")
PTA_FILE = DATA_DIR / "pta_page.json"
CACHE_HOURS = 24


def is_cache_fresh():
    """Check if cached PTA data is less than 24 hours old."""
    if not PTA_FILE.exists():
        return False

    with open(PTA_FILE) as f:
        data = json.load(f)

    scraped_at = datetime.fromisoformat(data.get("scraped_at", "2000-01-01"))
    return datetime.now() - scraped_at < timedelta(hours=CACHE_HOURS)


def fetch_pta_page(url):
    """Fetch the PTA homepage HTML."""
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return response.text


def extract_text(html):
    """Extract main content text from HTML, stripping nav/footer/scripts."""
    soup = BeautifulSoup(html, "html.parser")

    # Remove non-content elements
    for tag in soup.find_all(["script", "style", "nav", "footer", "header", "noscript"]):
        tag.decompose()

    # Try to find main content area
    main = soup.find("main") or soup.find("div", {"role": "main"}) or soup.find("body")
    if main is None:
        main = soup

    text = main.get_text(separator="\n", strip=True)

    # Collapse excessive blank lines
    lines = [line for line in text.splitlines() if line.strip()]
    return "\n".join(lines)


def extract_images(html, base_url):
    """Extract image URLs from HTML, focusing on event flyers."""
    soup = BeautifulSoup(html, "html.parser")

    images = []
    for img in soup.find_all("img"):
        src = img.get("src", "")
        alt = img.get("alt", "")

        # Skip tiny images (icons, spacers)
        width = img.get("width", "")
        height = img.get("height", "")
        if width and width.isdigit() and int(width) < 50:
            continue
        if height and height.isdigit() and int(height) < 50:
            continue

        # Skip common non-flyer images
        skip_patterns = ["logo", "icon", "avatar", "profile", "spacer", "pixel"]
        if any(p in src.lower() or p in alt.lower() for p in skip_patterns):
            continue

        if src:
            # Make absolute URL if needed
            if src.startswith("//"):
                src = "https:" + src
            elif src.startswith("/"):
                from urllib.parse import urljoin
                src = urljoin(base_url, src)

            images.append({
                "url": src,
                "alt": alt,
            })

    return images


def save_pta_data(text, url, images=None):
    """Save scraped PTA data to JSON."""
    DATA_DIR.mkdir(exist_ok=True)

    data = {
        "source_url": url,
        "scraped_at": datetime.now().isoformat(),
        "text": text,
        "images": images or [],
    }

    with open(PTA_FILE, "w") as f:
        json.dump(data, f, indent=2)

    return PTA_FILE


def main(force=False):
    """Scrape PTA homepage with 24h cache."""
    if not force and is_cache_fresh():
        print(f"PTA cache is fresh (< {CACHE_HOURS}h old). Skipping scrape.")
        print(f"  Cached file: {PTA_FILE}")
        return

    print(f"Fetching PTA page: {PTA_URL}")
    html = fetch_pta_page(PTA_URL)

    print("Extracting text content...")
    text = extract_text(html)

    print("Extracting images...")
    images = extract_images(html, PTA_URL)

    print(f"Extracted {len(text)} characters of text and {len(images)} images")

    output = save_pta_data(text, PTA_URL, images)
    print(f"Saved to {output}")

    if images:
        print("Images found:")
        for img in images[:10]:  # Show first 10
            print(f"  - {img['alt'] or 'No alt'}: {img['url'][:80]}...")


if __name__ == "__main__":
    import sys
    force = "--force" in sys.argv or "-f" in sys.argv
    main(force=force)