-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathmain.py
More file actions
127 lines (111 loc) · 4.52 KB
/
main.py
File metadata and controls
127 lines (111 loc) · 4.52 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import hashlib
import requests
import csv
import os
import re
from datetime import datetime
# When True, every detected change writes a cleaned and an original snapshot
# of the page into the snapshots/ directory (see save_snapshot).
SAVE_SNAPSHOT = True # Set to False if you don't want to save snapshots
# Input list of pages to monitor: one URL per line, optionally followed by
# ";ignore-file" naming a regex list under ignore/ (parsed by load_urls).
PAGES_FILE = "pages.txt"
# CSV persisting "url,checksum,last-change-date" rows between runs.
CHECKSUM_FILE = "checksum.csv"
def load_urls(filename):
    """Read the list of pages to monitor from *filename*.

    Each non-blank, non-comment ("#") line is either a bare URL or
    "URL;ignore-file". Returns a list of (url, ignore_file) tuples where
    ignore_file is None when no ignore file was given.

    Fix: open with an explicit UTF-8 encoding — the default is
    platform-dependent and could mis-decode non-ASCII URLs or file names.
    """
    urls = []
    with open(filename, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            # Skip blank lines and comments.
            if not line or line.startswith("#"):
                continue
            if ";" in line:
                # Only split on the first ";" so the ignore-file name may
                # itself contain semicolons.
                url, ignore_file = line.split(";", 1)
                urls.append((url.strip(), ignore_file.strip()))
            else:
                urls.append((line, None))
    return urls
def load_checksums(filename):
    """Load previously saved checksums from the CSV at *filename*.

    Returns {url: (checksum, date)}. Missing file yields an empty dict.

    Fix: the original indexed row[0..2] unconditionally, so a blank line
    (csv.reader yields an empty list for it) or a hand-edited short row
    raised IndexError; such rows are now skipped. Also opens with an
    explicit UTF-8 encoding to match the rest of the pipeline.
    """
    checksums = {}
    if os.path.exists(filename):
        with open(filename, newline='', encoding="utf-8") as csvfile:
            for row in csv.reader(csvfile):
                # Tolerate blank/truncated rows instead of crashing.
                if len(row) >= 3:
                    checksums[row[0]] = (row[1], row[2])
    return checksums
def save_checksums(filename, checksums):
    """Persist *checksums* ({url: (checksum, date)}) as CSV rows of url,checksum,date."""
    with open(filename, "w", newline='') as csvfile:
        csv.writer(csvfile).writerows(
            [url, digest, stamp] for url, (digest, stamp) in checksums.items()
        )
def clean_html(html):
    """Normalize raw HTML bytes so volatile, cosmetic differences don't alter the checksum.

    Strips image-URL cache busters, ASP.NET hidden state fields, script
    blocks, HTML comments, and collapses runs of whitespace. Operates on
    and returns bytes.
    """
    # (pattern, replacement) pairs, applied in order.
    substitutions = (
        # Drop "?12345"-style cache busters from image URLs.
        (re.compile(rb'(\.(jpg|png|gif|jpeg))\?\d+', re.IGNORECASE), rb'\1'),
        # Hidden ASP.NET fields change on every request — remove them.
        (re.compile(rb'<input[^>]+name="__VIEWSTATE"[^>]*>', re.IGNORECASE), b''),
        (re.compile(rb'<input[^>]+name="__VIEWSTATEGENERATOR"[^>]*>', re.IGNORECASE), b''),
        (re.compile(rb'<input[^>]+name="__EVENTVALIDATION"[^>]*>', re.IGNORECASE), b''),
        # Entire <script>...</script> blocks.
        (re.compile(rb'<script.*?>.*?</script>', re.DOTALL | re.IGNORECASE), b''),
        # HTML comments.
        (re.compile(rb'<!--.*?-->', re.DOTALL), b''),
        # Collapse all whitespace runs to a single space.
        (re.compile(rb'\s+'), b' '),
    )
    for pattern, replacement in substitutions:
        html = pattern.sub(replacement, html)
    return html.strip()
def remove_ignore_patterns(content, ignore_file):
    """Delete every regex listed in ignore/<ignore_file> from *content* bytes.

    Returns *content* unchanged when no ignore file is given or the file
    does not exist. Pattern lines that are blank or start with "#" are
    skipped.
    """
    if not ignore_file:
        return content
    path = os.path.join("ignore", ignore_file)
    if not os.path.exists(path):
        return content
    with open(path, "r") as handle:
        lines = [raw.strip() for raw in handle]
    for pattern in lines:
        if pattern and not pattern.startswith("#"):
            # Patterns are stored as text; encode to match the bytes content.
            content = re.sub(pattern.encode(), b'', content)
    return content
def get_checksum(content):
    """Return the hexadecimal SHA-256 digest of *content* bytes."""
    digest = hashlib.sha256()
    digest.update(content)
    return digest.hexdigest()
def save_snapshot(new_checksum, cleaned_content, old_checksum, original_content, url):
    """Write cleaned and original page snapshots (named by checksum) and log the change.

    Assumes the snapshots/ directory already exists (main() creates it).
    Appends a "[timestamp] PREV:... - NEW:... - URL: ..." line to
    snapshots/snapshots.log.
    """
    # Cleaned page under <checksum>.html, raw page under <checksum>.orig.html.
    for suffix, payload in ((".html", cleaned_content), (".orig.html", original_content)):
        target = os.path.join("snapshots", f"{new_checksum}{suffix}")
        with open(target, "wb") as out:
            out.write(payload)
    timestamp = datetime.now().isoformat()
    entry = f"[{timestamp}] PREV:{old_checksum} - NEW:{new_checksum} - URL: {url}\n"
    with open(os.path.join("snapshots", "snapshots.log"), "a") as log_file:
        log_file.write(entry)
def main():
    """Fetch each monitored page, compare against stored checksums, and report changes.

    Reads URLs from PAGES_FILE and prior state from CHECKSUM_FILE; writes
    the updated state back and, when SAVE_SNAPSHOT is set, saves snapshots
    of every changed page.
    """
    print("Fetching and processing web pages...")
    urls = load_urls(PAGES_FILE)
    old_checksums = load_checksums(CHECKSUM_FILE)
    new_checksums = {}
    changed_urls = []
    os.makedirs("snapshots", exist_ok=True)
    for url, ignore_file in urls:
        try:
            resp = requests.get(url, timeout=10)
            resp.raise_for_status()
            cleaned_content = clean_html(resp.content)
            cleaned_content = remove_ignore_patterns(cleaned_content, ignore_file)
            checksum = get_checksum(cleaned_content)
            old_checksum, old_date = old_checksums.get(url, (None, None))
            if old_checksum != checksum:
                # New page or changed page: record the change time.
                date = datetime.now().isoformat()
                changed_urls.append((url, date))
                if SAVE_SNAPSHOT:
                    save_snapshot(checksum, cleaned_content, old_checksum, resp.content, url)
            else:
                date = old_date
            new_checksums[url] = (checksum, date)
        except Exception as e:
            print(f"Error fetching {url}: {e}")
            # Bug fix: previously a failed fetch dropped the URL from the
            # saved state entirely, so the next successful run reported a
            # spurious "change" and reset the snapshot history. Keep the
            # old entry so transient errors don't lose state.
            if url in old_checksums:
                new_checksums[url] = old_checksums[url]
    if changed_urls:
        print("Changed URLs since last snapshot:")
        for url, date in changed_urls:
            print(f"{url} (changed at {date})")
    else:
        print("No changes detected.")
    save_checksums(CHECKSUM_FILE, new_checksums)
if __name__ == "__main__":
    main()