-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathmain.py
More file actions
127 lines (111 loc) · 4.52 KB
/
main.py
File metadata and controls
127 lines (111 loc) · 4.52 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import hashlib
import requests
import csv
import os
import re
from datetime import datetime
# When True, every detected change writes a cleaned and an original snapshot
# of the page into the snapshots/ directory (see save_snapshot).
SAVE_SNAPSHOT = True # Set to False if you don't want to save snapshots
# Input list of pages to monitor: one URL per line, optionally followed by
# ";ignore-file" naming a regex list under ignore/ (parsed by load_urls).
PAGES_FILE = "pages.txt"
# CSV persisting "url,checksum,last-change-date" rows between runs.
CHECKSUM_FILE = "checksum.csv"
def load_urls(filename):
    """Read the list of pages to monitor from *filename*.

    Each non-blank, non-comment ("#") line is either a bare URL or
    "URL;ignore-file". Returns a list of (url, ignore_file) tuples where
    ignore_file is None when no ignore file was given.

    Fix: open with an explicit UTF-8 encoding — the default is
    platform-dependent and could mis-decode non-ASCII URLs or file names.
    """
    urls = []
    with open(filename, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            # Skip blank lines and comments.
            if not line or line.startswith("#"):
                continue
            if ";" in line:
                # Only split on the first ";" so the ignore-file name may
                # itself contain semicolons.
                url, ignore_file = line.split(";", 1)
                urls.append((url.strip(), ignore_file.strip()))
            else:
                urls.append((line, None))
    return urls
def load_checksums(filename):
    """Load previously saved checksums from the CSV at *filename*.

    Returns {url: (checksum, date)}. Missing file yields an empty dict.

    Fix: the original indexed row[0..2] unconditionally, so a blank line
    (csv.reader yields an empty list for it) or a hand-edited short row
    raised IndexError; such rows are now skipped. Also opens with an
    explicit UTF-8 encoding to match the rest of the pipeline.
    """
    checksums = {}
    if os.path.exists(filename):
        with open(filename, newline='', encoding="utf-8") as csvfile:
            for row in csv.reader(csvfile):
                # Tolerate blank/truncated rows instead of crashing.
                if len(row) >= 3:
                    checksums[row[0]] = (row[1], row[2])
    return checksums
def save_checksums(filename, checksums):
    """Persist *checksums* ({url: (checksum, date)}) as CSV rows of url,checksum,date."""
    with open(filename, "w", newline='') as csvfile:
        csv.writer(csvfile).writerows(
            [url, digest, stamp] for url, (digest, stamp) in checksums.items()
        )
def clean_html(html):
    """Normalize raw HTML bytes so volatile, cosmetic differences don't alter the checksum.

    Strips image-URL cache busters, ASP.NET hidden state fields, script
    blocks, HTML comments, and collapses runs of whitespace. Operates on
    and returns bytes.
    """
    # (pattern, replacement) pairs, applied in order.
    substitutions = (
        # Drop "?12345"-style cache busters from image URLs.
        (re.compile(rb'(\.(jpg|png|gif|jpeg))\?\d+', re.IGNORECASE), rb'\1'),
        # Hidden ASP.NET fields change on every request — remove them.
        (re.compile(rb'<input[^>]+name="__VIEWSTATE"[^>]*>', re.IGNORECASE), b''),
        (re.compile(rb'<input[^>]+name="__VIEWSTATEGENERATOR"[^>]*>', re.IGNORECASE), b''),
        (re.compile(rb'<input[^>]+name="__EVENTVALIDATION"[^>]*>', re.IGNORECASE), b''),
        # Entire <script>...</script> blocks.
        (re.compile(rb'<script.*?>.*?</script>', re.DOTALL | re.IGNORECASE), b''),
        # HTML comments.
        (re.compile(rb'<!--.*?-->', re.DOTALL), b''),
        # Collapse all whitespace runs to a single space.
        (re.compile(rb'\s+'), b' '),
    )
    for pattern, replacement in substitutions:
        html = pattern.sub(replacement, html)
    return html.strip()
def remove_ignore_patterns(content, ignore_file):
    """Delete every regex listed in ignore/<ignore_file> from *content* bytes.

    Returns *content* unchanged when no ignore file is given or the file
    does not exist. Pattern lines that are blank or start with "#" are
    skipped.
    """
    if not ignore_file:
        return content
    path = os.path.join("ignore", ignore_file)
    if not os.path.exists(path):
        return content
    with open(path, "r") as handle:
        lines = [raw.strip() for raw in handle]
    for pattern in lines:
        if pattern and not pattern.startswith("#"):
            # Patterns are stored as text; encode to match the bytes content.
            content = re.sub(pattern.encode(), b'', content)
    return content
def get_checksum(content):
    """Return the hexadecimal SHA-256 digest of *content* bytes."""
    digest = hashlib.sha256()
    digest.update(content)
    return digest.hexdigest()
def save_snapshot(new_checksum, cleaned_content, old_checksum, original_content, url):
    """Write cleaned and original page snapshots (named by checksum) and log the change.

    Assumes the snapshots/ directory already exists (main() creates it).
    Appends a "[timestamp] PREV:... - NEW:... - URL: ..." line to
    snapshots/snapshots.log.
    """
    # Cleaned page under <checksum>.html, raw page under <checksum>.orig.html.
    for suffix, payload in ((".html", cleaned_content), (".orig.html", original_content)):
        target = os.path.join("snapshots", f"{new_checksum}{suffix}")
        with open(target, "wb") as out:
            out.write(payload)
    timestamp = datetime.now().isoformat()
    entry = f"[{timestamp}] PREV:{old_checksum} - NEW:{new_checksum} - URL: {url}\n"
    with open(os.path.join("snapshots", "snapshots.log"), "a") as log_file:
        log_file.write(entry)
def main():
    """Fetch each monitored page, compare against stored checksums, and report changes.

    Reads URLs from PAGES_FILE and prior state from CHECKSUM_FILE; writes
    the updated state back and, when SAVE_SNAPSHOT is set, saves snapshots
    of every changed page.
    """
    print("Fetching and processing web pages...")
    urls = load_urls(PAGES_FILE)
    old_checksums = load_checksums(CHECKSUM_FILE)
    new_checksums = {}
    changed_urls = []
    os.makedirs("snapshots", exist_ok=True)
    for url, ignore_file in urls:
        try:
            resp = requests.get(url, timeout=10)
            resp.raise_for_status()
            cleaned_content = clean_html(resp.content)
            cleaned_content = remove_ignore_patterns(cleaned_content, ignore_file)
            checksum = get_checksum(cleaned_content)
            old_checksum, old_date = old_checksums.get(url, (None, None))
            if old_checksum != checksum:
                # New page or changed page: record the change time.
                date = datetime.now().isoformat()
                changed_urls.append((url, date))
                if SAVE_SNAPSHOT:
                    save_snapshot(checksum, cleaned_content, old_checksum, resp.content, url)
            else:
                date = old_date
            new_checksums[url] = (checksum, date)
        except Exception as e:
            print(f"Error fetching {url}: {e}")
            # Bug fix: previously a failed fetch dropped the URL from the
            # saved state entirely, so the next successful run reported a
            # spurious "change" and reset the snapshot history. Keep the
            # old entry so transient errors don't lose state.
            if url in old_checksums:
                new_checksums[url] = old_checksums[url]
    if changed_urls:
        print("Changed URLs since last snapshot:")
        for url, date in changed_urls:
            print(f"{url} (changed at {date})")
    else:
        print("No changes detected.")
    save_checksums(CHECKSUM_FILE, new_checksums)
if __name__ == "__main__":
    main()