-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy path2fetcher.py
More file actions
398 lines (329 loc) · 14.8 KB
/
2fetcher.py
File metadata and controls
398 lines (329 loc) · 14.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import urlparse, unquote
from email.utils import parsedate_to_datetime
import re
import time
from datetime import datetime, timedelta
import http.client
http.client._MAXHEADERS = 1000
# Define headers and cookies
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/144.0.0.0 Safari/537.36",
"Accept-Language": "en-US,en;q=0.9",
"Accept-Encoding": "gzip, deflate, br",
"Referer": "https://www.justice.gov/epstein/doj-disclosures",
"Origin": "https://www.justice.gov",
"Cookie": "justiceGovAgeVerified=true"
}
# List of wanted headers for file metadata
wanted_headers = [
'Date', 'Cache-Control', 'Content-Length', 'Etag', 'Expires',
'Last-Modified', 'Pragma', 'Server', 'Quic-Version',
'Content-Type', 'Alt-Svc', 'Accept-Ranges'
]
# Global counter for files processed
file_counter = 0
# Function to update or append to HASHES.txt
def update_hashes_file(basename, last_mod, etag):
hashes_file = 'HASHES.txt'
lines = []
updated = False
if os.path.exists(hashes_file):
with open(hashes_file, 'r', encoding='utf-8') as f:
lines = f.readlines()
pattern = re.compile(rf'^{re.escape(basename)}:\s*(.*)$')
for i, line in enumerate(lines):
match = pattern.match(line.strip())
if match:
lines[i] = f"{basename}: {last_mod}, {etag}\n"
updated = True
break
if not updated:
# Append to file
with open(hashes_file, 'a', encoding='utf-8') as h:
h.write(f"{basename}: {last_mod}, {etag}\n")
else:
# Rewrite entire file with updated line
with open(hashes_file, 'w', encoding='utf-8') as h:
h.writelines(lines)
# Function to get existing hash data from HASHES.txt
def get_hash_data(basename):
"""Returns tuple (last_modified, etag) or (None, None) if not found"""
hashes_file = 'HASHES.txt'
if not os.path.exists(hashes_file):
return None, None
pattern = re.compile(rf'^{re.escape(basename)}:\s*(.*)$')
with open(hashes_file, 'r', encoding='utf-8') as f:
for line in f:
match = pattern.match(line.strip())
if match:
# Parse the data: "Last-Modified, Etag"
data = match.group(1)
parts = data.split(',', 1)
if len(parts) == 2:
return parts[0].strip(), parts[1].strip().strip('"')
return None, None
# Function to save page link every 50 files
def save_page_link(dataset_num, page):
with open('page_link.txt', 'w', encoding='utf-8') as f:
f.write(f"dataset={dataset_num}\n")
f.write(f"page={page}\n")
# Function to load saved page link
def load_page_link():
if not os.path.exists('page_link.txt'):
return None, None
try:
with open('page_link.txt', 'r', encoding='utf-8') as f:
lines = f.readlines()
dataset = None
page = None
for line in lines:
if line.startswith('dataset='):
dataset = int(line.split('=')[1].strip())
elif line.startswith('page='):
page = int(line.split('=')[1].strip())
return dataset, page
except:
return None, None
# Function to check if error already exists in deleted.txt
def is_in_deleted_file(error_line):
if not os.path.exists('deleted.txt'):
return False
with open('deleted.txt', 'r', encoding='utf-8') as f:
for line in f:
if line.strip() == error_line.strip():
return True
return False
# Function to append to deleted.txt if not already present
def append_to_deleted(error_line):
if not is_in_deleted_file(error_line):
with open('deleted.txt', 'a', encoding='utf-8') as f:
f.write(error_line + '\n')
# Function to check if file was processed and when
def check_file_processed(basename, dataset_num, reprocess_enabled):
"""
Returns tuple (processed, skip):
- processed: True if file exists
- skip: True if we should skip (exists and within 24h cooldown or reprocess disabled)
"""
txt_path = f"EFTA/DataSet_{dataset_num}/{basename}.txt"
if not os.path.exists(txt_path):
return False, False
# File exists, check if reprocessing is enabled
if not reprocess_enabled:
return True, True
# Check the Date header to determine when it was last processed
try:
with open(txt_path, 'r', encoding='utf-8') as f:
for line in f:
if line.startswith('Date:'):
date_str = line.split('Date:', 1)[1].strip()
processed_date = parsedate_to_datetime(date_str)
now = datetime.now(processed_date.tzinfo)
time_diff = now - processed_date
# If less than 24 hours, skip
if time_diff < timedelta(hours=24):
return True, True
else:
# More than 24 hours, allow reprocessing
return True, False
except:
# If we can't read the date, allow reprocessing
return True, False
# No Date header found, allow reprocessing
return True, False
# Function to check if important data has changed
def has_data_changed(basename, new_last_mod, new_etag):
"""Check if Etag or Last-Modified has changed"""
old_last_mod, old_etag = get_hash_data(basename)
if old_last_mod is None and old_etag is None:
# No existing data
return True
# Compare Etag and Last-Modified
if new_etag != old_etag or new_last_mod != old_last_mod:
return True
return False
# Interactive start
print("Welcome to the Epstein Files Archiver.")
print("Options:")
print(" 'normal' - Start from dataset 1, page 0")
print(" 'custom' - Specify dataset and page")
print(" 'resume' - Resume from saved page_link.txt")
choice = input("Enter your choice: ").strip().lower()
# Ask about reprocessing
reprocess_input = input("Enable reprocessing after 24h cooldown? (yes/no, default: yes): ").strip().lower()
reprocess_enabled = True if reprocess_input in ['', 'yes', 'y'] else False
if reprocess_enabled:
print("Reprocessing enabled: Files will be rechecked after 24 hours.")
else:
print("Reprocessing disabled: Already processed files will be skipped permanently.")
if choice == 'normal':
dataset_num = 1
start_page = 0
print("Starting normal run from dataset 1, page 0.")
elif choice == 'custom':
try:
dataset_num = int(input("Enter starting dataset number: "))
start_page = int(input("Enter starting page number for that dataset: "))
print(f"Starting from dataset {dataset_num}, page {start_page}.")
except ValueError:
print("Invalid input. Starting normal run from dataset 1.")
dataset_num = 1
start_page = 0
elif choice == 'resume':
dataset_num, start_page = load_page_link()
if dataset_num is None or start_page is None:
print("No saved progress found in page_link.txt. Starting from dataset 1, page 0.")
dataset_num = 1
start_page = 0
else:
print(f"Resuming from saved progress: dataset {dataset_num}, page {start_page}.")
else:
print("Invalid choice. Starting normal run from dataset 1.")
dataset_num = 1
start_page = 0
while True:
base_url = f"https://www.justice.gov/epstein/doj-disclosures/data-set-{dataset_num}-files"
# For the starting page if resuming
page = start_page
current_url = base_url if page == 0 else f"{base_url}?page={page}"
# Fetch the first page (or resumed page)
resp = requests.get(current_url, headers=headers, allow_redirects=False)
if resp.status_code == 403:
# Save the 403 response
os.makedirs('unreleased-datasets', exist_ok=True)
html_path = f"unreleased-datasets/data-set-{dataset_num}-files.html"
txt_path = f"unreleased-datasets/data-set-{dataset_num}-files.txt"
with open(html_path, 'w', encoding='utf-8') as f:
f.write(resp.text)
headers_txt = f"Status-Code: {resp.status_code}\n"
sorted_resp_headers = sorted(resp.headers.items(), key=lambda x: x[0])
for k, v in sorted_resp_headers:
headers_txt += f"{k}: {v}\n"
with open(txt_path, 'w', encoding='utf-8') as f:
f.write(headers_txt)
# Go to next dataset
dataset_num += 1
# Reset page to 0 for next dataset
start_page = 0
continue
elif resp.status_code == 404:
# Save the 404 response with fixed name as specified
os.makedirs('unreleased-datasets', exist_ok=True)
html_path = "unreleased-datasets/404.html"
txt_path = "unreleased-datasets/404.txt"
with open(html_path, 'w', encoding='utf-8') as f:
f.write(resp.text)
headers_txt = f"Status-Code: {resp.status_code}\n"
sorted_resp_headers = sorted(resp.headers.items(), key=lambda x: x[0])
for k, v in sorted_resp_headers:
headers_txt += f"{k}: {v}\n"
with open(txt_path, 'w', encoding='utf-8') as f:
f.write(headers_txt)
# Stop the script
break
elif resp.status_code != 200:
# Unexpected status, log and stop
print(f"Unexpected status {resp.status_code} for {current_url}. Stopping.")
break
# Now enter the page loop, starting from the fetched page
while True:
# Parse the current page
soup = BeautifulSoup(resp.text, 'html.parser')
# Find all file links in views-field-title
file_links = []
for field in soup.find_all('div', class_='views-field views-field-title'):
a = field.find('a')
if a and 'href' in a.attrs:
link = a['href']
full_link = 'https://www.justice.gov' + link if link.startswith('/') else link
file_links.append(full_link)
# Track how many files were actually processed on this page
files_processed_on_page = 0
# Process each file link
for link in file_links:
# Extract basename first to check if already processed
path = urlparse(link).path
basename = unquote(path.split('/')[-1])
# Check if file was already processed
processed, skip = check_file_processed(basename, dataset_num, reprocess_enabled)
if skip:
print(f"Skipping {basename} (already processed within 24h or reprocess disabled)")
continue
# Use HEAD to get headers without downloading body
file_resp = requests.head(link, headers=headers, allow_redirects=False)
if 300 <= file_resp.status_code < 400:
# Redirect detected: pause and log
print(f"Redirect detected for {link} (status: {file_resp.status_code})")
with open('redirects.log', 'a', encoding='utf-8') as f:
f.write(f"URL: {link}\n")
f.write(f"Status: {file_resp.status_code}\n")
f.write(f"Location: {file_resp.headers.get('Location', 'N/A')}\n")
f.write(f"Request Headers: {file_resp.request.headers}\n\n")
input("Press Enter to continue...")
continue # Skip saving headers for redirects
if file_resp.status_code != 200:
# Log to deleted.txt with duplicate checking
error_line = f"Failed to fetch headers for {link}: {file_resp.status_code}"
print(error_line)
append_to_deleted(error_line)
continue
# Extract metadata
last_mod = file_resp.headers.get('Last-Modified')
etag = file_resp.headers.get('Etag', 'N/A').strip('"')
# If file was processed before, check if data changed
if processed:
if not has_data_changed(basename, last_mod, etag):
print(f"Skipping {basename} (no changes in Etag or Last-Modified)")
continue
else:
print(f"Reprocessing {basename} (data changed)")
# Create directory
dir_path = f"EFTA/DataSet_{dataset_num}"
os.makedirs(dir_path, exist_ok=True)
# Path for txt file
txt_path = os.path.join(dir_path, f"{basename}.txt")
# Collect and sort wanted headers (only if present)
present_headers = {k: file_resp.headers[k] for k in wanted_headers if k in file_resp.headers}
sorted_headers = sorted(present_headers.items(), key=lambda x: x[0])
# Write to txt
with open(txt_path, 'w', encoding='utf-8') as f:
f.write(f"Status-Code: {file_resp.status_code}\n")
for k, v in sorted_headers:
f.write(f"{k}: {v}\n")
# Set file modified time based on Last-Modified (GMT to timestamp)
if last_mod:
dt = parsedate_to_datetime(last_mod)
timestamp = dt.timestamp()
os.utime(txt_path, (timestamp, timestamp)) # Sets mtime to GMT equivalent; system will display in EST
# Update HASHES.txt (will update if exists, else append)
update_hashes_file(basename, last_mod, f'"{etag}"')
# Increment counters
file_counter += 1
files_processed_on_page += 1
# Save page link every 50 files
if file_counter % 50 == 0:
save_page_link(dataset_num, page)
print(f"Progress saved: {file_counter} files processed (Dataset {dataset_num}, Page {page})")
# Check if we processed any files on this page
if files_processed_on_page == 0:
print(f"No new files to process on page {page}, moving to next page.")
# Check for next page
next_link = soup.find('a', class_='usa-pagination__next-page')
if not next_link:
break
# Fetch next page
page += 1
next_url = f"{base_url}?page={page}"
resp = requests.get(next_url, headers=headers, allow_redirects=False)
if resp.status_code != 200:
print(f"Failed to fetch page {page} for dataset {dataset_num}: {resp.status_code}")
break
# If we completed the dataset (or broke due to page fail), move to next
dataset_num += 1
# Reset page to 0 for next dataset
start_page = 0
page = 0
print("Script completed.")