Skip to content

Commit f0baed4

Browse files
committed
feat: add facs url + redirect test
1 parent 7cbd819 commit f0baed4

1 file changed

Lines changed: 171 additions & 0 deletions

File tree

test/test-facs.py

Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
# test/test-facs.py
2+
"""
3+
Test that the `facs` attribute URL of every `pb` element redirects to a PDF.
4+
"""
5+
from glob import glob
6+
from pyriksdagen.io import parse_tei
7+
from pathlib import Path
8+
from tqdm import tqdm
9+
from trainerlog import get_logger
10+
from urllib.parse import urljoin
11+
12+
import pandas as pd
13+
import re
14+
import requests
15+
import unittest
16+
17+
18+
19+
# Module-wide logger used for progress messages and per-URL failure warnings.
LOGGER = get_logger("url test")
20+
# Tab-separated report of every failed check; written in tearDownClass and
# referenced in the assertion message of the test.
FAILURE_LOG = Path("test/results/pdf_redirect_failures.tsv")
21+
22+
23+
REDIRECT_PATTERNS = [
24+
re.compile(r'window\.location(?:\.href)?\s*=\s*[\'"]([^\'"]+)[\'"]'),
25+
re.compile(r'location\.href\s*=\s*[\'"]([^\'"]+)[\'"]'),
26+
re.compile(r'location\.replace\([\'"]([^\'"]+)[\'"]\)'),
27+
re.compile(
28+
r'<meta[^>]+http-equiv=["\']refresh["\'][^>]+url=([^"\'>]+)',
29+
re.I,
30+
),
31+
]
32+
33+
34+
35+
def extract_redirect_target(html: str, base_url: str) -> str | None:
36+
for pattern in REDIRECT_PATTERNS:
37+
match = pattern.search(html)
38+
39+
if match:
40+
return urljoin(base_url, match.group(1).strip())
41+
42+
return None
43+
44+
45+
def check_pdf_redirect(source_url: str,
                       session: requests.Session,
                       ) -> tuple[bool, str, str | None]:
    """Check that ``source_url`` redirects (via JS/meta refresh) to a PDF.

    The source page is fetched, its client-side redirect target is extracted
    with ``extract_redirect_target``, and the first five bytes of the target
    are compared with the PDF magic number ``%PDF-``.

    Args:
        source_url: URL of the redirect page.
        session: Session used for both HTTP requests.

    Returns:
        ``(ok, message, target_url)``. ``ok`` is True only when the target
        starts with ``%PDF-``; ``message`` is ``"ok"`` or a description of
        the failure; ``target_url`` is the extracted redirect target, or
        ``None`` if none was extracted.
    """
    # Bound before the try block so that a network error raised while probing
    # the target still reports which URL was being probed.
    target_url = None

    try:
        # 1. Fetch redirect page
        r = session.get(source_url, timeout=15)

        if r.status_code != 200:
            return (
                False,
                f"source returned HTTP {r.status_code}",
                None,
            )

        # 2. Extract JS/meta redirect
        target_url = extract_redirect_target(r.text, source_url)

        if not target_url:
            return (
                False,
                "no JS/meta redirect target found",
                None,
            )

        # 3. Probe PDF without downloading whole file.  With stream=True the
        #    body is never fully consumed, so the response must be closed
        #    explicitly to release its connection.
        with session.get(
            target_url,
            headers={"Range": "bytes=0-4"},
            stream=True,
            timeout=20,
        ) as p:
            # 200 is accepted as well as 206 in case the Range header is
            # not honoured by the server.
            if p.status_code not in {200, 206}:
                return (
                    False,
                    f"target returned HTTP {p.status_code}",
                    target_url,
                )

            first_bytes = next(p.iter_content(chunk_size=5), b"")

        if first_bytes != b"%PDF-":
            return (
                False,
                f"target does not start with %PDF-: {first_bytes!r}",
                target_url,
            )

        return (
            True,
            "ok",
            target_url,
        )

    except requests.RequestException as e:
        return (
            False,
            f"{type(e).__name__}: {e}",
            target_url,
        )
105+
106+
107+
108+
109+
110+
class TestPdfRedirects(unittest.TestCase):
    """Check that every ``facs`` URL on a ``pb`` element leads to a PDF.

    URLs are collected once per class from the XML files matched by
    ``data/*/*.xml``.  Every failure is accumulated in ``cls.failures`` and
    written to ``FAILURE_LOG`` as a TSV when the class is torn down.
    """

    @classmethod
    def setUpClass(cls):
        """Collect the URLs to test and open a shared HTTP session."""
        # Both lists must exist before fetch_urls() runs: it appends to
        # cls.failures for every pb element that lacks a facs attribute, and
        # those entries must not be reset afterwards.
        cls.urls = []
        cls.failures = []

        def fetch_urls():
            LOGGER.info("Loading URL data.")
            # NOTE(review): "reg"/"fort" are matched as substrings of the
            # whole path, directories included -- confirm that is intended.
            motions = [
                m for m in sorted(glob("data/*/*.xml"))
                if "reg" not in m and "fort" not in m
            ]
            for motion in tqdm(motions):
                root, ns = parse_tei(motion)
                for pb in root.findall(f".//{ns['tei_ns']}pb"):
                    if "facs" in pb.attrib:
                        cls.urls.append((motion, pb.attrib["facs"]))
                    else:
                        LOGGER.warning(f"pb w/o facs attrib in {motion}")
                        cls.failures.append(
                            [motion, None, None, "pb w/o facs attrib"]
                        )

        fetch_urls()

        # Opened after the scan: tearDownClass is skipped when setUpClass
        # raises, so a parsing error must not leave a session behind.
        cls.session = requests.Session()
        cls.session.headers.update({
            "User-Agent": "pdf-redirect-test/1.0"
        })

    @classmethod
    def tearDownClass(cls):
        """Write the failure report to ``FAILURE_LOG`` and close the session."""
        try:
            failuredf = pd.DataFrame(
                cls.failures,
                columns=["motion", "src_url", "tgt_url", "Failure_type"],
            )
            # The results directory may not exist on a fresh checkout.
            FAILURE_LOG.parent.mkdir(parents=True, exist_ok=True)
            # index=False belongs to to_csv (omit the row-number column);
            # it is not a valid index for the DataFrame constructor.
            failuredf.to_csv(FAILURE_LOG, sep='\t', index=False)
        finally:
            # Release the connection pool even if writing the report fails.
            cls.session.close()

    def test_pdf_redirects(self):
        """Probe every collected URL and fail if any check did not pass."""
        LOGGER.info(f"Testing {len(self.urls)} URLs")
        for motion, source_url in tqdm(self.urls):
            ok, message, target_url = check_pdf_redirect(source_url,
                                                         self.session,)

            if not ok:
                LOGGER.warning(f"{source_url}, {target_url}, {message}")
                self.failures.append([motion,
                                      source_url,
                                      target_url,
                                      message,])

        # A single assertion after the loop, so that every URL is probed and
        # recorded rather than stopping at the first failure.
        self.assertEqual(len(self.failures), 0,
                         (f"{len(self.failures)} failures found. "
                          f"See {FAILURE_LOG}"),)
166+
167+
168+
169+
170+
# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()

0 commit comments

Comments
 (0)