|
| 1 | +# tests/test_pdf_redirects.py |
| 2 | +""" |
| 3 | +Test that the facs attribute URL of every pb element redirects to a PDF. |
| 4 | +""" |
| 5 | +from glob import glob |
| 6 | +from pyriksdagen.io import parse_tei |
| 7 | +from pathlib import Path |
| 8 | +from tqdm import tqdm |
| 9 | +from trainerlog import get_logger |
| 10 | +from urllib.parse import urljoin |
| 11 | + |
| 12 | +import pandas as pd |
| 13 | +import re |
| 14 | +import requests |
| 15 | +import unittest |
| 16 | + |
| 17 | + |
| 18 | + |
# Module-wide logger, created under the name "url test".
LOGGER = get_logger("url test")
# Tab-separated report of failed checks, written when the test class is torn
# down.
FAILURE_LOG = Path("test/results/pdf_redirect_failures.tsv")
| 21 | + |
| 22 | + |
# Patterns that recognise a client-side redirect inside an HTML page.  Capture
# group 1 of every pattern is the (possibly relative) redirect target.  The
# list order matters: the patterns are tried in sequence by the lookup code.
REDIRECT_PATTERNS = [
    # JavaScript: window.location = "..."  /  window.location.href = "..."
    re.compile(r'window\.location(?:\.href)?\s*=\s*[\'"]([^\'"]+)[\'"]'),
    # JavaScript: location.href = "..."
    re.compile(r'location\.href\s*=\s*[\'"]([^\'"]+)[\'"]'),
    # JavaScript: location.replace("...") / location.assign("..."), with
    # optional whitespace inside the parentheses.
    re.compile(
        r'location\.(?:replace|assign)\(\s*[\'"]([^\'"]+)[\'"]\s*\)'
    ),
    # HTML: <meta http-equiv="refresh" content="0; url=...">.  The URL may be
    # wrapped in its own quotes and "=" may be surrounded by whitespace, so
    # both are skipped before the capture group starts.
    re.compile(
        r'<meta[^>]+http-equiv=["\']refresh["\'][^>]+'
        r'url\s*=\s*["\']?([^"\'>]+)',
        re.I,
    ),
]
| 32 | + |
| 33 | + |
| 34 | + |
def extract_redirect_target(html: str, base_url: str) -> str | None:
    """Return the absolute URL that ``html`` redirects to, if any.

    Each entry of ``REDIRECT_PATTERNS`` is searched in list order and the
    first one that matches supplies the target.

    Args:
        html: Markup of the page to inspect.
        base_url: URL the page was fetched from; a relative target is
            resolved against it.

    Returns:
        The resolved redirect target, or ``None`` when no pattern matches.
    """
    # Lazy generator: the search stops at the first pattern that matches.
    attempts = (pattern.search(html) for pattern in REDIRECT_PATTERNS)
    hit = next((found for found in attempts if found), None)

    if hit is None:
        return None

    raw_target = hit.group(1).strip()
    return urljoin(base_url, raw_target)
| 43 | + |
| 44 | + |
def check_pdf_redirect(source_url: str,
                       session: requests.Session,
                       ) -> tuple[bool, str, str | None]:
    """Check that ``source_url`` leads, via a client-side redirect, to a PDF.

    The source page is fetched, a JavaScript or ``<meta>`` refresh redirect is
    extracted from its HTML, and the first five bytes of the redirect target
    are compared with the ``%PDF-`` signature.

    Args:
        source_url: URL of the page expected to contain the redirect.
        session: Session used for both HTTP requests.

    Returns:
        A ``(ok, message, target_url)`` tuple.  ``ok`` is ``True`` only when
        the target starts with ``%PDF-``.  ``message`` is ``"ok"`` or a short
        description of the failure.  ``target_url`` is the resolved redirect
        target, or ``None`` when none had been determined.  Errors derived
        from ``requests.RequestException`` are reported through the tuple
        instead of being raised.
    """
    signature = b"%PDF-"
    # Bound before the ``try`` so that a network error raised while probing
    # the target can still report which target was being fetched.
    target_url = None

    try:
        # 1. Fetch redirect page
        r = session.get(source_url, timeout=15)

        if r.status_code != 200:
            return (
                False,
                f"source returned HTTP {r.status_code}",
                None,
            )

        # 2. Extract JS/meta redirect
        target_url = extract_redirect_target(r.text, source_url)

        if not target_url:
            return (
                False,
                "no JS/meta redirect target found",
                None,
            )

        # 3. Probe PDF without downloading whole file.  The response is
        # streamed, so it is closed explicitly (context manager); otherwise
        # the connection stays checked out whenever the server ignores the
        # Range header and the body is left unread.
        with session.get(
            target_url,
            headers={"Range": "bytes=0-4"},
            stream=True,
            timeout=20,
        ) as p:
            if p.status_code not in {200, 206}:
                return (
                    False,
                    f"target returned HTTP {p.status_code}",
                    target_url,
                )

            # A chunk may be shorter than ``chunk_size``, so keep reading
            # until enough bytes for the signature have arrived.
            first_bytes = b""
            for chunk in p.iter_content(chunk_size=len(signature)):
                first_bytes += chunk
                if len(first_bytes) >= len(signature):
                    break
            first_bytes = first_bytes[:len(signature)]

        if first_bytes != signature:
            return (
                False,
                f"target does not start with %PDF-: {first_bytes!r}",
                target_url,
            )

        return (
            True,
            "ok",
            target_url,
        )

    except requests.RequestException as e:
        return (
            False,
            f"{type(e).__name__}: {e}",
            target_url,
        )
| 105 | + |
| 106 | + |
| 107 | + |
| 108 | + |
| 109 | + |
class TestPdfRedirects(unittest.TestCase):
    """Check that every ``facs`` URL found on a ``pb`` element leads to a PDF.

    The URLs are collected once per class from the XML files matching
    ``data/*/*.xml``.  Every problem is accumulated in ``failures`` and
    written to ``FAILURE_LOG`` when the class is torn down.
    """

    @classmethod
    def setUpClass(cls):
        """Open the shared HTTP session and collect the URLs to test."""
        # Both lists must exist before the files are scanned: the scan appends
        # to ``failures`` for every ``pb`` element without a ``facs``
        # attribute, and those entries have to survive into the final report.
        cls.urls = []
        cls.failures = []

        cls.session = requests.Session()
        cls.session.headers.update({
            "User-Agent": "pdf-redirect-test/1.0"
        })

        LOGGER.info("Loading URL data.")
        # Paths containing "reg" or "fort" are excluded from the check.
        motions = [
            m for m in sorted(glob("data/*/*.xml"))
            if "reg" not in m and "fort" not in m
        ]
        for motion in tqdm(motions):
            root, ns = parse_tei(motion)
            for pb in root.findall(f".//{ns['tei_ns']}pb"):
                if "facs" in pb.attrib:
                    cls.urls.append((motion, pb.attrib["facs"]))
                else:
                    LOGGER.warning(f"pb w/o facs attrib in {motion}")
                    cls.failures.append(
                        [motion, None, None, "pb w/o facs attrib"]
                    )

    @classmethod
    def tearDownClass(cls):
        """Write the failure report and close the HTTP session."""
        try:
            failuredf = pd.DataFrame(
                cls.failures,
                columns=["motion", "src_url", "tgt_url", "Failure_type"],
            )
            # The results directory may not exist yet.
            FAILURE_LOG.parent.mkdir(parents=True, exist_ok=True)
            # ``index=False`` is an option of ``to_csv`` (leave out the row
            # index column); the DataFrame constructor's ``index`` argument
            # expects row labels instead.
            failuredf.to_csv(FAILURE_LOG, sep='\t', index=False)
        finally:
            # Release the connections even when writing the report fails.
            cls.session.close()

    def test_pdf_redirects(self):
        """Probe every collected URL and fail if any problem was recorded."""
        LOGGER.info(f"Testing {len(self.urls)} URLs")
        for motion, source_url in tqdm(self.urls):
            ok, message, target_url = check_pdf_redirect(source_url,
                                                         self.session)

            if not ok:
                LOGGER.warning(f"{source_url}, {target_url}, {message}")
                self.failures.append([motion,
                                      source_url,
                                      target_url,
                                      message])

        # ``failures`` also holds the entries recorded while collecting URLs.
        self.assertEqual(len(self.failures), 0,
                         (f"{len(self.failures)} failures found. "
                          f"See {FAILURE_LOG}"))
| 166 | + |
| 167 | + |
| 168 | + |
| 169 | + |
# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
0 commit comments