|
3 | 3 | from dataclasses import dataclass |
4 | 4 | from html import escape |
5 | 5 | import re |
| 6 | +import sys |
6 | 7 | import uuid |
7 | 8 | from pathlib import Path |
8 | 9 | from typing import Iterable |
|
13 | 14 | from pygments.lexer import RegexLexer, words |
14 | 15 | from pygments.token import Comment, Keyword, Name, Number, Operator, Punctuation, String, Text |
15 | 16 |
|
| 17 | +# Path.relative_to(walk_up=True) requires Python 3.12+ |
| 18 | +assert sys.version_info >= (3, 12) |
16 | 19 |
|
17 | 20 | def BeautifulSoup(*args): |
18 | 21 | return bs4.BeautifulSoup(*args, features="lxml") |
@@ -266,40 +269,18 @@ def _decorate_title(self, soup) -> None: |
266 | 269 | def _normalize_urls(self, soup, public_path: str) -> None: |
267 | 270 | for tag, attr in (("a", "href"), ("img", "src"), ("script", "src"), ("link", "href")): |
268 | 271 | for element in soup.find_all(tag): |
269 | | - value = element.get(attr) |
270 | | - if not value: |
271 | | - continue |
272 | | - normalized = self._normalize_url(public_path, value) |
273 | | - if normalized is None: |
274 | | - continue |
275 | | - element[attr] = normalized |
| 272 | + if value := element.get(attr): |
| 273 | + if normalized := self._normalize_url(public_path, value): |
| 274 | + element[attr] = normalized |
276 | 275 |
|
277 | 276 | def _normalize_url(self, public_path: str, value: str) -> str | None: |
278 | | - if not value or value.startswith(("data:", "mailto:", "javascript:", "tel:", "#")): |
| 277 | + if not value or value.startswith(("data:", "mailto:", "javascript:", "tel:", "#", "http:", "https:")): |
279 | 278 | return None |
280 | | - |
281 | | - base = public_path |
282 | | - if not base.endswith("/"): |
283 | | - base = str(Path(base).parent).replace("\\", "/") |
284 | | - if not base.startswith("/"): |
285 | | - base = "/" + base |
286 | | - if base == "/.": |
287 | | - base = "/" |
288 | | - if not base.endswith("/"): |
289 | | - base += "/" |
290 | | - |
291 | | - absolute = urljoin(f"https://example.invalid{base}", value) |
292 | | - parsed = urlparse(absolute) |
293 | | - if parsed.scheme not in ("http", "https"): |
| 279 | + try: |
| 280 | + return Path(value).relative_to(Path(public_path).parent, walk_up=True).as_posix() |
| 281 | + except ValueError: |
| 282 | + print(f"Error normalizing path {value} within page {public_path}") |
294 | 283 | return None |
295 | | - if parsed.netloc != "example.invalid": |
296 | | - return value |
297 | | - |
298 | | - path = parsed.path or "/" |
299 | | - if not path.startswith("/"): |
300 | | - path = "/" + path |
301 | | - |
302 | | - return urlunparse(("", "", path, "", parsed.query, parsed.fragment)) |
303 | 284 |
|
304 | 285 | def _wrap_figures_and_tables(self, soup) -> None: |
305 | 286 | main_content = soup.find(id="main-content") |
@@ -437,7 +418,7 @@ def _linkify_schema_names(self, soup, public_path: str, collector: ListingCollec |
437 | 418 | if start > cursor: |
438 | 419 | replacement.append(text[cursor:start]) |
439 | 420 | raw = match.group(0) |
440 | | - anchor = soup.new_tag("a", href=f"/lexical/{canonical}.htm") |
| 421 | + anchor = soup.new_tag("a", href=f"/lexical/{canonical}.html") |
441 | 422 | anchor.string = raw |
442 | 423 | replacement.append(anchor) |
443 | 424 | collector.add_reference(canonical, public_path) |
@@ -516,7 +497,7 @@ def _render_named_segments(self, token_type, value: str) -> str: |
516 | 497 | canonical = self.canonical_names.get(value.upper()) |
517 | 498 | if canonical is None: |
518 | 499 | return segment |
519 | | - return f'<a href="/lexical/{canonical}.htm">{segment}</a>' |
| 500 | + return f'<a href="/lexical/{canonical}.html">{segment}</a>' |
520 | 501 |
|
521 | 502 | def _iter_schema_matches(self, value: str): |
522 | 503 | for match in self.name_pattern.finditer(value): |
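
For readers unfamiliar with the new API: Path.relative_to() only accepts a walk_up argument on Python 3.12+, which is what the version assert added at the top of the module guards. A minimal sketch of the behavior the rewritten _normalize_url relies on (the paths below are hypothetical stand-ins, not taken from this repo):

    from pathlib import Path

    page = Path("guide/syntax/strings.html")    # stand-in for public_path
    target = Path("lexical/IDENTIFIER.html")    # stand-in for a link value

    # walk_up=True (new in Python 3.12) lets the result climb out of the
    # page's directory with ".." segments instead of raising ValueError
    # for targets that are not subpaths of it.
    print(target.relative_to(page.parent, walk_up=True).as_posix())
    # -> ../../lexical/IDENTIFIER.html

Note that relative_to() still raises ValueError when the two paths mix absolute and relative anchors; that is the case the new except branch reports and skips by returning None.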
|