Skip to content

Commit 406b74e

Browse files
authored
fix: add opt-in ASCII slug generation (#97)
1 parent 6f59c9f commit 406b74e

10 files changed

Lines changed: 177 additions & 19 deletions

File tree

docs/Configuration.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ Configuration lives in `rockgarden.toml` at the site root.
2121
| `url_style` | `str` | `"slug"` | How filenames become URLs. `"slug"` (lowercase, dashes), `"preserve-case"` (original casing, dashes), or `"preserve"` (original casing and spacing). See URL Styles below. |
2222
| `base_url` | `str` | `""` | Full base URL (e.g. `https://example.com/docs`). Used for feeds, sitemap, and deriving `base_path` when not set explicitly. Trailing slash is stripped automatically. |
2323
| `base_path` | `str` | `""` | URL path prefix for subdirectory deploys (e.g. `/docs`). When set, all generated URLs are prefixed with this path. If not set, derived from `base_url`. Trailing slash is stripped automatically. |
24+
| `ascii_urls` | `bool` | `false` | Transliterate Unicode characters to ASCII in URLs and tag slugs. `Café` → `cafe`, `東京` → `dong-jing`. Applies to all URL styles. |
2425

2526
### URL Styles
2627

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ dependencies = [
3030
"pyyaml>=6.0",
3131
"markdownify>=0.14.1",
3232
"pygments>=2.17.0",
33+
"python-slugify>=8.0.0",
3334
"watchfiles>=1.0.0",
3435
]
3536

src/rockgarden/config.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ class SiteConfig(BaseModel):
1818
base_url: str = ""
1919
base_path: str = ""
2020
url_style: str = "slug"
21+
ascii_urls: bool = False
2122

2223
@field_validator("url_style", mode="after")
2324
@classmethod

src/rockgarden/content/loader.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -57,26 +57,33 @@ def should_ignore(path: Path, source: Path, patterns: list[str]) -> bool:
5757
return False
5858

5959

60-
def path_to_slug(path: Path, source: Path, url_style: str = "slug") -> str:
60+
def path_to_slug(
61+
path: Path,
62+
source: Path,
63+
url_style: str = "slug",
64+
ascii_urls: bool = False,
65+
) -> str:
6166
"""Convert a file path to a URL slug.
6267
6368
Args:
6469
path: The file path.
6570
source: The source root directory.
6671
url_style: URL style ("slug", "preserve-case", or "preserve").
72+
ascii_urls: When True, transliterate Unicode to ASCII.
6773
6874
Returns:
6975
The slug (e.g., 'index', 'npcs/olvir').
7076
"""
7177
rel_path = str(path.relative_to(source))
72-
return generate_slug(rel_path, style=url_style)
78+
return generate_slug(rel_path, style=url_style, ascii_urls=ascii_urls)
7379

7480

7581
def load_page(
7682
path: Path,
7783
source: Path,
7884
dates_config: DatesConfig | None = None,
7985
url_style: str = "slug",
86+
ascii_urls: bool = False,
8087
) -> Page:
8188
"""Load a single page from a markdown file.
8289
@@ -97,7 +104,7 @@ def load_page(
97104
if custom_slug := metadata.get("slug"):
98105
slug = custom_slug
99106
else:
100-
slug = path_to_slug(path, source, url_style)
107+
slug = path_to_slug(path, source, url_style, ascii_urls)
101108

102109
modified = _resolve_frontmatter_date(metadata, dates_config.modified_date_fields)
103110
if modified is None and dates_config.modified_date_fallback:
@@ -120,6 +127,7 @@ def load_content(
120127
ignore_patterns: list[str],
121128
dates_config: DatesConfig | None = None,
122129
url_style: str = "slug",
130+
ascii_urls: bool = False,
123131
) -> list[Page]:
124132
"""Discover and load all markdown files from source directory.
125133
@@ -137,7 +145,7 @@ def load_content(
137145
if should_ignore(path, source, ignore_patterns):
138146
continue
139147

140-
page = load_page(path, source, dates_config, url_style)
148+
page = load_page(path, source, dates_config, url_style, ascii_urls)
141149
pages.append(page)
142150

143151
return pages

src/rockgarden/output/builder.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -420,7 +420,11 @@ def build_site(
420420
run_hooks(config.hooks.pre_build, "pre_build", cwd=site_root, env_vars=hook_env)
421421

422422
pages = load_content(
423-
source, config.build.ignore_patterns, config.dates, config.site.url_style
423+
source,
424+
config.build.ignore_patterns,
425+
config.dates,
426+
config.site.url_style,
427+
config.site.ascii_urls,
424428
)
425429
clean_urls = config.site.clean_urls
426430
base_path = config.site.base_path or get_base_path(config.site.base_url)
@@ -789,11 +793,18 @@ def build_site(
789793

790794
# Generate tag index pages if enabled
791795
if config.theme.tag_index:
792-
tags = collect_tags(pages)
796+
tags = collect_tags(pages, config.site.ascii_urls)
793797
if tags:
794798
tag_layout = resolve_layout({}, config.theme.default_layout)
795799
build_tag_pages(
796-
tags, env, site_config, output, clean_urls, base_path, tag_layout
800+
tags,
801+
env,
802+
site_config,
803+
output,
804+
clean_urls,
805+
base_path,
806+
tag_layout,
807+
config.site.ascii_urls,
797808
)
798809

799810
# Generate sitemap if base_url is configured

src/rockgarden/output/tags.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from rockgarden.urls import get_url, normalize_tag
99

1010

11-
def collect_tags(pages: list[Page]) -> dict[str, list[Page]]:
11+
def collect_tags(pages: list[Page], ascii_urls: bool = False) -> dict[str, list[Page]]:
1212
"""Return a mapping of normalized tag slug → list of pages with that tag.
1313
1414
Pages are included in the order they appear in the input list. Tags with
@@ -20,7 +20,7 @@ def collect_tags(pages: list[Page]) -> dict[str, list[Page]]:
2020
if isinstance(raw_tags, str):
2121
raw_tags = [raw_tags]
2222
for tag in raw_tags:
23-
slug = normalize_tag(tag)
23+
slug = normalize_tag(tag, ascii_urls)
2424
if slug:
2525
tags.setdefault(slug, []).append(page)
2626
return dict(sorted(tags.items()))
@@ -34,6 +34,7 @@ def build_tag_pages(
3434
clean_urls: bool = True,
3535
base_path: str = "",
3636
layout_template: str = "layouts/default.html",
37+
ascii_urls: bool = False,
3738
) -> None:
3839
"""Generate /tags/<slug>/ and /tags/ pages in the output directory."""
3940
tag_index_template = env.get_template("tag_index.html")
@@ -47,7 +48,11 @@ def _page_entry(p: Page) -> dict:
4748
"title": p.title,
4849
"subtitle": p.frontmatter.get("subtitle", ""),
4950
"url": get_url(p.slug, clean_urls, base_path),
50-
"tags": [normalize_tag(t) for t in raw_tags if normalize_tag(t)],
51+
"tags": [
52+
normalize_tag(t, ascii_urls)
53+
for t in raw_tags
54+
if normalize_tag(t, ascii_urls)
55+
],
5156
}
5257

5358
def _sorted_entries(pages: list[Page]) -> list[dict]:

src/rockgarden/render/engine.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,8 @@ def create_engine(
8585
)
8686
env.filters["format_datetime"] = _make_format_datetime(config.dates.timezone)
8787
clean_urls = config.site.clean_urls
88-
env.globals["normalize_tag"] = normalize_tag
88+
ascii_urls = config.site.ascii_urls
89+
env.globals["normalize_tag"] = lambda tag: normalize_tag(tag, ascii_urls=ascii_urls)
8990
env.globals["tag_url"] = lambda slug: get_tag_url(slug, clean_urls, base_path)
9091
env.globals["tags_root_url"] = get_tags_root_url(clean_urls, base_path)
9192
env.globals["icon"] = _icon

src/rockgarden/urls.py

Lines changed: 32 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
import re
44
from urllib.parse import quote, urlparse
55

6+
from slugify import slugify as _ascii_slugify
7+
68

79
def slugify_heading(text: str) -> str:
810
"""Convert heading text to a URL-friendly slug.
@@ -16,18 +18,20 @@ def slugify_heading(text: str) -> str:
1618
return text
1719

1820

19-
def normalize_tag(tag: str) -> str:
21+
def normalize_tag(tag: str, ascii_urls: bool = False) -> str:
2022
"""Normalize a tag to a URL-safe slug.
2123
22-
Strips leading '#', lowercases, and replaces any character that is not
23-
alphanumeric, hyphen, or underscore with a hyphen. Prevents path traversal
24-
via tags containing '/' or '..'.
24+
Strips leading '#', lowercases, and replaces non-word characters with
25+
hyphens. When *ascii_urls* is True, transliterates Unicode to ASCII.
2526
2627
Tags 'Python', '#python', and 'python' all normalize to 'python'.
2728
Obsidian nested tags like 'character/pc' normalize to 'character-pc'.
2829
"""
29-
slug = tag.lstrip("#").lower()
30-
slug = re.sub(r"[^a-z0-9_-]", "-", slug)
30+
slug = tag.lstrip("#")
31+
if ascii_urls:
32+
return _ascii_slugify(slug, allow_unicode=False)
33+
slug = slug.lower()
34+
slug = re.sub(r"[^\w-]", "-", slug)
3135
slug = re.sub(r"-+", "-", slug)
3236
return slug.strip("-")
3337

@@ -46,25 +50,45 @@ def get_base_path(base_url: str) -> str:
4650
return path
4751

4852

49-
def generate_slug(relative_path: str, style: str = "slug") -> str:
53+
def generate_slug(
54+
relative_path: str, style: str = "slug", ascii_urls: bool = False
55+
) -> str:
5056
"""Generate a slug from a relative file path.
5157
5258
Args:
5359
relative_path: Relative path from source root (e.g., "NPCs/Olvir.md").
5460
style: "slug" (lowercase, dashes), "preserve-case" (original casing,
5561
dashes), or "preserve" (original casing and spacing).
62+
ascii_urls: When True, transliterate Unicode to ASCII.
5663
5764
Returns:
5865
Slug string. For slug: "npcs/olvir-the-wise".
5966
For preserve-case: "NPCs/Olvir-the-Wise".
6067
For preserve: "NPCs/Olvir the Wise".
6168
"""
6269
slug = re.sub(r"\.md$", "", relative_path, flags=re.IGNORECASE)
70+
71+
if ascii_urls:
72+
segments = slug.split("/")
73+
if style == "slug":
74+
segments = [_ascii_slugify(s, allow_unicode=False) for s in segments]
75+
elif style == "preserve":
76+
segments = [
77+
_ascii_slugify(s, allow_unicode=False, lowercase=False, separator=" ")
78+
for s in segments
79+
]
80+
else:
81+
segments = [
82+
_ascii_slugify(s, allow_unicode=False, lowercase=False)
83+
for s in segments
84+
]
85+
slug = "/".join(segments)
86+
6387
if style == "preserve":
6488
return slug
6589
slug = re.sub(r"[ _]+", "-", slug)
6690
slug = re.sub(r"-+", "-", slug)
67-
if style == "slug":
91+
if style == "slug" and not ascii_urls:
6892
slug = slug.lower()
6993
return slug
7094

tests/test_urls.py

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -406,3 +406,86 @@ def test_preserve_case_build_output(self, tmp_path):
406406

407407
html = (output / "index.html").read_text()
408408
assert "/Getting-Started/" in html
409+
410+
411+
class TestGenerateSlugAscii:
412+
def test_accented_latin(self):
413+
assert generate_slug("Café.md", ascii_urls=True) == "cafe"
414+
415+
def test_cjk(self):
416+
assert generate_slug("東京.md", ascii_urls=True) == "dong-jing"
417+
418+
def test_cyrillic(self):
419+
assert generate_slug("Москва.md", ascii_urls=True) == "moskva"
420+
421+
def test_nested_path(self):
422+
assert generate_slug("Cities/Zürich.md", ascii_urls=True) == "cities/zurich"
423+
424+
def test_plain_ascii_unaffected(self):
425+
assert generate_slug("about.md", ascii_urls=True) == "about"
426+
427+
def test_preserve_case_with_ascii(self):
428+
result = generate_slug("Café.md", style="preserve-case", ascii_urls=True)
429+
assert result == "Cafe"
430+
431+
def test_preserve_with_ascii(self):
432+
result = generate_slug("Café Latte.md", style="preserve", ascii_urls=True)
433+
assert result == "Cafe Latte"
434+
435+
def test_ascii_false_preserves_unicode(self):
436+
assert generate_slug("Café.md", ascii_urls=False) == "café"
437+
438+
439+
class TestNormalizeTagUnicode:
440+
"""Tests for the bug fix: Unicode preserved when ascii_urls=False."""
441+
442+
def test_accented_preserved(self):
443+
from rockgarden.urls import normalize_tag
444+
445+
assert normalize_tag("#Café") == "café"
446+
447+
def test_cjk_preserved(self):
448+
from rockgarden.urls import normalize_tag
449+
450+
result = normalize_tag("#東京")
451+
assert result == "東京"
452+
453+
def test_basic_tag_unchanged(self):
454+
from rockgarden.urls import normalize_tag
455+
456+
assert normalize_tag("#python") == "python"
457+
458+
def test_nested_tag(self):
459+
from rockgarden.urls import normalize_tag
460+
461+
assert normalize_tag("character/pc") == "character-pc"
462+
463+
464+
class TestNormalizeTagAscii:
465+
def test_accented_transliterated(self):
466+
from rockgarden.urls import normalize_tag
467+
468+
assert normalize_tag("#Café", ascii_urls=True) == "cafe"
469+
470+
def test_cjk_transliterated(self):
471+
from rockgarden.urls import normalize_tag
472+
473+
assert normalize_tag("#東京", ascii_urls=True) == "dong-jing"
474+
475+
def test_cyrillic_transliterated(self):
476+
from rockgarden.urls import normalize_tag
477+
478+
assert normalize_tag("#Москва", ascii_urls=True) == "moskva"
479+
480+
def test_basic_tag_unchanged(self):
481+
from rockgarden.urls import normalize_tag
482+
483+
assert normalize_tag("#python", ascii_urls=True) == "python"
484+
485+
486+
class TestAsciiUrlsConfig:
487+
def test_default_false(self):
488+
assert SiteConfig().ascii_urls is False
489+
490+
def test_set_true(self):
491+
assert SiteConfig(ascii_urls=True).ascii_urls is True

uv.lock

Lines changed: 23 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)