Skip to content

Commit a228fad

Browse files
committed
Add Unicode Normalization for Search Indexing
- Introduced `html_search_unicode_normalization` configuration option to specify Unicode normalization form (NFC, NFD, NFKC, NFKD) for search indexing. - Updated the HTML builder to pass the normalization configuration to the search indexer. - Modified the `IndexBuilder` and `_feed_visit_nodes` functions to apply the specified Unicode normalization to document text before indexing. - Updated JavaScript search tools to normalize search queries using the specified normalization form. - Added documentation for the new configuration option in `configuration.rst`. - Implemented a test case to verify that full-width characters like 'Ｐｙｔｈｏｎ' are normalized and indexed as 'python'.
1 parent 0215a73 commit a228fad

File tree

9 files changed

+91
-9
lines changed

9 files changed

+91
-9
lines changed

AUTHORS.rst

+1
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,7 @@ Contributors
107107
* Thomas Lamb -- linkcheck builder
108108
* Thomas Waldmann -- apidoc module fixes
109109
* Tim Hoffmann -- theme improvements
110+
* Tokuhiro Matsuno -- search unicode normalization
110111
* Vince Salvino -- JavaScript search improvements
111112
* Will Maier -- directory HTML builder
112113
* Zac Hatfield-Dodds -- doctest reporting improvements, intersphinx performance

CHANGES.rst

+12
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,15 @@
1+
Release 8.3.0 (in development)
2+
==============================
3+
4+
Features added
5+
--------------
6+
7+
* #13384: Add Unicode normalization option for search indexing.
8+
This allows users to specify the type of Unicode normalization
9+
(NFC, NFD, NFKC, NFKD) to apply during searches, improving the
10+
accuracy and reliability of search results.
11+
Patch by Tokuhiro Matsuno.
12+
113
Release 8.2.1 (released Feb 21, 2025)
214
=====================================
315

doc/usage/configuration.rst

+19
Original file line numberDiff line numberDiff line change
@@ -2030,6 +2030,25 @@ and also make use of these options.
20302030

20312031
.. versionadded:: 1.0
20322032

2033+
.. confval:: html_search_unicode_normalization
2034+
:type: :code-py:`str`
2035+
:default: :code-py:`None`
2036+
2037+
The type of Unicode normalization to apply to document text before it
2038+
is added to the search index, and to search queries entered by users.
2039+
It can take one of the following values:
2040+
2041+
* **NFD** -- Decomposes characters into their canonical decomposed form.
2042+
* **NFC** -- Composes characters into their canonical composed form.
2043+
* **NFKD** -- Decomposes characters into their compatibility decomposed form.
2044+
* **NFKC** -- Composes characters into their compatibility composed form.
2045+
2046+
This setting ensures that text is consistently normalized, improving the
2047+
accuracy and reliability of search results by handling different Unicode
2048+
representations of the same characters.
2049+
2050+
.. versionadded:: 8.3
2051+
20332052
.. confval:: html_search_language
20342053
:type: :code-py:`str`
20352054
:default: The value of **language**

sphinx/builders/html/__init__.py

+5
Original file line numberDiff line numberDiff line change
@@ -440,6 +440,7 @@ def prepare_writing(self, docnames: Set[str]) -> None:
440440
lang,
441441
self.config.html_search_options,
442442
self.config.html_search_scorer,
443+
self.config.html_search_unicode_normalization,
443444
)
444445
self.load_indexer(docnames)
445446

@@ -544,6 +545,7 @@ def prepare_writing(self, docnames: Set[str]) -> None:
544545
'has_source': self.config.html_copy_source,
545546
'show_source': self.config.html_show_sourcelink,
546547
'sourcelink_suffix': self.config.html_sourcelink_suffix,
548+
'search_unicode_normalization': self.config.html_search_unicode_normalization,
547549
'file_suffix': self.out_suffix,
548550
'link_suffix': self.link_suffix,
549551
'script_files': self._js_files,
@@ -1490,6 +1492,9 @@ def setup(app: Sphinx) -> ExtensionMetadata:
14901492
app.add_config_value(
14911493
'html_show_search_summary', True, 'html', types=frozenset({bool})
14921494
)
1495+
app.add_config_value(
1496+
'html_search_unicode_normalization', None, 'html', types=frozenset({str})
1497+
)
14931498
app.add_config_value('html_show_sphinx', True, 'html', types=frozenset({bool}))
14941499
app.add_config_value('html_context', {}, 'html', types=frozenset({dict}))
14951500
app.add_config_value(

sphinx/search/__init__.py

+35-9
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import os
1010
import pickle
1111
import re
12+
import unicodedata
1213
from importlib import import_module
1314
from typing import TYPE_CHECKING
1415

@@ -21,7 +22,7 @@
2122

2223
if TYPE_CHECKING:
2324
from collections.abc import Callable, Iterable
24-
from typing import Any, Protocol, TypeVar
25+
from typing import Any, Literal, Protocol, TypeVar
2526

2627
from docutils.nodes import Node
2728

@@ -275,7 +276,12 @@ class IndexBuilder:
275276
}
276277

277278
def __init__(
278-
self, env: BuildEnvironment, lang: str, options: dict[str, str], scoring: str
279+
self,
280+
env: BuildEnvironment,
281+
lang: str,
282+
options: dict[str, str],
283+
scoring: str,
284+
normalization: Literal['NFC', 'NFKC', 'NFD', 'NFKD'] | None = None,
279285
) -> None:
280286
self._domains = env.domains
281287
self._env_version = env.version
@@ -301,6 +307,7 @@ def __init__(
301307
self._objnames: dict[int, tuple[str, str, str]] = env._search_index_objnames
302308
# add language-specific SearchLanguage instance
303309
lang_class = languages.get(lang)
310+
self._unicode_normalization = normalization
304311

305312
# fallback; try again with language-code
306313
if lang_class is None and '_' in lang:
@@ -552,7 +559,11 @@ def _word_collector(self, doctree: nodes.document) -> WordStore:
552559
split = self.lang.split
553560
language = self.lang.lang
554561
_feed_visit_nodes(
555-
doctree, word_store=word_store, split=split, language=language
562+
doctree,
563+
word_store=word_store,
564+
split=split,
565+
language=language,
566+
normalization=self._unicode_normalization,
556567
)
557568
return word_store
558569

@@ -602,7 +613,14 @@ def _feed_visit_nodes(
602613
word_store: WordStore,
603614
split: Callable[[str], list[str]],
604615
language: str,
616+
normalization: Literal['NFC', 'NFKC', 'NFD', 'NFKD'] | None,
605617
) -> None:
618+
def normalize(text: str) -> str:
619+
if normalization:
620+
return unicodedata.normalize(normalization, text)
621+
else:
622+
return text
623+
606624
if isinstance(node, nodes.comment):
607625
return
608626
elif isinstance(node, nodes.Element) and 'no-search' in node['classes']:
@@ -626,18 +644,26 @@ def _feed_visit_nodes(
626644
flags=re.IGNORECASE | re.DOTALL,
627645
)
628646
nodetext = re.sub(r'<[^<]+?>', '', nodetext)
629-
word_store.words.extend(split(nodetext))
647+
word_store.words.extend(split(normalize(nodetext)))
630648
return
631649
elif isinstance(node, nodes.meta) and _is_meta_keywords(node, language):
632-
keywords = [keyword.strip() for keyword in node['content'].split(',')]
650+
keywords = [
651+
normalize(keyword.strip()) for keyword in node['content'].split(',')
652+
]
633653
word_store.words.extend(keywords)
634654
elif isinstance(node, nodes.Text):
635-
word_store.words.extend(split(node.astext()))
655+
word_store.words.extend(split(normalize(node.astext())))
636656
elif isinstance(node, nodes.title):
637657
title, is_main_title = node.astext(), len(word_store.titles) == 0
638658
ids = node.parent['ids']
639659
title_node_id = None if is_main_title else ids[0] if ids else None
640-
word_store.titles.append((title, title_node_id))
641-
word_store.title_words.extend(split(title))
660+
word_store.titles.append((normalize(title), title_node_id))
661+
word_store.title_words.extend(split(normalize(title)))
642662
for child in node.children:
643-
_feed_visit_nodes(child, word_store=word_store, split=split, language=language)
663+
_feed_visit_nodes(
664+
child,
665+
word_store=word_store,
666+
split=split,
667+
language=language,
668+
normalization=normalization,
669+
)

sphinx/themes/basic/static/documentation_options.js.jinja

+1
Original file line numberDiff line numberDiff line change
@@ -10,4 +10,5 @@ const DOCUMENTATION_OPTIONS = {
1010
NAVIGATION_WITH_KEYS: {{ 'true' if theme_navigation_with_keys|tobool else 'false'}},
1111
SHOW_SEARCH_SUMMARY: {{ 'true' if show_search_summary else 'false' }},
1212
ENABLE_SEARCH_SHORTCUTS: {{ 'true' if theme_enable_search_shortcuts|tobool else 'false'}},
13+
SEARCH_UNICODE_NORMALIZATION: {{ '"' + search_unicode_normalization + '"' if search_unicode_normalization else 'null' }},
1314
};

sphinx/themes/basic/static/searchtools.js

+3
Original file line numberDiff line numberDiff line change
@@ -413,6 +413,9 @@ const Search = {
413413
},
414414

415415
query: (query) => {
416+
if (DOCUMENTATION_OPTIONS.SEARCH_UNICODE_NORMALIZATION) {
417+
query = query.normalize(DOCUMENTATION_OPTIONS.SEARCH_UNICODE_NORMALIZATION);
418+
}
416419
const [searchQuery, searchTerms, excludedTerms, highlightTerms, objectTerms] = Search._parseQuery(query);
417420
const results = Search._performSearch(searchQuery, searchTerms, excludedTerms, highlightTerms, objectTerms);
418421

tests/roots/test-search/tocitem.rst

+2
Original file line numberDiff line numberDiff line change
@@ -15,3 +15,5 @@ lorem ipsum
1515
模块中 CAS service部分
1616

1717
可以Chinesetesttwo查看
18+
19+
Ｐｙｔｈｏｎ

tests/test_search.py

+13
Original file line numberDiff line numberDiff line change
@@ -484,3 +484,16 @@ def test_check_js_search_indexes(make_app, sphinx_test_tempdir, directory):
484484
f'Search index fixture {existing_searchindex} does not match regenerated copy.'
485485
)
486486
assert fresh_searchindex.read_bytes() == existing_searchindex.read_bytes(), msg
487+
488+
489+
@pytest.mark.sphinx(
490+
'html',
491+
testroot='search',
492+
confoverrides={'html_search_unicode_normalization': 'NFKC'},
493+
srcdir='search_normalize',
494+
)
495+
def test_search_index_unicode_normalize(app: SphinxTestApp) -> None:
496+
app.build(force_all=True)
497+
index = load_searchindex(app.outdir / 'searchindex.js')
498+
assert 'Ｐｙｔｈｏｎ' not in index['terms']
499+
assert 'python' in index['terms']

0 commit comments

Comments
 (0)