Skip to content

Commit a228fad

Browse files
committed
Add Unicode Normalization for Search Indexing
- Introduced `html_search_unicode_normalization` configuration option to specify Unicode normalization form (NFC, NFD, NFKC, NFKD) for search indexing. - Updated the HTML builder to pass the normalization configuration to the search indexer. - Modified the `IndexBuilder` and `_feed_visit_nodes` functions to apply the specified Unicode normalization to document text before indexing. - Updated JavaScript search tools to normalize search queries using the specified normalization form. - Added documentation for the new configuration option in `configuration.rst`. - Implemented a test case to verify that full-width characters like 'Ｐｙｔｈｏｎ' are normalized and indexed as 'python'.
1 parent 0215a73 commit a228fad

File tree

9 files changed

+91
-9
lines changed

9 files changed

+91
-9
lines changed

AUTHORS.rst

+1
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,7 @@ Contributors
107107
* Thomas Lamb -- linkcheck builder
108108
* Thomas Waldmann -- apidoc module fixes
109109
* Tim Hoffmann -- theme improvements
110+
* Tokuhiro Matsuno -- search unicode normalization
110111
* Vince Salvino -- JavaScript search improvements
111112
* Will Maier -- directory HTML builder
112113
* Zac Hatfield-Dodds -- doctest reporting improvements, intersphinx performance

CHANGES.rst

+12
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,15 @@
1+
Release 8.3.0 (in development)
2+
==============================
3+
4+
Features added
5+
--------------
6+
7+
* #13384: Add Unicode normalization option for search indexing.
8+
This allows users to specify the type of Unicode normalization
9+
(NFC, NFD, NFKC, NFKD) to apply during searches, improving the
10+
accuracy and reliability of search results.
11+
Patch by Tokuhiro Matsuno.
12+
113
Release 8.2.1 (released Feb 21, 2025)
214
=====================================
315

doc/usage/configuration.rst

+19
Original file line numberDiff line numberDiff line change
@@ -2030,6 +2030,25 @@ and also make use of these options.
20302030

20312031
.. versionadded:: 1.0
20322032

2033+
.. confval:: html_search_unicode_normalization
2034+
:type: :code-py:`str`
2035+
:default: :code-py:`None`
2036+
2037+
The type of Unicode normalization to apply to document text before it
2038+
is added to the search index, and to search queries entered by users.
2039+
It can take one of the following values:
2040+
2041+
* **NFD** -- Decomposes characters into their canonical decomposed form.
2042+
* **NFC** -- Composes characters into their canonical composed form.
2043+
* **NFKD** -- Decomposes characters into their compatibility decomposed form.
2044+
* **NFKC** -- Composes characters into their compatibility composed form.
2045+
2046+
This setting ensures that text is consistently normalized, improving the
2047+
accuracy and reliability of search results by handling different Unicode
2048+
representations of the same characters.
2049+
2050+
.. versionadded:: 8.3
2051+
20332052
.. confval:: html_search_language
20342053
:type: :code-py:`str`
20352054
:default: The value of **language**

sphinx/builders/html/__init__.py

+5
Original file line numberDiff line numberDiff line change
@@ -440,6 +440,7 @@ def prepare_writing(self, docnames: Set[str]) -> None:
440440
lang,
441441
self.config.html_search_options,
442442
self.config.html_search_scorer,
443+
self.config.html_search_unicode_normalization,
443444
)
444445
self.load_indexer(docnames)
445446

@@ -544,6 +545,7 @@ def prepare_writing(self, docnames: Set[str]) -> None:
544545
'has_source': self.config.html_copy_source,
545546
'show_source': self.config.html_show_sourcelink,
546547
'sourcelink_suffix': self.config.html_sourcelink_suffix,
548+
'search_unicode_normalization': self.config.html_search_unicode_normalization,
547549
'file_suffix': self.out_suffix,
548550
'link_suffix': self.link_suffix,
549551
'script_files': self._js_files,
@@ -1490,6 +1492,9 @@ def setup(app: Sphinx) -> ExtensionMetadata:
14901492
app.add_config_value(
14911493
'html_show_search_summary', True, 'html', types=frozenset({bool})
14921494
)
1495+
app.add_config_value(
1496+
'html_search_unicode_normalization', None, 'html', types=frozenset({str})
1497+
)
14931498
app.add_config_value('html_show_sphinx', True, 'html', types=frozenset({bool}))
14941499
app.add_config_value('html_context', {}, 'html', types=frozenset({dict}))
14951500
app.add_config_value(

sphinx/search/__init__.py

+35-9
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import os
1010
import pickle
1111
import re
12+
import unicodedata
1213
from importlib import import_module
1314
from typing import TYPE_CHECKING
1415

@@ -21,7 +22,7 @@
2122

2223
if TYPE_CHECKING:
2324
from collections.abc import Callable, Iterable
24-
from typing import Any, Protocol, TypeVar
25+
from typing import Any, Literal, Protocol, TypeVar
2526

2627
from docutils.nodes import Node
2728

@@ -275,7 +276,12 @@ class IndexBuilder:
275276
}
276277

277278
def __init__(
278-
self, env: BuildEnvironment, lang: str, options: dict[str, str], scoring: str
279+
self,
280+
env: BuildEnvironment,
281+
lang: str,
282+
options: dict[str, str],
283+
scoring: str,
284+
normalization: Literal['NFC', 'NFKC', 'NFD', 'NFKD'] | None = None,
279285
) -> None:
280286
self._domains = env.domains
281287
self._env_version = env.version
@@ -301,6 +307,7 @@ def __init__(
301307
self._objnames: dict[int, tuple[str, str, str]] = env._search_index_objnames
302308
# add language-specific SearchLanguage instance
303309
lang_class = languages.get(lang)
310+
self._unicode_normalization = normalization
304311

305312
# fallback; try again with language-code
306313
if lang_class is None and '_' in lang:
@@ -552,7 +559,11 @@ def _word_collector(self, doctree: nodes.document) -> WordStore:
552559
split = self.lang.split
553560
language = self.lang.lang
554561
_feed_visit_nodes(
555-
doctree, word_store=word_store, split=split, language=language
562+
doctree,
563+
word_store=word_store,
564+
split=split,
565+
language=language,
566+
normalization=self._unicode_normalization,
556567
)
557568
return word_store
558569

@@ -602,7 +613,14 @@ def _feed_visit_nodes(
602613
word_store: WordStore,
603614
split: Callable[[str], list[str]],
604615
language: str,
616+
normalization: Literal['NFC', 'NFKC', 'NFD', 'NFKD'] | None,
605617
) -> None:
618+
def normalize(text: str) -> str:
619+
if normalization:
620+
return unicodedata.normalize(normalization, text)
621+
else:
622+
return text
623+
606624
if isinstance(node, nodes.comment):
607625
return
608626
elif isinstance(node, nodes.Element) and 'no-search' in node['classes']:
@@ -626,18 +644,26 @@ def _feed_visit_nodes(
626644
flags=re.IGNORECASE | re.DOTALL,
627645
)
628646
nodetext = re.sub(r'<[^<]+?>', '', nodetext)
629-
word_store.words.extend(split(nodetext))
647+
word_store.words.extend(split(normalize(nodetext)))
630648
return
631649
elif isinstance(node, nodes.meta) and _is_meta_keywords(node, language):
632-
keywords = [keyword.strip() for keyword in node['content'].split(',')]
650+
keywords = [
651+
normalize(keyword.strip()) for keyword in node['content'].split(',')
652+
]
633653
word_store.words.extend(keywords)
634654
elif isinstance(node, nodes.Text):
635-
word_store.words.extend(split(node.astext()))
655+
word_store.words.extend(split(normalize(node.astext())))
636656
elif isinstance(node, nodes.title):
637657
title, is_main_title = node.astext(), len(word_store.titles) == 0
638658
ids = node.parent['ids']
639659
title_node_id = None if is_main_title else ids[0] if ids else None
640-
word_store.titles.append((title, title_node_id))
641-
word_store.title_words.extend(split(title))
660+
word_store.titles.append((normalize(title), title_node_id))
661+
word_store.title_words.extend(split(normalize(title)))
642662
for child in node.children:
643-
_feed_visit_nodes(child, word_store=word_store, split=split, language=language)
663+
_feed_visit_nodes(
664+
child,
665+
word_store=word_store,
666+
split=split,
667+
language=language,
668+
normalization=normalization,
669+
)

sphinx/themes/basic/static/documentation_options.js.jinja

+1
Original file line numberDiff line numberDiff line change
@@ -10,4 +10,5 @@ const DOCUMENTATION_OPTIONS = {
1010
NAVIGATION_WITH_KEYS: {{ 'true' if theme_navigation_with_keys|tobool else 'false'}},
1111
SHOW_SEARCH_SUMMARY: {{ 'true' if show_search_summary else 'false' }},
1212
ENABLE_SEARCH_SHORTCUTS: {{ 'true' if theme_enable_search_shortcuts|tobool else 'false'}},
13+
SEARCH_UNICODE_NORMALIZATION: {{ '"' + search_unicode_normalization + '"' if search_unicode_normalization else 'null' }},
1314
};

sphinx/themes/basic/static/searchtools.js

+3
Original file line numberDiff line numberDiff line change
@@ -413,6 +413,9 @@ const Search = {
413413
},
414414

415415
query: (query) => {
416+
if (DOCUMENTATION_OPTIONS.SEARCH_UNICODE_NORMALIZATION) {
417+
query = query.normalize(DOCUMENTATION_OPTIONS.SEARCH_UNICODE_NORMALIZATION);
418+
}
416419
const [searchQuery, searchTerms, excludedTerms, highlightTerms, objectTerms] = Search._parseQuery(query);
417420
const results = Search._performSearch(searchQuery, searchTerms, excludedTerms, highlightTerms, objectTerms);
418421

tests/roots/test-search/tocitem.rst

+2
Original file line numberDiff line numberDiff line change
@@ -15,3 +15,5 @@ lorem ipsum
1515
模块中 CAS service部分
1616

1717
可以Chinesetesttwo查看
18+
19+
Ｐｙｔｈｏｎ

tests/test_search.py

+13
Original file line numberDiff line numberDiff line change
@@ -484,3 +484,16 @@ def test_check_js_search_indexes(make_app, sphinx_test_tempdir, directory):
484484
f'Search index fixture {existing_searchindex} does not match regenerated copy.'
485485
)
486486
assert fresh_searchindex.read_bytes() == existing_searchindex.read_bytes(), msg
487+
488+
489+
@pytest.mark.sphinx(
490+
'html',
491+
testroot='search',
492+
confoverrides={'html_search_unicode_normalization': 'NFKC'},
493+
srcdir='search_normalize',
494+
)
495+
def test_search_index_unicode_normalize(app: SphinxTestApp) -> None:
496+
app.build(force_all=True)
497+
index = load_searchindex(app.outdir / 'searchindex.js')
498+
assert 'Ｐｙｔｈｏｎ' not in index['terms']
499+
assert 'python' in index['terms']

0 commit comments

Comments
 (0)