@@ -60,6 +60,12 @@ class RepoFileIndex(object):
6060 }
6161 }
6262
63+ """
64+ gse_standard_analyzer primarily supports Chinese text tokenization, but it also tokenizes English/German/French and other languages:
65+ 1. It splits English sentences on spaces and punctuation
66+ 2. Example: the sentence "Hello world" is tokenized into ["hello", "world"]
67+ 3. This satisfies retrieval for English content (e.g., searching "hello" matches text containing "hello")
68+ """
6369 index_settings = {
6470 'analysis' : {
6571 'analyzer' : {
@@ -106,7 +112,7 @@ def __init__(self, seasearch_api, repo_data, shard_num, config):
106112 self .text_size_limit = 1 * 1024 * 1024 # 1M
107113 self .office_file_size_limit = 10 * 1024 * 1024 # 10M
108114 self .index_office_pdf = False
109- self . lang = 'chinese'
115+
110116 self .config = config
111117
112118 self ._parse_config ()
@@ -119,13 +125,8 @@ def _parse_config(self):
119125
120126 index_office_pdf = get_opt_from_conf_or_env (self .config , section_name , 'index_office_pdf' , default = False )
121127 self .index_office_pdf = parse_bool (index_office_pdf )
122- self .lang = get_opt_from_conf_or_env (self .config , section_name , 'lang' , default = 'chinese' )
123128
124129 def create_index_if_missing (self , index_name ):
125- if self .lang != 'chinese' :
126- self .mapping ['properties' ]['content' ]['analyzer' ] = 'standard'
127- self .index_settings ['analysis' ].pop ('char_filter' , None )
128- self .index_settings ['analysis' ]['analyzer' ].pop ('gse_standard_analyzer' , None )
129130 if not self .seasearch_api .check_index_mapping (index_name ).get ('is_exist' ):
130131 data = {
131132 'shard_num' : self .shard_num ,
0 commit comments