@@ -60,6 +60,12 @@ class RepoFileIndex(object):
6060 }
6161 }
6262
63+ """
64+ gse_standard_analyzer primarily supports Chinese text tokenization, but it also tokenizes English/German/French and other languages:
65+ 1. It splits English sentences on spaces and punctuation
66+ 2. Example: the sentence "Hello world" is tokenized into ["hello", "world"]
67+ 3. This satisfies retrieval for English content (e.g., searching "hello" matches text containing "hello")
68+ """
6369 index_settings = {
6470 'analysis' : {
6571 'analyzer' : {
@@ -106,7 +112,7 @@ def __init__(self, seasearch_api, repo_data, shard_num, config):
106112 self .text_size_limit = 1 * 1024 * 1024 # 1M
107113 self .office_file_size_limit = 10 * 1024 * 1024 # 10M
108114 self .index_office_pdf = False
109- self . lang = 'chinese'
115+
110116 self .config = config
111117
112118 self ._parse_config ()
@@ -119,13 +125,8 @@ def _parse_config(self):
119125
120126 index_office_pdf = get_opt_from_conf_or_env (self .config , section_name , 'index_office_pdf' , default = False )
121127 self .index_office_pdf = parse_bool (index_office_pdf )
122- self .lang = get_opt_from_conf_or_env (self .config , section_name , 'lang' , default = 'chinese' )
123128
124129 def create_index_if_missing (self , index_name ):
125- if self .lang != 'chinese' :
126- self .mapping ['properties' ]['content' ]['analyzer' ] = 'standard'
127- self .index_settings ['analysis' ].pop ('char_filter' , None )
128- self .index_settings ['analysis' ]['analyzer' ].pop ('gse_standard_analyzer' , None )
129130 if not self .seasearch_api .check_index_mapping (index_name ).get ('is_exist' ):
130131 data = {
131132 'shard_num' : self .shard_num ,
0 commit comments