Skip to content

Commit c142fa0

Browse files
Fix seasearch key error (#612)
* Update repo_file_index.py * Update repo_file_index.py * add-notation * Update documentation for gse_standard_analyzer --------- Co-authored-by: Daniel Pan <freeplant@gmail.com>
1 parent ad83043 commit c142fa0

File tree

1 file changed

+7
-6
lines changed

1 file changed

+7
-6
lines changed

seasearch/index_store/repo_file_index.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,12 @@ class RepoFileIndex(object):
6060
}
6161
}
6262

63+
"""
64+
gse_standard_analyzer is primarily intended for Chinese text tokenization, but it also tokenizes English, German, French, and other languages:
65+
1. It splits English sentences on spaces and punctuation
66+
2. Example: The sentence "Hello world" will be tokenized into ["hello", "world"]
67+
3. This meets the retrieval needs for English content (e.g., searching for "hello" matches text containing "hello")
68+
"""
6369
index_settings = {
6470
'analysis': {
6571
'analyzer': {
@@ -106,7 +112,7 @@ def __init__(self, seasearch_api, repo_data, shard_num, config):
106112
self.text_size_limit = 1 * 1024 * 1024 # 1M
107113
self.office_file_size_limit = 10 * 1024 * 1024 # 10M
108114
self.index_office_pdf = False
109-
self.lang = 'chinese'
115+
110116
self.config = config
111117

112118
self._parse_config()
@@ -119,13 +125,8 @@ def _parse_config(self):
119125

120126
index_office_pdf = get_opt_from_conf_or_env(self.config, section_name, 'index_office_pdf', default=False)
121127
self.index_office_pdf = parse_bool(index_office_pdf)
122-
self.lang = get_opt_from_conf_or_env(self.config, section_name, 'lang', default='chinese')
123128

124129
def create_index_if_missing(self, index_name):
125-
if self.lang != 'chinese':
126-
self.mapping['properties']['content']['analyzer'] = 'standard'
127-
self.index_settings['analysis'].pop('char_filter', None)
128-
self.index_settings['analysis']['analyzer'].pop('gse_standard_analyzer', None)
129130
if not self.seasearch_api.check_index_mapping(index_name).get('is_exist'):
130131
data = {
131132
'shard_num': self.shard_num,

0 commit comments

Comments
 (0)