Skip to content

Commit 0e514a2

Browse files
Shoptimizer Team and starmandeluxe
authored and committed
Internal change
PiperOrigin-RevId: 435239325
1 parent 83c881b commit 0e514a2

File tree

3 files changed

+25
-18
lines changed

3 files changed

+25
-18
lines changed

shoptimizer_api/optimizers_builtin/title_word_order_optimizer.py

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@
3333
import enum
3434
import logging
3535
import re
36-
from typing import Any, Callable, Dict, List, Optional, Tuple
36+
from typing import Any, Callable, Dict, List, Optional, Pattern, Tuple
3737
from util import promo_text_remover as promo_text_remover_lib
3838
from flask import current_app
3939

@@ -62,7 +62,8 @@
6262
TITLE_WORD_ORDER_CONFIG = None
6363
BLOCKLIST_CONFIG = None
6464
TITLE_WORD_ORDER_OPTIONS_CONFIG = None
65-
CUSTOM_TEXT_TOKENIZER: Callable[[str, str, Dict[str, str]], List[str]] = None
65+
CUSTOM_TEXT_TOKENIZER: Callable[[str, str, Dict[Pattern[str], str]],
66+
List[str]] = None
6667

6768

6869
def _get_required_configs():
@@ -179,6 +180,12 @@ def _optimize(
179180

180181
optimization_level = self._get_optimization_level()
181182

183+
title_word_order_dictionary = title_word_order_config.get(
184+
_PHRASE_DICTIONARY_CONFIG_KEY, {})
185+
186+
regex_dictionary_terms = regex_util.generate_regex_term_dict(
187+
title_word_order_dictionary)
188+
182189
for entry in product_batch['entries']:
183190

184191
if optimization_util.optimization_exclusion_specified(
@@ -212,8 +219,6 @@ def _optimize(
212219
keyword_weights_mapping = title_word_order_config.get(
213220
_KEYWORD_WEIGHTS_MAPPING_CONFIG_KEY, {})
214221
keywords_for_gpc = keyword_weights_mapping.get(str(gpc_id), [])
215-
title_word_order_dictionary = title_word_order_config.get(
216-
_PHRASE_DICTIONARY_CONFIG_KEY, {})
217222

218223
allowed_keywords_for_gpc = _remove_keywords_in_blocklist(
219224
keywords_for_gpc, keyword_blocklist)
@@ -223,8 +228,7 @@ def _optimize(
223228
allowed_keywords_for_gpc)
224229

225230
title_to_process = original_title
226-
regex_dictionary_terms = regex_util.generate_regex_term_dict(
227-
title_word_order_dictionary)
231+
228232
title_words = _tokenize_text(title_to_process, language,
229233
regex_dictionary_terms)
230234
description_words = _tokenize_text(
@@ -383,8 +387,9 @@ def _remove_keywords_in_blocklist(
383387
return allowed_keywords
384388

385389

386-
def _tokenize_text(text: str, language: str,
387-
regex_dictionary_terms: Dict[str, str]) -> List[str]:
390+
def _tokenize_text(
391+
text: str, language: str, regex_dictionary_terms: Dict[Pattern[str],
392+
str]) -> List[str]:
388393
"""Splits text into individual words using the correct method for the given language.
389394
390395
Args:
@@ -405,7 +410,7 @@ def _tokenize_text(text: str, language: str,
405410

406411

407412
def _split_words_in_japanese(
408-
text: str, regex_dictionary_terms: Dict[str, str]) -> List[str]:
413+
text: str, regex_dictionary_terms: Dict[Pattern[str], str]) -> List[str]:
409414
"""Splits Japanese text into words by using MeCab.
410415
411416
If a group of words in the text match a regex in the
@@ -437,7 +442,7 @@ def _split_words_in_japanese(
437442

438443

439444
def _split_words_in_western_languages(
440-
text: str, regex_dictionary_terms: Dict[str, str]) -> List[str]:
445+
text: str, regex_dictionary_terms: Dict[Pattern[str], str]) -> List[str]:
441446
"""Splits western text into words.
442447
443448
If a group of words in the text match a regex in the

shoptimizer_api/util/regex_util.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,8 @@
2424
By using regex, we can check for matches without having to transform every title
2525
with strip().
2626
"""
27-
28-
from typing import Dict, List
27+
import re
28+
from typing import Dict, List, Pattern
2929

3030
# Ignores 0+ whitespace or full-width space characters.
3131
_WHITESPACE_REGEX = '(\s| )*'
@@ -58,10 +58,10 @@ def convert_to_regex_str_that_ignores_spaces(term: str) -> str:
5858
# Converts the list of chars back to a string and removes last regex.
5959
regex_term = ''.join(regex_term)[:-len(_WHITESPACE_REGEX)]
6060

61-
return regex_term
61+
return re.compile(regex_term)
6262

6363

64-
def generate_regex_term_dict(terms: List[str]) -> Dict[str, str]:
64+
def generate_regex_term_dict(terms: List[str]) -> Dict[Pattern[str], str]:
6565
r"""Convert the list of terms into a regex to term dictionary.
6666
6767
The regex matches the terms regardless of whitespace.

shoptimizer_api/util/regex_util_test.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515

1616
"""Tests for regex_util."""
1717

18+
import re
19+
1820
from absl.testing import parameterized
1921
from util import regex_util
2022

@@ -40,17 +42,17 @@ class RegexUtilTest(parameterized.TestCase):
4042
)
4143
def test_convert_to_regex_str_that_ignores_spaces(self, term, expected_regex):
4244
actual_regex = regex_util.convert_to_regex_str_that_ignores_spaces(term)
43-
self.assertEqual(expected_regex, actual_regex)
45+
self.assertEqual(expected_regex, actual_regex.pattern)
4446

4547
def test_generate_regex_term_dict(self):
4648
terms = ['E Term', '商品', '']
4749

4850
actual_regex_to_term = regex_util.generate_regex_term_dict(terms)
4951

5052
expected_regex_to_term = {
51-
'E(\\s| )*T(\\s| )*e(\\s| )*r(\\s| )*m': 'E Term',
52-
'商(\\s| )*品': '商品',
53-
'': ''
53+
re.compile('E(\\s| )*T(\\s| )*e(\\s| )*r(\\s| )*m'): 'E Term',
54+
re.compile('商(\\s| )*品'): '商品',
55+
re.compile(''): ''
5456
}
5557

5658
self.assertEqual(expected_regex_to_term, actual_regex_to_term)

0 commit comments

Comments
 (0)