Skip to content

Commit 0e514a2

Browse files
Shoptimizer Team and starmandeluxe
authored and committed
Internal change
PiperOrigin-RevId: 435239325
1 parent 83c881b commit 0e514a2

File tree

3 files changed

+25
-18
lines changed

3 files changed

+25
-18
lines changed

shoptimizer_api/optimizers_builtin/title_word_order_optimizer.py

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@
3333
import enum
3434
import logging
3535
import re
36-
from typing import Any, Callable, Dict, List, Optional, Tuple
36+
from typing import Any, Callable, Dict, List, Optional, Pattern, Tuple
3737
from util import promo_text_remover as promo_text_remover_lib
3838
from flask import current_app
3939

@@ -62,7 +62,8 @@
6262
TITLE_WORD_ORDER_CONFIG = None
6363
BLOCKLIST_CONFIG = None
6464
TITLE_WORD_ORDER_OPTIONS_CONFIG = None
65-
CUSTOM_TEXT_TOKENIZER: Callable[[str, str, Dict[str, str]], List[str]] = None
65+
CUSTOM_TEXT_TOKENIZER: Callable[[str, str, Dict[Pattern[str], str]],
66+
List[str]] = None
6667

6768

6869
def _get_required_configs():
@@ -179,6 +180,12 @@ def _optimize(
179180

180181
optimization_level = self._get_optimization_level()
181182

183+
title_word_order_dictionary = title_word_order_config.get(
184+
_PHRASE_DICTIONARY_CONFIG_KEY, {})
185+
186+
regex_dictionary_terms = regex_util.generate_regex_term_dict(
187+
title_word_order_dictionary)
188+
182189
for entry in product_batch['entries']:
183190

184191
if optimization_util.optimization_exclusion_specified(
@@ -212,8 +219,6 @@ def _optimize(
212219
keyword_weights_mapping = title_word_order_config.get(
213220
_KEYWORD_WEIGHTS_MAPPING_CONFIG_KEY, {})
214221
keywords_for_gpc = keyword_weights_mapping.get(str(gpc_id), [])
215-
title_word_order_dictionary = title_word_order_config.get(
216-
_PHRASE_DICTIONARY_CONFIG_KEY, {})
217222

218223
allowed_keywords_for_gpc = _remove_keywords_in_blocklist(
219224
keywords_for_gpc, keyword_blocklist)
@@ -223,8 +228,7 @@ def _optimize(
223228
allowed_keywords_for_gpc)
224229

225230
title_to_process = original_title
226-
regex_dictionary_terms = regex_util.generate_regex_term_dict(
227-
title_word_order_dictionary)
231+
228232
title_words = _tokenize_text(title_to_process, language,
229233
regex_dictionary_terms)
230234
description_words = _tokenize_text(
@@ -383,8 +387,9 @@ def _remove_keywords_in_blocklist(
383387
return allowed_keywords
384388

385389

386-
def _tokenize_text(text: str, language: str,
387-
regex_dictionary_terms: Dict[str, str]) -> List[str]:
390+
def _tokenize_text(
391+
text: str, language: str, regex_dictionary_terms: Dict[Pattern[str],
392+
str]) -> List[str]:
388393
"""Splits text into individual words using the correct method for the given language.
389394
390395
Args:
@@ -405,7 +410,7 @@ def _tokenize_text(text: str, language: str,
405410

406411

407412
def _split_words_in_japanese(
408-
text: str, regex_dictionary_terms: Dict[str, str]) -> List[str]:
413+
text: str, regex_dictionary_terms: Dict[Pattern[str], str]) -> List[str]:
409414
"""Splits Japanese text into words by using MeCab.
410415
411416
If a group of words in the text match a regex in the
@@ -437,7 +442,7 @@ def _split_words_in_japanese(
437442

438443

439444
def _split_words_in_western_languages(
440-
text: str, regex_dictionary_terms: Dict[str, str]) -> List[str]:
445+
text: str, regex_dictionary_terms: Dict[Pattern[str], str]) -> List[str]:
441446
"""Splits western text into words.
442447
443448
If a group of words in the text match a regex in the

shoptimizer_api/util/regex_util.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,8 @@
2424
By using regex, we can check for matches without having to transform every title
2525
with strip().
2626
"""
27-
28-
from typing import Dict, List
27+
import re
28+
from typing import Dict, List, Pattern
2929

3030
# Ignores 0+ whitespace or full-width space characters.
3131
_WHITESPACE_REGEX = '(\s| )*'
@@ -58,10 +58,10 @@ def convert_to_regex_str_that_ignores_spaces(term: str) -> str:
5858
# Converts the list of chars back to a string and removes last regex.
5959
regex_term = ''.join(regex_term)[:-len(_WHITESPACE_REGEX)]
6060

61-
return regex_term
61+
return re.compile(regex_term)
6262

6363

64-
def generate_regex_term_dict(terms: List[str]) -> Dict[str, str]:
64+
def generate_regex_term_dict(terms: List[str]) -> Dict[Pattern[str], str]:
6565
r"""Convert the list of terms into a regex to term dictionary.
6666
6767
The regex matches the terms regardless of whitespace.

shoptimizer_api/util/regex_util_test.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515

1616
"""Tests for regex_util."""
1717

18+
import re
19+
1820
from absl.testing import parameterized
1921
from util import regex_util
2022

@@ -40,17 +42,17 @@ class RegexUtilTest(parameterized.TestCase):
4042
)
4143
def test_convert_to_regex_str_that_ignores_spaces(self, term, expected_regex):
4244
actual_regex = regex_util.convert_to_regex_str_that_ignores_spaces(term)
43-
self.assertEqual(expected_regex, actual_regex)
45+
self.assertEqual(expected_regex, actual_regex.pattern)
4446

4547
def test_generate_regex_term_dict(self):
4648
terms = ['E Term', '商品', '']
4749

4850
actual_regex_to_term = regex_util.generate_regex_term_dict(terms)
4951

5052
expected_regex_to_term = {
51-
'E(\\s| )*T(\\s| )*e(\\s| )*r(\\s| )*m': 'E Term',
52-
'商(\\s| )*品': '商品',
53-
'': ''
53+
re.compile('E(\\s| )*T(\\s| )*e(\\s| )*r(\\s| )*m'): 'E Term',
54+
re.compile('商(\\s| )*品'): '商品',
55+
re.compile(''): ''
5456
}
5557

5658
self.assertEqual(expected_regex_to_term, actual_regex_to_term)

0 commit comments

Comments
 (0)