diff --git a/shoptimizer_api/optimizers_builtin/title_word_order_optimizer.py b/shoptimizer_api/optimizers_builtin/title_word_order_optimizer.py
index 13480dd..a382944 100644
--- a/shoptimizer_api/optimizers_builtin/title_word_order_optimizer.py
+++ b/shoptimizer_api/optimizers_builtin/title_word_order_optimizer.py
@@ -33,7 +33,7 @@
 import enum
 import logging
 import re
-from typing import Any, Callable, Dict, List, Optional, Tuple
+from typing import Any, Callable, Dict, List, Optional, Pattern, Tuple
 
 from util import promo_text_remover as promo_text_remover_lib
 from flask import current_app
@@ -62,7 +62,8 @@
 TITLE_WORD_ORDER_CONFIG = None
 BLOCKLIST_CONFIG = None
 TITLE_WORD_ORDER_OPTIONS_CONFIG = None
-CUSTOM_TEXT_TOKENIZER: Callable[[str, str, Dict[str, str]], List[str]] = None
+CUSTOM_TEXT_TOKENIZER: Callable[[str, str, Dict[Pattern[str], str]],
+                                List[str]] = None
 
 
 def _get_required_configs():
@@ -179,6 +180,12 @@ def _optimize(
 
     optimization_level = self._get_optimization_level()
 
+    title_word_order_dictionary = title_word_order_config.get(
+        _PHRASE_DICTIONARY_CONFIG_KEY, {})
+
+    regex_dictionary_terms = regex_util.generate_regex_term_dict(
+        title_word_order_dictionary)
+
     for entry in product_batch['entries']:
 
       if optimization_util.optimization_exclusion_specified(
@@ -212,8 +219,6 @@
       keyword_weights_mapping = title_word_order_config.get(
           _KEYWORD_WEIGHTS_MAPPING_CONFIG_KEY, {})
       keywords_for_gpc = keyword_weights_mapping.get(str(gpc_id), [])
-      title_word_order_dictionary = title_word_order_config.get(
-          _PHRASE_DICTIONARY_CONFIG_KEY, {})
 
       allowed_keywords_for_gpc = _remove_keywords_in_blocklist(
           keywords_for_gpc, keyword_blocklist)
@@ -223,8 +228,7 @@
           allowed_keywords_for_gpc)
 
       title_to_process = original_title
-      regex_dictionary_terms = regex_util.generate_regex_term_dict(
-          title_word_order_dictionary)
+
       title_words = _tokenize_text(title_to_process, language,
                                    regex_dictionary_terms)
       description_words = _tokenize_text(
@@ -383,8 +387,9 @@ def _remove_keywords_in_blocklist(
   return allowed_keywords
 
 
-def _tokenize_text(text: str, language: str,
-                   regex_dictionary_terms: Dict[str, str]) -> List[str]:
+def _tokenize_text(
+    text: str, language: str, regex_dictionary_terms: Dict[Pattern[str],
+                                                           str]) -> List[str]:
   """Splits text into individual words using the correct method for the given language.
 
   Args:
@@ -405,7 +410,7 @@
 
 
 def _split_words_in_japanese(
-    text: str, regex_dictionary_terms: Dict[str, str]) -> List[str]:
+    text: str, regex_dictionary_terms: Dict[Pattern[str], str]) -> List[str]:
   """Splits Japanese text into words by using MeCab.
 
   If a group of words in the text match a regex in the
@@ -437,7 +442,7 @@
 
 
 def _split_words_in_western_languages(
-    text: str, regex_dictionary_terms: Dict[str, str]) -> List[str]:
+    text: str, regex_dictionary_terms: Dict[Pattern[str], str]) -> List[str]:
   """Splits western text into words.
 
   If a group of words in the text match a regex in the
diff --git a/shoptimizer_api/util/regex_util.py b/shoptimizer_api/util/regex_util.py
index 229ccfa..e26ceac 100644
--- a/shoptimizer_api/util/regex_util.py
+++ b/shoptimizer_api/util/regex_util.py
@@ -24,8 +24,8 @@
 By using regex, we can check for matches without having to transform every
 title with strip().
 """
-
-from typing import Dict, List
+import re
+from typing import Dict, List, Pattern
 
 # Ignores 0+ whitespace or full-width space characters.
 _WHITESPACE_REGEX = '(\s|　)*'
@@ -58,10 +58,10 @@ def convert_to_regex_str_that_ignores_spaces(term: str) -> str:
 
   # Converts the list of chars back to a string and removes last regex.
   regex_term = ''.join(regex_term)[:-len(_WHITESPACE_REGEX)]
-  return regex_term
+  return re.compile(regex_term)
 
 
-def generate_regex_term_dict(terms: List[str]) -> Dict[str, str]:
+def generate_regex_term_dict(terms: List[str]) -> Dict[Pattern[str], str]:
   r"""Convert the list of terms into a regex to term dictionary.
 
   The regex matches the terms regardless of whitespace.
diff --git a/shoptimizer_api/util/regex_util_test.py b/shoptimizer_api/util/regex_util_test.py
index 09902a6..93f6a10 100644
--- a/shoptimizer_api/util/regex_util_test.py
+++ b/shoptimizer_api/util/regex_util_test.py
@@ -15,6 +15,8 @@
 
 """Tests for regex_util."""
 
+import re
+
 from absl.testing import parameterized
 
 from util import regex_util
@@ -40,7 +42,7 @@ class RegexUtilTest(parameterized.TestCase):
   )
   def test_convert_to_regex_str_that_ignores_spaces(self, term, expected_regex):
     actual_regex = regex_util.convert_to_regex_str_that_ignores_spaces(term)
-    self.assertEqual(expected_regex, actual_regex)
+    self.assertEqual(expected_regex, actual_regex.pattern)
 
   def test_generate_regex_term_dict(self):
     terms = ['E Term', '商品', '']
@@ -48,9 +50,9 @@
     actual_regex_to_term = regex_util.generate_regex_term_dict(terms)
 
     expected_regex_to_term = {
-        'E(\\s|　)*T(\\s|　)*e(\\s|　)*r(\\s|　)*m': 'E Term',
-        '商(\\s|　)*品': '商品',
-        '': ''
+        re.compile('E(\\s|　)*T(\\s|　)*e(\\s|　)*r(\\s|　)*m'): 'E Term',
+        re.compile('商(\\s|　)*品'): '商品',
+        re.compile(''): ''
     }
 
     self.assertEqual(expected_regex_to_term, actual_regex_to_term)
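
Usage note, a minimal sketch that is not part of the patch: generate_regex_term_dict() now maps compiled re.Pattern objects to their original terms, so callers such as _tokenize_text() can call .search() on each key directly instead of recompiling a pattern string for every title. The term list and title below are illustrative, and the import assumes shoptimizer_api is on the path, as for the modules above:

  from util import regex_util

  # Keys are compiled patterns; values are the original dictionary terms.
  regex_to_term = regex_util.generate_regex_term_dict(['E Term', '商品'])

  title = 'Brand E  Term Shirt 商 品'
  for pattern, term in regex_to_term.items():
    # _WHITESPACE_REGEX lets each pattern ignore whitespace inside a term,
    # so both 'E  Term' and '商 品' match their dictionary entries.
    if pattern.search(title):
      print(term)  # prints: E Term, then 商品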