Skip to content

Commit

Permalink
Internal change
Browse files: browse the repository at this point in the history
PiperOrigin-RevId: 435239325
  • Loading branch information
Shoptimizer Team authored and starmandeluxe committed Apr 7, 2022
1 parent 83c881b commit 0e514a2
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 18 deletions.
25 changes: 15 additions & 10 deletions shoptimizer_api/optimizers_builtin/title_word_order_optimizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
import enum
import logging
import re
from typing import Any, Callable, Dict, List, Optional, Tuple
from typing import Any, Callable, Dict, List, Optional, Pattern, Tuple
from util import promo_text_remover as promo_text_remover_lib
from flask import current_app

Expand Down Expand Up @@ -62,7 +62,8 @@
TITLE_WORD_ORDER_CONFIG = None
BLOCKLIST_CONFIG = None
TITLE_WORD_ORDER_OPTIONS_CONFIG = None
CUSTOM_TEXT_TOKENIZER: Callable[[str, str, Dict[str, str]], List[str]] = None
CUSTOM_TEXT_TOKENIZER: Callable[[str, str, Dict[Pattern[str], str]],
List[str]] = None


def _get_required_configs():
Expand Down Expand Up @@ -179,6 +180,12 @@ def _optimize(

optimization_level = self._get_optimization_level()

title_word_order_dictionary = title_word_order_config.get(
_PHRASE_DICTIONARY_CONFIG_KEY, {})

regex_dictionary_terms = regex_util.generate_regex_term_dict(
title_word_order_dictionary)

for entry in product_batch['entries']:

if optimization_util.optimization_exclusion_specified(
Expand Down Expand Up @@ -212,8 +219,6 @@ def _optimize(
keyword_weights_mapping = title_word_order_config.get(
_KEYWORD_WEIGHTS_MAPPING_CONFIG_KEY, {})
keywords_for_gpc = keyword_weights_mapping.get(str(gpc_id), [])
title_word_order_dictionary = title_word_order_config.get(
_PHRASE_DICTIONARY_CONFIG_KEY, {})

allowed_keywords_for_gpc = _remove_keywords_in_blocklist(
keywords_for_gpc, keyword_blocklist)
Expand All @@ -223,8 +228,7 @@ def _optimize(
allowed_keywords_for_gpc)

title_to_process = original_title
regex_dictionary_terms = regex_util.generate_regex_term_dict(
title_word_order_dictionary)

title_words = _tokenize_text(title_to_process, language,
regex_dictionary_terms)
description_words = _tokenize_text(
Expand Down Expand Up @@ -383,8 +387,9 @@ def _remove_keywords_in_blocklist(
return allowed_keywords


def _tokenize_text(text: str, language: str,
regex_dictionary_terms: Dict[str, str]) -> List[str]:
def _tokenize_text(
text: str, language: str, regex_dictionary_terms: Dict[Pattern[str],
str]) -> List[str]:
"""Splits text into individual words using the correct method for the given language.
Args:
Expand All @@ -405,7 +410,7 @@ def _tokenize_text(text: str, language: str,


def _split_words_in_japanese(
text: str, regex_dictionary_terms: Dict[str, str]) -> List[str]:
text: str, regex_dictionary_terms: Dict[Pattern[str], str]) -> List[str]:
"""Splits Japanese text into words by using MeCab.
If a group of words in the text match a regex in the
Expand Down Expand Up @@ -437,7 +442,7 @@ def _split_words_in_japanese(


def _split_words_in_western_languages(
text: str, regex_dictionary_terms: Dict[str, str]) -> List[str]:
text: str, regex_dictionary_terms: Dict[Pattern[str], str]) -> List[str]:
"""Splits western text into words.
If a group of words in the text match a regex in the
Expand Down
8 changes: 4 additions & 4 deletions shoptimizer_api/util/regex_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@
By using regex, we can check for matches without having to transform every title
with strip().
"""

from typing import Dict, List
import re
from typing import Dict, List, Pattern

# Matches zero or more whitespace or full-width space characters.
_WHITESPACE_REGEX = '(\s| )*'
Expand Down Expand Up @@ -58,10 +58,10 @@ def convert_to_regex_str_that_ignores_spaces(term: str) -> str:
# Converts the list of chars back to a string and removes last regex.
regex_term = ''.join(regex_term)[:-len(_WHITESPACE_REGEX)]

return regex_term
return re.compile(regex_term)


def generate_regex_term_dict(terms: List[str]) -> Dict[str, str]:
def generate_regex_term_dict(terms: List[str]) -> Dict[Pattern[str], str]:
r"""Convert the list of terms into a regex to term dictionary.
The regex matches the terms regardless of whitespace.
Expand Down
10 changes: 6 additions & 4 deletions shoptimizer_api/util/regex_util_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@

"""Tests for regex_util."""

import re

from absl.testing import parameterized
from util import regex_util

Expand All @@ -40,17 +42,17 @@ class RegexUtilTest(parameterized.TestCase):
)
def test_convert_to_regex_str_that_ignores_spaces(self, term, expected_regex):
actual_regex = regex_util.convert_to_regex_str_that_ignores_spaces(term)
self.assertEqual(expected_regex, actual_regex)
self.assertEqual(expected_regex, actual_regex.pattern)

def test_generate_regex_term_dict(self):
terms = ['E Term', '商品', '']

actual_regex_to_term = regex_util.generate_regex_term_dict(terms)

expected_regex_to_term = {
'E(\\s| )*T(\\s| )*e(\\s| )*r(\\s| )*m': 'E Term',
'商(\\s| )*品': '商品',
'': ''
re.compile('E(\\s| )*T(\\s| )*e(\\s| )*r(\\s| )*m'): 'E Term',
re.compile('商(\\s| )*品'): '商品',
re.compile(''): ''
}

self.assertEqual(expected_regex_to_term, actual_regex_to_term)

0 comments on commit 0e514a2

Please sign in to comment.