From 2412f6652f264e9165ec6feef6d6287c0c76bd5b Mon Sep 17 00:00:00 2001 From: Gavish Date: Mon, 26 Jul 2021 20:26:23 +0000 Subject: [PATCH] Fixes #930 --- dateparser/languages/locale.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/dateparser/languages/locale.py b/dateparser/languages/locale.py index dba5528b0..cc726e5dc 100644 --- a/dateparser/languages/locale.py +++ b/dateparser/languages/locale.py @@ -176,6 +176,7 @@ def _generate_relative_translations(self, normalize=False): def translate_search(self, search_string, settings=None): dashes = ['-', '——', '—', '~'] + word_joint_unsupported_laguage = ["zh", "ja"] sentences = self._sentence_split(search_string, settings=settings) dictionary = self._get_dictionary(settings=settings) translated = [] @@ -184,10 +185,28 @@ def translate_search(self, search_string, settings=None): original_tokens, simplified_tokens = self._simplify_split_align(sentence, settings=settings) translated_chunk = [] original_chunk = [] + simplified_tokens_length = len(simplified_tokens) + skip_next_token = False for i, word in enumerate(simplified_tokens): + next_word = simplified_tokens[i + 1] if (simplified_tokens_length - 1) > i else "" + current_and_next_joined = self._join_chunk([word, next_word], settings=settings) + if skip_next_token: + skip_next_token = False + continue + if word == '' or word == ' ': translated_chunk.append(word) original_chunk.append(original_tokens[i]) + elif ( + current_and_next_joined in dictionary + and word not in dashes + and self.shortname not in word_joint_unsupported_laguage + ): + translated_chunk.append(dictionary[current_and_next_joined]) + original_chunk.append( + self._join_chunk([original_tokens[i], original_tokens[i + 1]], settings=settings) + ) + skip_next_token = True elif word in dictionary and word not in dashes: translated_chunk.append(dictionary[word]) original_chunk.append(original_tokens[i])