From 988392a0c821494fee2d90090cdca4c3c98bcf83 Mon Sep 17 00:00:00 2001 From: Toshiyuki Hanaoka Date: Sat, 2 May 2015 15:04:10 -0700 Subject: [PATCH] Implement rule based zero query suggestion. BUG=none TEST=unittest --- src/data/zero_query/zero_query.def | 8 + src/mozc_version_template.txt | 2 +- src/prediction/dictionary_predictor.cc | 130 ++++++++++---- src/prediction/dictionary_predictor.h | 7 + src/prediction/dictionary_predictor_test.cc | 49 +++++- ...en_embedded_string_array_for_zero_query.py | 164 ++++++++++++++++++ src/prediction/gen_zero_query_number_data.py | 72 -------- src/prediction/prediction.gyp | 40 ++++- 8 files changed, 362 insertions(+), 110 deletions(-) create mode 100644 src/data/zero_query/zero_query.def create mode 100644 src/prediction/gen_embedded_string_array_for_zero_query.py delete mode 100644 src/prediction/gen_zero_query_number_data.py diff --git a/src/data/zero_query/zero_query.def b/src/data/zero_query/zero_query.def new file mode 100644 index 000000000..af4b034c6 --- /dev/null +++ b/src/data/zero_query/zero_query.def @@ -0,0 +1,8 @@ +# Rules for triggering zero query suggestion/prediction. +# File format: +# trigger[TAB]candidate_1,candidate_2,...,candidate_n +# ... +# Note that '#' is the special character for comment lines, so it cannot be placed +# at the beginning of a line. + +@ gmail.com diff --git a/src/mozc_version_template.txt b/src/mozc_version_template.txt index c2a53392d..7556748b3 100644 --- a/src/mozc_version_template.txt +++ b/src/mozc_version_template.txt @@ -1,6 +1,6 @@ MAJOR=2 MINOR=17 -BUILD=2079 +BUILD=2080 REVISION=102 # NACL_DICTIONARY_VERSION is the target version of the system dictionary to be # downloaded by NaCl Mozc. 
diff --git a/src/prediction/dictionary_predictor.cc b/src/prediction/dictionary_predictor.cc index fd01cde0b..75dffe54a 100644 --- a/src/prediction/dictionary_predictor.cc +++ b/src/prediction/dictionary_predictor.cc @@ -58,6 +58,7 @@ #include "dictionary/pos_matcher.h" #include "prediction/predictor_interface.h" #include "prediction/suggestion_filter.h" +#include "prediction/zero_query_data.h" #include "prediction/zero_query_number_data.h" #include "session/commands.pb.h" @@ -98,22 +99,22 @@ void GetNumberSuffixArray(const string &history_input, int default_num = -1; int suffix_num = -1; - for (int i = 0; ZeroQueryNum[i]; ++i) { - if (default_str == ZeroQueryNum[i][0]) { + for (int i = 0; i < kZeroQueryNum_size; ++i) { + if (default_str == kZeroQueryNum_data[i][0]) { default_num = i; - } else if (history_input == ZeroQueryNum[i][0]) { + } else if (history_input == kZeroQueryNum_data[i][0]) { suffix_num = i; } } DCHECK_GE(default_num, 0); if (suffix_num != -1) { - for (int j = 1; ZeroQueryNum[suffix_num][j]; ++j) { - suffixes->push_back(ZeroQueryNum[suffix_num][j]); + for (int j = 1; kZeroQueryNum_data[suffix_num][j]; ++j) { + suffixes->push_back(kZeroQueryNum_data[suffix_num][j]); } } - for (int j = 1; ZeroQueryNum[default_num][j]; ++j) { - suffixes->push_back(ZeroQueryNum[default_num][j]); + for (int j = 1; kZeroQueryNum_data[default_num][j]; ++j) { + suffixes->push_back(kZeroQueryNum_data[default_num][j]); } } @@ -163,6 +164,11 @@ bool IsTypingCorrectionEnabled() { FLAGS_enable_typing_correction; } +struct ZeroQueryRuleCompare { + bool operator()(const char **lhs, const char **rhs) const { + return (strcmp(lhs[0], rhs[0]) < 0); + } +}; } // namespace class DictionaryPredictor::PredictiveLookupCallback : @@ -1617,6 +1623,88 @@ void DictionaryPredictor::GetPredictiveResultsUsingTypingCorrection( } } +// Returns true if we add zero query result. 
+bool DictionaryPredictor::AggregateNumberZeroQueryPrediction( + const Segments &segments, vector *results) const { + string number_key; + if (!GetNumberHistory(segments, &number_key)) { + return false; + } + + // Use number suffixes and do not add normal zero query. + vector suffixes; + GetNumberSuffixArray(number_key, &suffixes); + DCHECK_GT(suffixes.size(), 0); + int cost = 0; + + for (size_t i = 0; i < suffixes.size(); ++i) { + const auto &suffix = suffixes[i]; + // Increment cost to show the candidates in order. + const int kSuffixPenalty = 10; + + results->push_back(Result()); + Result *result = &results->back(); + result->SetTypesAndTokenAttributes(SUFFIX, Token::NONE); + result->key = suffix; + result->value = suffix; + result->wcost = cost; + result->lid = counter_suffix_word_id_; + result->rid = counter_suffix_word_id_; + + cost += kSuffixPenalty; + } + return true; +} + +// Returns true if we add zero query result. +bool DictionaryPredictor::AggregateZeroQueryPrediction( + const Segments &segments, vector *results) const { + const size_t history_size = segments.history_segments_size(); + if (history_size <= 0) { + return false; + } + + const Segment &last_segment = segments.history_segment(history_size - 1); + DCHECK_GT(last_segment.candidates_size(), 0); + const string &history_value = last_segment.candidate(0).value; + + const char *key_item[] = {history_value.c_str(), 0}; + const char **key = key_item; + // kZeroQueryData_data is a 2-dimensional string array and + // sorted by the first string. + // For each string array, the first item is a key for zero query prediction, + // the rest items are candidates, and the last item is 0. 
+ const char ***result_rule = + lower_bound( + kZeroQueryData_data, kZeroQueryData_data + kZeroQueryData_size, + key, ZeroQueryRuleCompare()); + if (result_rule == (kZeroQueryData_data + kZeroQueryData_size) || + history_value != (*result_rule)[0]) { + return false; + } + + int cost = 0; + for (int i = 1; (*result_rule)[i]; ++i) { + string candidate = (*result_rule)[i]; + + // Increment cost to show the candidates in order. + const int kPenalty = 10; + + results->push_back(Result()); + Result *result = &results->back(); + + result->SetTypesAndTokenAttributes(SUFFIX, Token::NONE); + result->key = candidate; + result->value = candidate; + result->wcost = cost; + result->lid = 0; // EOS + result->rid = 0; // EOS + + cost += kPenalty; + } + return true; +} + void DictionaryPredictor::AggregateSuffixPrediction( PredictionTypes types, const ConversionRequest &request, @@ -1630,30 +1718,10 @@ void DictionaryPredictor::AggregateSuffixPrediction( const bool is_zero_query = segments.conversion_segment(0).key().empty(); if (is_zero_query) { - string number_key; - if (GetNumberHistory(segments, &number_key)) { - // Use number suffixes and do not add normal zero query. - vector suffixes; - GetNumberSuffixArray(number_key, &suffixes); - DCHECK_GT(suffixes.size(), 0); - int cost = 0; - - for (vector::const_iterator it = suffixes.begin(); - it != suffixes.end(); ++it) { - // Increment cost to show the candidates in order. 
- const int kSuffixPenalty = 10; - - results->push_back(Result()); - Result *result = &results->back(); - result->SetTypesAndTokenAttributes(SUFFIX, Token::NONE); - result->key = *it; - result->value = *it; - result->wcost = cost; - result->lid = counter_suffix_word_id_; - result->rid = counter_suffix_word_id_; - - cost += kSuffixPenalty; - } + if (AggregateNumberZeroQueryPrediction(segments, results)) { + return; + } + if (AggregateZeroQueryPrediction(segments, results)) { return; } // Fall through diff --git a/src/prediction/dictionary_predictor.h b/src/prediction/dictionary_predictor.h index 9c2a0e274..8c7ae694b 100644 --- a/src/prediction/dictionary_predictor.h +++ b/src/prediction/dictionary_predictor.h @@ -176,6 +176,12 @@ class DictionaryPredictor : public PredictorInterface { const Segments &segments, vector *results) const; + bool AggregateNumberZeroQueryPrediction(const Segments &segments, + vector *results) const; + + bool AggregateZeroQueryPrediction(const Segments &segments, + vector *result) const; + void ApplyPenaltyForKeyExpansion(const Segments &segments, vector *results) const; @@ -200,6 +206,7 @@ class DictionaryPredictor : public PredictorInterface { FRIEND_TEST(DictionaryPredictorTest, AggregateSuffixPrediction); FRIEND_TEST(DictionaryPredictorTest, ZeroQuerySuggestionAfterNumbers); FRIEND_TEST(DictionaryPredictorTest, TriggerNumberZeroQuerySuggestion); + FRIEND_TEST(DictionaryPredictorTest, TriggerZeroQuerySuggestion); FRIEND_TEST(DictionaryPredictorTest, GetHistoryKeyAndValue); FRIEND_TEST(DictionaryPredictorTest, RealtimeConversionStartingWithAlphabets); FRIEND_TEST(DictionaryPredictorTest, IsAggressiveSuggestion); diff --git a/src/prediction/dictionary_predictor_test.cc b/src/prediction/dictionary_predictor_test.cc index 9eed0c47d..995a8e185 100644 --- a/src/prediction/dictionary_predictor_test.cc +++ b/src/prediction/dictionary_predictor_test.cc @@ -2097,7 +2097,54 @@ TEST_F(DictionaryPredictorTest, TriggerNumberZeroQuerySuggestion) { 
break; } } - EXPECT_EQ(test_case.expected_result, found); + EXPECT_EQ(test_case.expected_result, found) << test_case.history_value; + } +} + +TEST_F(DictionaryPredictorTest, TriggerZeroQuerySuggestion) { + scoped_ptr data_and_predictor( + CreateDictionaryPredictorWithMockData()); + const DictionaryPredictor *predictor = + data_and_predictor->dictionary_predictor(); + const ConversionRequest conversion_request; + + const struct TestCase { + const char *history_key; + const char *history_value; + const char *find_value; + bool expected_result; + } kTestCases[] = { + { "@", "@", + "gmail.com", true }, + { "!", "!", + "?", false }, + }; + + for (size_t i = 0; i < arraysize(kTestCases); ++i) { + Segments segments; + MakeSegmentsForSuggestion("", &segments); + + const TestCase &test_case = kTestCases[i]; + PrependHistorySegments( + test_case.history_key, test_case.history_value, &segments); + vector results; + predictor->AggregateSuffixPrediction( + DictionaryPredictor::SUFFIX, + conversion_request, segments, &results); + EXPECT_FALSE(results.empty()); + + bool found = false; + for (vector::const_iterator it = + results.begin(); + it != results.end(); ++it) { + EXPECT_EQ(it->types, DictionaryPredictor::SUFFIX); + if (it->value == test_case.find_value && + it->lid == 0 /* EOS */) { + found = true; + break; + } + } + EXPECT_EQ(test_case.expected_result, found) << test_case.history_value; } } diff --git a/src/prediction/gen_embedded_string_array_for_zero_query.py b/src/prediction/gen_embedded_string_array_for_zero_query.py new file mode 100644 index 000000000..f149f3701 --- /dev/null +++ b/src/prediction/gen_embedded_string_array_for_zero_query.py @@ -0,0 +1,164 @@ +# -*- coding: utf-8 -*- +# Copyright 2010-2015, Google Inc. +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following disclaimer +# in the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Google Inc. nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +"""Generate header file of a string array for zero query suggestion. + +Usage: +gen_embedded_string_array_for_zero_query.py --input input.def \ + --output /path/to/output/zero_query_hoge.h --var_name ZeroQueryHoge + +Input format: + key[TAB]candidate_1,candidate_2,..,candidate_n +... 
+For more details, please refer to definition files under mozc/data/zero_query/ + +Output format: +const char *Var0[] = {"Key0", "Cand00", "Cand01", .., 0}; +const char *Var1[] = {"Key1", "Cand10", "Cand11", .., 0}; + +const char **Var_data[] = {Var0, Var1, .., VarN}; + +Here, (Key0, Key1, ...) is sorted so that we can use binary search. +""" + +__author__ = "toshiyuki" + +import optparse +import os + + +_MOZC_DIR_FOR_DEFINE_GUARD = 'MOZC' + + +def EscapeString(string): + """Escapes string.""" + return '"' + string.encode('string_escape') + '"' + + +def GetDefineGuardSymbol(file_name): + """Returns define guard symbol for .h file. + + For example, returns 'SOME_EXAMPLE_H' for '/path/to/some_example.h' + + Args: + file_name: a string indicating output file path. + Returns: + A string for define guard. + """ + return os.path.basename(file_name).upper().replace('.', '_') + + +def GetDefineGuardHeaderLines(output_file_name): + """Returns define guard header for .h file.""" + result = [] + result.append( + '#ifndef %s_PREDICTION_%s_' %(_MOZC_DIR_FOR_DEFINE_GUARD, + GetDefineGuardSymbol(output_file_name))) + result.append( + '#define %s_PREDICTION_%s_' %(_MOZC_DIR_FOR_DEFINE_GUARD, + GetDefineGuardSymbol(output_file_name))) + return result + + +def GetDefineGuardFooterLines(output_file_name): + """Returns define guard footer for .h file.""" + return [ + '#endif // %s_PREDICTION_%s_' %(_MOZC_DIR_FOR_DEFINE_GUARD, + GetDefineGuardSymbol(output_file_name))] + + +def GetZeroQueryRules(input_file_name): + """Returns zero query triggering rules. 
The list is sorted by key.""" + rules = [] + with open(input_file_name, 'r') as input_file: + for line in input_file: + if line.startswith('#'): + continue + line = line.rstrip('\r\n') + if not line: + continue + + tokens = line.split('\t') + key = tokens[0] + values = tokens[1].split(',') + + rules.append((key, values)) + rules.sort(lambda x, y: cmp(x[0], y[0])) # For binary search + return rules + + +def GetHeaderContents(input_file_name, var_name, output_file_name): + """Returns contents for header file that contains a string array.""" + zero_query_rules = GetZeroQueryRules(input_file_name) + + result = [] + result.extend(GetDefineGuardHeaderLines(output_file_name)) + result.append('namespace mozc {') + result.append('namespace {') + + for i, rule in enumerate(zero_query_rules): + result.append('const char *%s%d[] = {' % (var_name, i)) + result.append(' ' + ', '.join( + [EscapeString(s) for s in [rule[0]] + rule[1]] + ['0'])) + result.append('};') + + result.append('} // namespace') + + result.append('const char **%s_data[] = {' % var_name) + result.append(' ' + ', '.join( + ['%s%d' % (var_name, c) for c in range(len(zero_query_rules))])) + result.append('};') + result.append( + 'const size_t %s_size = %d;' % (var_name, len(zero_query_rules))) + + result.append('} // namespace mozc') + result.extend(GetDefineGuardFooterLines(output_file_name)) + return result + + +def ParseOption(): + """Parses command line options.""" + parser = optparse.OptionParser() + parser.add_option('--input', dest='input', help='Input file path') + parser.add_option('--output', dest='output', help='Output file path') + parser.add_option( + '--var_name', dest='var_name', help='Var name for the array') + return parser.parse_args()[0] + + +def main(): + options = ParseOption() + lines = GetHeaderContents(options.input, options.var_name, options.output) + with open(options.output, 'w') as out_file: + out_file.write('\n'.join(lines)) + + +if __name__ == '__main__': + main() diff --git 
a/src/prediction/gen_zero_query_number_data.py b/src/prediction/gen_zero_query_number_data.py deleted file mode 100644 index 724b177cf..000000000 --- a/src/prediction/gen_zero_query_number_data.py +++ /dev/null @@ -1,72 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright 2010-2015, Google Inc. -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: -# -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following disclaimer -# in the documentation and/or other materials provided with the -# distribution. -# * Neither the name of Google Inc. nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -__author__ = "manabe" - -import sys - - -def EscapeString(result): - return '"' + result.encode('string_escape') + '"' - - -def main(): - print "#ifndef MOZC_PREDICTION_ZERO_QUERY_NUM_DATA_H_" - print "#define MOZC_PREDICTION_ZERO_QUERY_NUM_DATA_H_" - print "namespace mozc {" - print "namespace {" - - count = 0 - for line in open(sys.argv[1], "r"): - if line.startswith("#"): - continue - - line = line.rstrip("\r\n") - if line == "": - continue - - fields = line.split("\t") - key = fields[0] - values = [key] + fields[1].split(",") - print "const char *ZeroQueryNum%d[] = {" % count - print " " + ", ".join([EscapeString(s) for s in values] + ["0"]) - print "};" - count += 1 - - print "} // namespace" - print "const char **ZeroQueryNum[] = {" - print " " + ", ".join(["ZeroQueryNum%d" % c for c in range(count)] + ["0"]) - print "};" - print "} // namespace mozc" - print "#endif // MOZC_PREDICTION_ZERO_QUERY_NUM_DATA_H_" - - -if __name__ == "__main__": - main() diff --git a/src/prediction/prediction.gyp b/src/prediction/prediction.gyp index a9761d1f8..21badd84c 100644 --- a/src/prediction/prediction.gyp +++ b/src/prediction/prediction.gyp @@ -58,6 +58,7 @@ '../session/session_base.gyp:session_protocol', '../storage/storage.gyp:storage', '../usage_stats/usage_stats_base.gyp:usage_stats', + 'gen_zero_query_data#host', 'gen_zero_query_number_data#host', 'prediction_base.gyp:suggestion_filter', 'prediction_protocol', @@ -76,22 +77,51 @@ ], }, 'inputs': [ - 'gen_zero_query_number_data.py', + 'gen_embedded_string_array_for_zero_query.py', '<@(input_files)', ], 'outputs': [ '<(gen_out_dir)/zero_query_number_data.h', ], 'action': [ - 'python', '../build_tools/redirect.py', - '<(gen_out_dir)/zero_query_number_data.h', - 'gen_zero_query_number_data.py', - '<@(input_files)', + 'python', 'gen_embedded_string_array_for_zero_query.py', + '--input=<@(input_files)', + '--var_name=kZeroQueryNum', + '--output=<(gen_out_dir)/zero_query_number_data.h', ], 'message': 'Generating 
<(gen_out_dir)/zero_query_number_data.h', }, ], }, + { + 'target_name': 'gen_zero_query_data', + 'type': 'none', + 'toolsets': ['host'], + 'actions': [ + { + 'action_name': 'gen_zero_query_data', + 'variables': { + 'input_files': [ + '../data/zero_query/zero_query.def', + ], + }, + 'inputs': [ + 'gen_embedded_string_array_for_zero_query.py', + '<@(input_files)', + ], + 'outputs': [ + '<(gen_out_dir)/zero_query_data.h', + ], + 'action': [ + 'python', 'gen_embedded_string_array_for_zero_query.py', + '--input=<@(input_files)', + '--var_name=kZeroQueryData', + '--output=<(gen_out_dir)/zero_query_data.h', + ], + 'message': 'Generating <(gen_out_dir)/zero_query_data.h', + }, + ], + }, { 'target_name': 'genproto_prediction', 'type': 'none',