Skip to content

Commit

Permalink
refactor(algo, dict, translators): use logarithmic weights internally
Browse files (browse the repository at this point in the history)
  • Loading branch information
lotem committed Feb 13, 2019
1 parent d3b8ca1 commit 667da57
Show file tree
Hide file tree
Showing 20 changed files with 69 additions and 64 deletions.
2 changes: 1 addition & 1 deletion src/rime/algo/algebra.cc
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ void Script::Merge(const string& s,
{
if (sp.type > yy.type)
yy.type = sp.type;
yy.credibility *= sp.credibility;
yy.credibility += sp.credibility;
if (!sp.tips.empty())
yy.tips = sp.tips;
}
Expand Down
8 changes: 4 additions & 4 deletions src/rime/algo/calculus.cc
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@

namespace rime {

const double kAbbreviationPenalty = 0.5;
const double kFuzzySpellingPenalty = 0.5;
const double kAbbreviationPenalty = -0.6931471805599453; // log(0.5)
const double kFuzzySpellingPenalty = -0.6931471805599453; // log(0.5)

Calculus::Calculus() {
Register("xlit", &Transliteration::Parse);
Expand Down Expand Up @@ -177,7 +177,7 @@ bool Fuzzing::Apply(Spelling* spelling) {
bool result = Transformation::Apply(spelling);
if (result) {
spelling->properties.type = kFuzzySpelling;
spelling->properties.credibility *= kFuzzySpellingPenalty;
spelling->properties.credibility += kFuzzySpellingPenalty;
}
return result;
}
Expand All @@ -201,7 +201,7 @@ bool Abbreviation::Apply(Spelling* spelling) {
bool result = Transformation::Apply(spelling);
if (result) {
spelling->properties.type = kAbbreviation;
spelling->properties.credibility *= kAbbreviationPenalty;
spelling->properties.credibility += kAbbreviationPenalty;
}
return result;
}
Expand Down
2 changes: 1 addition & 1 deletion src/rime/algo/spelling.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ enum SpellingType { kNormalSpelling, kFuzzySpelling,
struct SpellingProperties {
SpellingType type = kNormalSpelling;
size_t end_pos = 0;
double credibility = 1.0;
double credibility = 0.0;
string tips;
};

Expand Down
11 changes: 7 additions & 4 deletions src/rime/algo/syllabifier.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ using VertexQueue = std::priority_queue<Vertex,
vector<Vertex>,
std::greater<Vertex>>;

const double kCompletionPenalty = -0.6931471805599453; // log(0.5)
const double kCorrectionCredibility = -4.605170185988091; // log(0.01)

int Syllabifier::BuildSyllableGraph(const string &input,
Prism &prism,
SyllableGraph *graph) {
Expand Down Expand Up @@ -99,7 +102,7 @@ int Syllabifier::BuildSyllableGraph(const string &input,
// spelling-to-syllable map
if (match_set.find(m.value) == match_set.end()) {
props.is_correction = true;
props.credibility = 0.01;
props.credibility = kCorrectionCredibility;
}
auto it = spellings.find(syllable_id);
if (it == spellings.end()) {
Expand Down Expand Up @@ -207,7 +210,7 @@ int Syllabifier::BuildSyllableGraph(const string &input,
SpellingProperties props = accessor.properties();
if (props.type < kAbbreviation) {
props.type = kCompletion;
props.credibility *= 0.5;
props.credibility += kCompletionPenalty;
props.end_pos = end_pos;
// add a syllable with properties to the edge's
// spelling-to-syllable map
Expand Down Expand Up @@ -240,7 +243,7 @@ int Syllabifier::BuildSyllableGraph(const string &input,

void Syllabifier::CheckOverlappedSpellings(SyllableGraph *graph,
size_t start, size_t end) {
const double kPenaltyForAmbiguousSyllable = 1e-10;
const double kPenaltyForAmbiguousSyllable = -23.025850929940457; // log(1e-10)
if (!graph || graph->edges.find(start) == graph->edges.end())
return;
// if "Z" = "YX", mark the vertex between Y and X an ambiguous syllable joint
Expand All @@ -259,7 +262,7 @@ void Syllabifier::CheckOverlappedSpellings(SyllableGraph *graph,
// discourage syllables at an ambiguous joint
// bad cases include pinyin syllabification "niju'ede"
for (auto& spelling : x.second) {
spelling.second.credibility *= kPenaltyForAmbiguousSyllable;
spelling.second.credibility += kPenaltyForAmbiguousSyllable;
}
graph->vertices[joint] = kAmbiguousSpelling;
DLOG(INFO) << "ambiguous syllable joint at position " << joint << ".";
Expand Down
3 changes: 2 additions & 1 deletion src/rime/dict/dict_compiler.cc
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
// 2011-11-27 GONG Chen <[email protected]>
//
#include <boost/filesystem.hpp>
#include <cfloat>
#include <fstream>
#include <rime/algo/algebra.h>
#include <rime/algo/utilities.h>
Expand Down Expand Up @@ -176,7 +177,7 @@ bool DictCompiler::BuildTable(DictSettings* settings,
auto e = New<DictEntry>();
e->code.swap(code);
e->text.swap(r.text);
e->weight = r.weight;
e->weight = log(r.weight > 0 ? r.weight : DBL_EPSILON);
ls->push_back(e);
}
if (settings->sort_order() != "original") {
Expand Down
10 changes: 5 additions & 5 deletions src/rime/dict/dictionary.cc
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@ bool compare_chunk_by_head_element(const Chunk& a, const Chunk& b) {
if (!b.entries || b.cursor >= b.size) return true;
if (a.remaining_code.length() != b.remaining_code.length())
return a.remaining_code.length() < b.remaining_code.length();
return a.credibility * a.entries[a.cursor].weight >
b.credibility * b.entries[b.cursor].weight; // by weight desc
return a.credibility + a.entries[a.cursor].weight >
b.credibility + b.entries[b.cursor].weight; // by weight desc
}

size_t match_extra_code(const table::Code* extra_code, size_t depth,
Expand Down Expand Up @@ -87,8 +87,8 @@ an<DictEntry> DictEntryIterator::Peek() {
entry_ = New<DictEntry>();
entry_->code = chunk.code;
entry_->text = table_->GetEntryText(e);
const double kS = 1e8;
entry_->weight = (e.weight + 1) / kS * chunk.credibility;
const double kS = 18.420680743952367; // log(1e8)
entry_->weight = e.weight - kS + chunk.credibility;
if (!chunk.remaining_code.empty()) {
entry_->comment = "~" + chunk.remaining_code;
entry_->remaining_code_length = chunk.remaining_code.length();
Expand Down Expand Up @@ -167,7 +167,7 @@ Dictionary::Lookup(const SyllableGraph& syllable_graph,
for (auto& v : result) {
size_t end_pos = v.first;
for (TableAccessor& a : v.second) {
double cr = initial_credibility * a.credibility();
double cr = initial_credibility + a.credibility();
if (a.extra_code()) {
do {
size_t actual_end_pos = dictionary::match_extra_code(
Expand Down
10 changes: 5 additions & 5 deletions src/rime/dict/dictionary.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,14 @@ struct Chunk {
size_t size = 0;
size_t cursor = 0;
string remaining_code; // for predictive queries
double credibility = 1.0;
double credibility = 0.0;

Chunk() = default;
Chunk(const Code& c, const table::Entry* e, double cr = 1.0)
Chunk(const Code& c, const table::Entry* e, double cr = 0.0)
: code(c), entries(e), size(1), cursor(0), credibility(cr) {}
Chunk(const TableAccessor& a, double cr = 1.0)
Chunk(const TableAccessor& a, double cr = 0.0)
: Chunk(a, string(), cr) {}
Chunk(const TableAccessor& a, const string& r, double cr = 1.0)
Chunk(const TableAccessor& a, const string& r, double cr = 0.0)
: code(a.index_code()), entries(a.entry()),
size(a.remaining()), cursor(0), remaining_code(r), credibility(cr) {}
};
Expand Down Expand Up @@ -88,7 +88,7 @@ class Dictionary : public Class<Dictionary, const Ticket&> {

RIME_API an<DictEntryCollector> Lookup(const SyllableGraph& syllable_graph,
size_t start_pos,
double initial_credibility = 1.0);
double initial_credibility = 0.0);
// if predictive is true, do an expand search with limit,
// otherwise do an exact match.
// return num of matching keys.
Expand Down
2 changes: 1 addition & 1 deletion src/rime/dict/prism.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ struct node_t {

} // namespace

const char kPrismFormat[] = "Rime::Prism/2.0";
const char kPrismFormat[] = "Rime::Prism/3.0";

const char kPrismFormatPrefix[] = "Rime::Prism/";
const size_t kPrismFormatPrefixLen = sizeof(kPrismFormatPrefix) - 1;
Expand Down
10 changes: 5 additions & 5 deletions src/rime/dict/reverse_lookup_dictionary.cc
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

namespace rime {

const char kReverseFormat[] = "Rime::Reverse/2.0";
const char kReverseFormat[] = "Rime::Reverse/3.0";

const char kReverseFormatPrefix[] = "Rime::Reverse/";
const size_t kReverseFormatPrefixLen = sizeof(kReverseFormatPrefix) - 1;
Expand Down Expand Up @@ -102,16 +102,16 @@ bool ReverseDb::Build(DictSettings* settings,
for (const auto& v : rev_table) {
const string& key(v.first);
string value(boost::algorithm::join(v.second, " "));
key_trie_builder.Add(key, 1.0, &key_ids[i]);
value_trie_builder.Add(value, 1.0, &value_ids[i]);
key_trie_builder.Add(key, 0.0, &key_ids[i]);
value_trie_builder.Add(value, 0.0, &value_ids[i]);
++i;
}
// save stems
for (const auto& v : stems) {
string key(v.first + kStemKeySuffix);
string value(boost::algorithm::join(v.second, " "));
key_trie_builder.Add(key, 1.0, &key_ids[i]);
value_trie_builder.Add(value, 1.0, &value_ids[i]);
key_trie_builder.Add(key, 0.0, &key_ids[i]);
value_trie_builder.Add(value, 0.0, &value_ids[i]);
++i;
}
key_trie_builder.Build();
Expand Down
14 changes: 7 additions & 7 deletions src/rime/dict/table.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@

namespace rime {

const char kTableFormatLatest[] = "Rime::Table/3.0";
int kTableFormatLowestCompatible = 3.0;
const char kTableFormatLatest[] = "Rime::Table/4.0";
int kTableFormatLowestCompatible = 4.0;

const char kTableFormatPrefix[] = "Rime::Table/";
const size_t kTableFormatPrefixLen = sizeof(kTableFormatPrefix) - 1;
Expand All @@ -28,10 +28,10 @@ class TableQuery {
}

TableAccessor Access(SyllableId syllable_id,
double credibility = 1.0) const;
double credibility = 0.0) const;

// down to next level
bool Advance(SyllableId syllable_id, double credibility = 1.0);
bool Advance(SyllableId syllable_id, double credibility = 0.0);

// up one level
bool Backdate();
Expand Down Expand Up @@ -136,7 +136,7 @@ bool TableQuery::Advance(SyllableId syllable_id, double credibility) {
}
++level_;
index_code_.push_back(syllable_id);
credibility_.push_back(credibility_.back() * credibility);
credibility_.push_back(credibility_.back() + credibility);
return true;
}

Expand All @@ -155,7 +155,7 @@ void TableQuery::Reset() {
level_ = 0;
index_code_.clear();
credibility_.clear();
credibility_.push_back(1.0);
credibility_.push_back(0.0);
}

inline static bool node_less(const table::TrunkIndexNode& a,
Expand Down Expand Up @@ -216,7 +216,7 @@ inline static Code add_syllable(Code code, SyllableId syllable_id) {

TableAccessor TableQuery::Access(SyllableId syllable_id,
double credibility) const {
credibility *= credibility_.back();
credibility += credibility_.back();
if (level_ == 0) {
if (!lv1_index_ ||
syllable_id < 0 ||
Expand Down
8 changes: 4 additions & 4 deletions src/rime/dict/table.h
Original file line number Diff line number Diff line change
Expand Up @@ -102,11 +102,11 @@ class TableAccessor {
public:
TableAccessor() = default;
TableAccessor(const Code& index_code, const List<table::Entry>* entries,
double credibility = 1.0);
double credibility = 0.0);
TableAccessor(const Code& index_code, const Array<table::Entry>* entries,
double credibility = 1.0);
double credibility = 0.0);
TableAccessor(const Code& index_code, const table::TailIndex* code_map,
double credibility = 1.0);
double credibility = 0.0);

RIME_API bool Next();

Expand All @@ -124,7 +124,7 @@ class TableAccessor {
const table::LongEntry* long_entries_ = nullptr;
size_t size_ = 0;
size_t cursor_ = 0;
double credibility_ = 1.0;
double credibility_ = 0.0;
};

using TableQueryResult = map<int, vector<TableAccessor>>;
Expand Down
12 changes: 7 additions & 5 deletions src/rime/dict/user_dictionary.cc
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
// 2011-10-30 GONG Chen <[email protected]>
//
#include <algorithm>
#include <cfloat>
#include <boost/algorithm/string.hpp>
#include <boost/lexical_cast.hpp>
#include <boost/scope_exit.hpp>
Expand Down Expand Up @@ -225,7 +226,7 @@ void UserDictionary::DfsLookup(const SyllableGraph& syll_graph,
if (i > 0 && props->type >= kAbbreviation)
continue;
state->credibility.push_back(
state->credibility.back() * props->credibility);
state->credibility.back() + props->credibility);
BOOST_SCOPE_EXIT( (&state) ) {
state->credibility.pop_back();
}
Expand Down Expand Up @@ -479,10 +480,11 @@ an<DictEntry> UserDictionary::CreateDictEntry(const string& key,
e->text = key.substr(separator_pos + 1);
e->commit_count = v.commits;
// TODO: argument s not defined...
e->weight = algo::formula_p(0,
(double)v.commits / present_tick,
(double)present_tick,
v.dee) * credibility;
double weight = algo::formula_p(0,
(double)v.commits / present_tick,
(double)present_tick,
v.dee);
e->weight = log(weight > 0 ? weight : DBL_EPSILON) + credibility;
if (full_code) {
*full_code = key.substr(0, separator_pos);
}
Expand Down
14 changes: 7 additions & 7 deletions src/rime/dict/user_dictionary.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,9 +62,9 @@ class UserDictionary : public Class<UserDictionary, const Ticket&> {
bool readonly() const;

an<UserDictEntryCollector> Lookup(const SyllableGraph& syllable_graph,
size_t start_pos,
size_t depth_limit = 0,
double initial_credibility = 1.0);
size_t start_pos,
size_t depth_limit = 0,
double initial_credibility = 0.0);
size_t LookupWords(UserDictEntryIterator* result,
const string& input,
bool predictive,
Expand All @@ -83,10 +83,10 @@ class UserDictionary : public Class<UserDictionary, const Ticket&> {
TickCount tick() const { return tick_; }

static an<DictEntry> CreateDictEntry(const string& key,
const string& value,
TickCount present_tick,
double credibility = 1.0,
string* full_code = NULL);
const string& value,
TickCount present_tick,
double credibility = 0.0,
string* full_code = NULL);

protected:
bool Initialize();
Expand Down
12 changes: 6 additions & 6 deletions src/rime/gear/script_translator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -472,9 +472,9 @@ void ScriptTranslation::PrepareCandidate() {
start_,
start_ + user_phrase_code_length,
entry);
cand->set_quality(entry->weight +
translator_->initial_quality() +
(IsNormalSpelling() ? 0.5 : -0.5));
cand->set_quality(exp(entry->weight) +
translator_->initial_quality() +
(IsNormalSpelling() ? 0.5 : -0.5));
}
else if (phrase_code_length > 0) {
DictEntryIterator& iter(phrase_iter_->second);
Expand All @@ -486,9 +486,9 @@ void ScriptTranslation::PrepareCandidate() {
start_,
start_ + phrase_code_length,
entry);
cand->set_quality(entry->weight +
translator_->initial_quality() +
(IsNormalSpelling() ? 0 : -1));
cand->set_quality(exp(entry->weight) +
translator_->initial_quality() +
(IsNormalSpelling() ? 0 : -1));
}
candidate_ = cand;
}
Expand Down
2 changes: 1 addition & 1 deletion src/rime/gear/table_translator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ an<Candidate> TableTranslation::Peek() {
phrase->set_comment(comment);
phrase->set_preedit(preedit_);
bool incomplete = e->remaining_code_length != 0;
phrase->set_quality(e->weight +
phrase->set_quality(exp(e->weight) +
options_->initial_quality() +
(incomplete ? -1 : 0) +
(is_user_phrase ? 0.5 : 0));
Expand Down
5 changes: 2 additions & 3 deletions src/rime/gear/translator_commons.cc
Original file line number Diff line number Diff line change
Expand Up @@ -88,12 +88,11 @@ bool Spans::HasVertex(size_t vertex) const {
// Sentence

void Sentence::Extend(const DictEntry& entry, size_t end_pos) {
const double kEpsilon = 1e-200;
const double kPenalty = 1e-8;
const double kPenalty = -18.420680743952367; // log(1e-8)
entry_->code.insert(entry_->code.end(),
entry.code.begin(), entry.code.end());
entry_->text.append(entry.text);
entry_->weight *= (std::max)(entry.weight, kEpsilon) * kPenalty;
entry_->weight += entry.weight + kPenalty;
components_.push_back(entry);
syllable_lengths_.push_back(end_pos - end());
set_end(end_pos);
Expand Down
Loading

0 comments on commit 667da57

Please sign in to comment.