Skip to content

Commit

Permalink
maiden: optional globally fuzzy search; add element hack
Browse files Browse the repository at this point in the history
Fuzzy search is pointless with clients that apply their own sort order
(and especially use an unstable sort, like element)

Hack to work around matrix-org/matrix-react-sdk#9556
  • Loading branch information
maxmalek committed Nov 29, 2022
1 parent 52818c7 commit 8d66a7c
Show file tree
Hide file tree
Showing 10 changed files with 66 additions and 29 deletions.
12 changes: 5 additions & 7 deletions binmaiden/maiden.json
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,13 @@
"search": {
"fields": {
// field => true to search with default params
// - default is partial/substring search
// field => {configs} for extended search
// config fields:
// - fuzzy: true to enable fuzzy search for this key
"mail": true,
"phone": true,
"realname": { "fuzzy": true }
"realname": true,
},
"displayname": "displayname",
"fuzzy": false, // Makes the search fuzzy. Best matches first. Useless+confusing if clients apply their own sort order.
"element_hack": true, // If query is not part of a result, add it. Required to show mismatched result in Element.
"displayname": "displayname", // Field to show as the display name
//"avatar_url": "mxc://...",
"homeserver": { "host": "localhost", "port": 8008, "ssl": false },
"reverseproxy": true,
Expand All @@ -53,7 +51,7 @@
},
},
// Optionally, serve .well-known if no other service is serving it already
// Clients espect this on https (port 443) with a valid cert.
// Clients expect this on https (port 443) with a valid cert.
{
"listen": [
{ "port": 443, "ssl": true },
Expand Down
12 changes: 7 additions & 5 deletions dep/fts_fuzzy_match.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@


#include <cstdint> // uint8_t
#include <ctype.h> // ::tolower, ::toupper
//#include <ctype.h> // ::tolower, ::toupper
#include <cstring> // memcpy

#include <cstdio>
Expand All @@ -58,7 +58,8 @@ namespace fts {
// Public interface
bool fuzzy_match_simple(char const * pattern, char const * str) {
while (*pattern != '\0' && *str != '\0') {
if (tolower(*pattern) == tolower(*str))
//if (tolower(*pattern) == tolower(*str))
if ((*pattern) == (*str))
++pattern;
++str;
}
Expand Down Expand Up @@ -103,7 +104,8 @@ namespace fts {
while (*pattern != '\0' && *str != '\0') {

// Found match
if (tolower(*pattern) == tolower(*str)) {
//if (tolower(*pattern) == tolower(*str)) {
if ((*pattern) == (*str)) {

// Supplied matches buffer was too short
if (nextMatch >= maxMatches)
Expand Down Expand Up @@ -183,8 +185,8 @@ namespace fts {
// Camel case
char neighbor = strBegin[currIdx - 1];
char curr = strBegin[currIdx];
if (curr > 0 && neighbor > 0 && ::islower(neighbor) && ::isupper(curr))
outScore += camel_bonus;
//if (curr > 0 && neighbor > 0 && ::islower(neighbor) && ::isupper(curr))
// outScore += camel_bonus;

// Separator
bool neighborSeparator = neighbor == '_' || neighbor == ' ';
Expand Down
6 changes: 4 additions & 2 deletions src/maiden/mxsearch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ void MxSearch::rebuildCache(const MxStore & mxs)
printf("MxSearch::rebuildCache() done after %u ms\n", (unsigned)timer.ms());
}

MxSearch::Matches MxSearch::searchExact(const MxMatcherList& matchers) const
MxSearch::Matches MxSearch::search(const MxMatcherList& matchers, bool fuzzy) const
{
std::shared_lock lock(mutex);
//-----------------------------------------------------------
Expand All @@ -95,6 +95,8 @@ MxSearch::Matches MxSearch::searchExact(const MxMatcherList& matchers) const
for(size_t i = 0; i < N; ++i)
{
int score = mxMatchAndScore_Exact(_strings[i].s, _strings[i].len, matchers.data(), matchers.size());
if(fuzzy)
score += mxMatchAndScore_Fuzzy(_strings[i].s, matchers.data(), matchers.size());
if(score > 0)
{
Match m;
Expand All @@ -103,7 +105,7 @@ MxSearch::Matches MxSearch::searchExact(const MxMatcherList& matchers) const
hits.push_back(m);
}
}
printf("MxSearch::searchExact() took %u ms\n", (unsigned)timer.ms());
printf("MxSearch::search() took %u ms\n", (unsigned)timer.ms());
return hits;
}

Expand Down
8 changes: 5 additions & 3 deletions src/maiden/mxsearch.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@ class TwoWayMatcher;

struct MxSearchConfig
{
struct Field
struct Field // Affects how the search cache is generated from each field
{
bool fuzzy = false;
//bool fuzzy = false; // No longer possible because we're not searching individual fields anymore
};
typedef std::unordered_map<std::string, Field> Fields;
Fields fields;
Expand All @@ -25,6 +25,8 @@ struct MxSearchConfig
// -- below here is not used by mxstore --
std::string avatar_url;
size_t maxsize = 1024; // max. size of search request, json and all
bool fuzzy = false;
bool element_hack = false;
};


Expand Down Expand Up @@ -56,7 +58,7 @@ class MxSearch : public EvTreeRebuilt

typedef std::vector<Match> Matches;

Matches searchExact(const MxMatcherList& matchers) const;
Matches search(const MxMatcherList& matchers, bool fuzzy) const;

// Inherited via EvTreeRebuilt
virtual void onTreeRebuilt(const MxStore& mxs) override;
Expand Down
13 changes: 13 additions & 0 deletions src/maiden/mxsearchalgo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include "strmatch.h"
#include <assert.h>
#include <ctype.h>
#include "fts_fuzzy_match.h"

// TODO: this should be using some variation of two-way string matching
// see https://git.musl-libc.org/cgit/musl/tree/src/string/memmem.c
Expand Down Expand Up @@ -88,6 +89,18 @@ int mxMatchAndScore_Exact(const char *haystack, size_t haylen, const TwoWayCasef
return score;
}

int mxMatchAndScore_Fuzzy(const char* haystack, const TwoWayCasefoldMatcher* matchers, size_t nummatchers)
{
int score = 0;
for (size_t i = 0; i < nummatchers; ++i)
{
int bestmatch = 0;
if(fts::fuzzy_match(matchers[i].needle(), haystack, bestmatch))
score += bestmatch;
}
return score;
}

bool mxSearchNormalizeAppend(std::vector<unsigned char>& vec, const char* s, size_t len)
{
assert(len);
Expand Down
1 change: 1 addition & 0 deletions src/maiden/mxsearchalgo.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@ MxMatcherList mxBuildMatchersForTerm(const char *term);
bool mxSearchNormalizeAppend(std::vector<unsigned char>& vec, const char *s, size_t len);

int mxMatchAndScore_Exact(const char *haystack, size_t haylen, const TwoWayCasefoldMatcher *matchers, size_t nummatchers);
int mxMatchAndScore_Fuzzy(const char *haystack, const TwoWayCasefoldMatcher *matchers, size_t nummatchers);
25 changes: 18 additions & 7 deletions src/maiden/mxservices.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,9 +87,10 @@ MxSearchHandler::MxSearchHandler(MxStore& store, VarCRef cfg, MxSources& sources
}
else if(const Var::Map *fm = val.map())
{
VarCRef f(xfields.mem, &val);
/*VarCRef f(xfields.mem, &val);
VarCRef xfuzzy = f.lookup("fuzzy");
fcfg.fuzzy = xfuzzy && xfuzzy.asBool();
fcfg.fuzzy = xfuzzy && xfuzzy.asBool();*/
assert(false); // atm not supported
}
else
{
Expand All @@ -108,6 +109,12 @@ MxSearchHandler::MxSearchHandler(MxStore& store, VarCRef cfg, MxSources& sources
}
}

if (VarCRef xfuzzy = cfg.lookup("fuzzy"))
searchcfg.fuzzy = xfuzzy && xfuzzy.asBool();

if (VarCRef xeh = cfg.lookup("element_hack"))
searchcfg.element_hack = xeh && xeh.asBool();

if(VarCRef xurl = cfg.lookup("avatar_url"))
if(const char *url = xurl.asCString())
searchcfg.avatar_url = url;
Expand Down Expand Up @@ -147,15 +154,16 @@ MxSearchHandler::MxSearchHandler(MxStore& store, VarCRef cfg, MxSources& sources
printf("MxSearchHandler: max. client request size = %u\n", (unsigned)searchcfg.maxsize);
printf("MxSearchHandler: avatar_url = %s\n", searchcfg.avatar_url.c_str());
printf("MxSearchHandler: displayname = %s\n", searchcfg.displaynameField.c_str());
printf("MxSearchHandler: fuzzy global search = %d\n", searchcfg.fuzzy);
printf("MxSearchHandler: Element substring HACK = %d\n", searchcfg.element_hack);
printf("MxSearchHandler: searching %u fields:\n", (unsigned)searchcfg.fields.size());
for(MxSearchConfig::Fields::iterator it = searchcfg.fields.begin(); it != searchcfg.fields.end(); ++it)
printf(" + %s [fuzzy = %u]\n", it->first.c_str(), it->second.fuzzy);
printf(" + %s\n", it->first.c_str());
printf("MxSearchHandler: Reverse proxy enabled: %s\n", reverseproxy ? "yes" : "no");
printf("MxSearchHandler: Ask homeserver: %s\n", askHS ? "yes" : "no");
printf("MxSearchHandler: Ask homeserver timeout: %d ms\n", hsTimeout);
printf("MxSearchHandler: Check homeserver: %s\n", checkHS ? "yes" : "no");

// FIXME: remove this again in dtor
sources.addListener(&this->search);
}

Expand All @@ -174,7 +182,7 @@ void MxSearchHandler::doSearch(VarRef dst, const char* term, size_t limit) const
puts(os.str().c_str());


MxSearch::Matches hits = search.searchExact(matchers);
MxSearch::Matches hits = search.search(matchers, searchcfg.fuzzy);

// keep best matches, drop the rest if above the limit
bool limited = false;
Expand All @@ -186,20 +194,23 @@ void MxSearchHandler::doSearch(VarRef dst, const char* term, size_t limit) const
}

// resolve matches to something readable
const MxStore::SearchResults results = _store.formatMatches(searchcfg, hits.data(), hits.size());
MxStore::SearchResults results = _store.formatMatches(searchcfg, hits.data(), hits.size());

dst.makeMap().v->map()->clear(*dst.mem); // make sure it's an empty map

dst["limited"] = limited;
VarRef ra = dst["results"].makeArray(results.size());

const bool useAvatar = !searchcfg.avatar_url.empty();
const bool elementHack = searchcfg.element_hack;
for (size_t i = 0; i < results.size(); ++i)
{
VarRef d = ra.at(i).makeMap();
const MxStore::SearchResult& r = results[i];
MxStore::SearchResult& r = results[i];
if (useAvatar)
d["avatar_url"] = searchcfg.avatar_url.c_str();
if(elementHack && !strstr(r.displayname.c_str(), term))
r.displayname = r.displayname + " // " + term;
if (!r.displayname.empty())
d["display_name"] = r.displayname.c_str();
d["user_id"] = r.str.c_str();
Expand Down
14 changes: 11 additions & 3 deletions src/maiden/mxsources.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -350,9 +350,15 @@ static void _OnTreeRebuilt(EvTreeRebuilt *ev, const MxStore *mxs)

// Dispatches the tree-rebuilt event to every listener in _evRebuilt.
// One std::async task is started per listener; each task runs _OnTreeRebuilt
// with that listener and a pointer to _store.
// _eventlock is held only while the listener list is read and the tasks are
// started; it is released before the futures are destroyed.
// NOTE(review): std::async is called without an explicit launch policy, so an
// implementation may defer a task; a deferred task whose future is destroyed
// without get()/wait() never runs -- confirm whether std::launch::async is intended.
void MxSources::_sendTreeRebuiltEvent() const
{
std::vector<std::future<void> > futs(_evRebuilt.size());
for(size_t i = 0; i < _evRebuilt.size(); ++i)
futs[i] = std::move(std::async(_OnTreeRebuilt, _evRebuilt[i], &_store));
std::vector<std::future<void> > futs;
{
std::unique_lock lock(_eventlock);
//----------------------------------
futs.resize(_evRebuilt.size());
for(size_t i = 0; i < _evRebuilt.size(); ++i)
futs[i] = std::move(std::async(_OnTreeRebuilt, _evRebuilt[i], &_store));
}
// don't keep it locked while the futures finish
}

void MxSources::_updateEnv(VarCRef xenv)
Expand Down Expand Up @@ -394,6 +400,7 @@ void MxSources::_updateEnv(VarCRef xenv)
void MxSources::addListener(EvTreeRebuilt* ev)
{
std::unique_lock lock(_eventlock);
//----------------------------------
for(size_t i = 0; i < _evRebuilt.size(); ++i)
if(_evRebuilt[i] == ev)
return;
Expand All @@ -403,6 +410,7 @@ void MxSources::addListener(EvTreeRebuilt* ev)
// Unregisters a listener so it no longer receives tree-rebuilt events.
// Safe to call with a listener that was never registered (no-op).
// Takes _eventlock for the duration of the list modification.
void MxSources::removeListener(EvTreeRebuilt* ev)
{
    std::unique_lock lock(_eventlock);
    //----------------------------------
    // Erase-remove idiom: the two-iterator erase() is required here.
    // The single-iterator form would erase end() (undefined behaviour) when
    // ev is not in the list, and would drop only one element otherwise.
    _evRebuilt.erase(std::remove(_evRebuilt.begin(), _evRebuilt.end(), ev), _evRebuilt.end());
}

Expand Down
3 changes: 2 additions & 1 deletion src/maiden/mxsources.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,8 @@ class MxSources
std::thread _th;
std::atomic<bool> _quit;
std::condition_variable _waiter;
std::mutex _waitlock, _eventlock;
std::mutex _waitlock;
mutable std::mutex _eventlock;
std::vector<std::string> _envStrings;
std::vector<const char*> _envPtrs;
std::vector<EvTreeRebuilt*> _evRebuilt;
Expand Down
1 change: 0 additions & 1 deletion src/maiden/mxstore.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
#include "rng.h"
#include "tomcrypt.h"
#include "scopetimer.h"
#include "fts_fuzzy_match.h"
#include <future>
#include "strmatch.h"

Expand Down

0 comments on commit 8d66a7c

Please sign in to comment.