Skip to content

Commit

Permalink
Move query parsing in internalDataBase.
Browse files Browse the repository at this point in the history
Parsing the query can be done entirely in the database, so let's move
it there.
  • Loading branch information
mgautierfr committed Apr 19, 2021
1 parent 17345ea commit 29fddde
Show file tree
Hide file tree
Showing 2 changed files with 67 additions and 80 deletions.
141 changes: 64 additions & 77 deletions src/search.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,59 +77,25 @@ std::map<std::string, int> read_valuesmap(const std::string &s) {
return result;
}

/*
 * Build the Xapian query used to search for `qs`.
 *
 * The base query is parsed from `qs` with the given `flags` and `prefix`; it
 * selects documents that have all the terms of the query.
 *
 * In suggestion mode (and only if the base query is not empty) two stricter
 * subqueries are OR-ed onto the base query:
 *  - a phrase query: the terms must appear in the order of the query, within
 *    a window equal to the length of the parsed query;
 *  - an anchored query: same as the phrase query, but built from
 *    ANCHOR_TERM + qs so the match starts from the beginning of the document.
 *
 * Taken alone, the phrase and anchored queries are quite exclusive. OR-ing
 * them with the base query keeps the more loosely matching documents while
 * adding up the weights: a document weighted A by the base query, B by the
 * phrase query and C by the anchored query ends up with A+B+C (normalised out
 * of 100), so the documents closer to the query get a higher relevance.
 *
 * Side effect: in suggestion mode `query_parser` is left configured with
 * OP_OR as default operator and STEM_NONE as stemming strategy.
 */
Xapian::Query parse_query(Xapian::QueryParser* query_parser, std::string qs, int flags, std::string prefix, bool suggestion_mode) {
  Xapian::Query base_query = query_parser->parse_query(qs, flags, prefix);

  // Nothing to add outside suggestion mode or when nothing was parsed.
  if (!suggestion_mode || base_query.empty()) {
    return base_query;
  }

  // The extra subqueries are parsed with OR semantics and without stemming.
  query_parser->set_default_op(Xapian::Query::op::OP_OR);
  query_parser->set_stemming_strategy(Xapian::QueryParser::STEM_NONE);

  // Rebuild a parsed query as an OP_PHRASE over its own terms, using the
  // parsed query length as the phrase window.
  const auto as_phrase = [](const Xapian::Query& parsed) {
    return Xapian::Query(Xapian::Query::OP_PHRASE,
                         parsed.get_terms_begin(),
                         parsed.get_terms_end(),
                         parsed.get_length());
  };

  const Xapian::Query phrase_query = as_phrase(query_parser->parse_query(qs));
  const Xapian::Query anchored_query = as_phrase(query_parser->parse_query(ANCHOR_TERM + qs));

  Xapian::Query combined(Xapian::Query::OP_OR, base_query, phrase_query);
  combined = Xapian::Query(Xapian::Query::OP_OR, combined, anchored_query);
  return combined;
}

} // end of anonymous namespace


InternalDataBase::InternalDataBase(const std::vector<Archive>& archives, bool suggestionMode)
: m_hasNewSuggestionFormat(false)
{
bool first = true;
bool hasNewSuggestionFormat = false;
m_queryParser.set_database(m_database);
m_queryParser.set_default_op(Xapian::Query::op::OP_AND);
m_flags = Xapian::QueryParser::FLAG_DEFAULT;
if (suggestionMode) {
m_flags |= Xapian::QueryParser::FLAG_PARTIAL;
}
for(auto& archive: archives) {
auto impl = archive.getImpl();
FileImpl::FindxResult r;
if (suggestionMode) {
r = impl->findx('X', "title/xapian");
if (r.first) {
m_hasNewSuggestionFormat = true;
hasNewSuggestionFormat = true;
}
}
if (!r.first) {
Expand Down Expand Up @@ -161,6 +127,7 @@ InternalDataBase::InternalDataBase(const std::vector<Archive>& archives, bool su
std::cerr << "dbOffest = " << accessInfo.second << std::endl;
continue;
}

Xapian::Database database;
try {
database = Xapian::Database(databasefd.release());
Expand Down Expand Up @@ -192,7 +159,7 @@ InternalDataBase::InternalDataBase(const std::vector<Archive>& archives, bool su
Xapian::Stem stemmer = Xapian::Stem(languageLocale.getLanguage());
m_queryParser.set_stemmer(stemmer);
m_queryParser.set_stemming_strategy(
m_hasNewSuggestionFormat ? Xapian::QueryParser::STEM_SOME : Xapian::QueryParser::STEM_ALL);
hasNewSuggestionFormat ? Xapian::QueryParser::STEM_SOME : Xapian::QueryParser::STEM_ALL);
} catch (...) {
std::cout << "No steemming for language '" << languageLocale.getLanguage() << "'" << std::endl;
}
Expand All @@ -208,7 +175,10 @@ InternalDataBase::InternalDataBase(const std::vector<Archive>& archives, bool su
stopper->release();
m_queryParser.set_stopper(stopper);
}
m_prefixes = database.get_metadata("prefixes");
auto prefixes = database.get_metadata("prefixes");
if ( hasNewSuggestionFormat && prefixes.find("S") != std::string::npos ) {
m_prefix = "S";
}
} else {
std::map<std::string, int> valuesmap = read_valuesmap(database.get_metadata("valuesmap"));
if (m_valuesmap != valuesmap ) {
Expand Down Expand Up @@ -242,9 +212,55 @@ int InternalDataBase::valueSlot(const std::string& valueName) const
return m_valuesmap.at(valueName);
}

Xapian::QueryParser& InternalDataBase::getQueryParser()
/*
 * Build the Xapian query corresponding to `query`.
 *
 * The text of the query is first parsed with the database's query parser,
 * flags and prefix (subquery_and). In suggestion mode two subqueries are added:
 *
 * subquery_phrase: selects documents that have the terms in the order of the query
 * within a specified window.
 * subquery_anchored: selects documents that have the terms in the order of the
 * query within a specified window and starts from the beginning of the document.
 * subquery_and: selects documents that have all the terms in the query.
 *
 * subquery_phrase and subquery_anchored by themselves are quite exclusive. To
 * include more "similar" docs, we combine them with subquery_and using OP_OR
 * operator. If a particular document has a weight of A in subquery_and and B
 * in subquery_phrase and C in subquery_anchored, the net weight of that document
 * becomes A+B+C (normalised out of 100). So the documents closer to the query
 * get a higher relevance.
 *
 * Finally, for a geo query on a database that has a "geo.position" value, the
 * result is either the geo query alone (empty text query) or the text query
 * filtered by the geo query.
 */
Xapian::Query InternalDataBase::parseQuery(const Query& query)
{
  Xapian::Query xquery = m_queryParser.parse_query(query.m_query, m_flags, m_prefix);

  if (query.m_suggestionMode && !query.m_query.empty()) {
    // NOTE(review): Xapian documents QueryParser copies as sharing
    // reference-counted internals; confirm that the two set_* calls below do
    // not also reconfigure m_queryParser for later calls.
    Xapian::QueryParser suggestionParser = m_queryParser;
    suggestionParser.set_default_op(Xapian::Query::op::OP_OR);
    suggestionParser.set_stemming_strategy(Xapian::QueryParser::STEM_NONE);

    Xapian::Query subquery_phrase = suggestionParser.parse_query(query.m_query);
    // Force the OP_PHRASE window to be equal to the number of terms.
    subquery_phrase = Xapian::Query(Xapian::Query::OP_PHRASE,
                                    subquery_phrase.get_terms_begin(),
                                    subquery_phrase.get_terms_end(),
                                    subquery_phrase.get_length());

    // Same phrase query, but with the anchor term prepended to the query text.
    auto qs = ANCHOR_TERM + query.m_query;
    Xapian::Query subquery_anchored = suggestionParser.parse_query(qs);
    subquery_anchored = Xapian::Query(Xapian::Query::OP_PHRASE,
                                      subquery_anchored.get_terms_begin(),
                                      subquery_anchored.get_terms_end(),
                                      subquery_anchored.get_length());

    xquery = Xapian::Query(Xapian::Query::OP_OR, xquery, subquery_phrase);
    xquery = Xapian::Query(Xapian::Query::OP_OR, xquery, subquery_anchored);
  }

  if (query.m_geoquery && hasValue("geo.position")) {
    // Posting source built from the "geo.position" value slot, the requested
    // centre and query.m_distance.
    Xapian::GreatCircleMetric metric;
    Xapian::LatLongCoord centre(query.m_latitude, query.m_longitude);
    Xapian::LatLongDistancePostingSource ps(valueSlot("geo.position"), centre, metric, query.m_distance);
    Xapian::Query geoQuery(&ps);
    if (query.m_query.empty()) {
      // No text to search for: the geo query is the whole query.
      xquery = geoQuery;
    } else {
      xquery = Xapian::Query(Xapian::Query::OP_FILTER, xquery, geoQuery);
    }
  }

  return xquery;
}

Searcher::Searcher(const std::vector<Archive>& archives) :
Expand Down Expand Up @@ -356,40 +372,13 @@ Xapian::Enquire& Search::getEnquire() const
return *mp_enquire;
}

auto queryParser = mp_internalDb->getQueryParser();
std::string prefix = "";
unsigned flags = Xapian::QueryParser::FLAG_DEFAULT;
if (m_query.m_suggestionMode) {
if (m_query.m_verbose) {
std::cout << "Mark query as 'partial'" << std::endl;
}
flags |= Xapian::QueryParser::FLAG_PARTIAL;
if ( !mp_internalDb->m_hasNewSuggestionFormat
&& mp_internalDb->m_prefixes.find("S") != std::string::npos ) {
if (m_query.m_verbose) {
std::cout << "Searching in title namespace" << std::endl;
}
prefix = "S";
}
}
auto query = parse_query(queryParser, m_query.m_query, flags, prefix, m_query.m_suggestionMode);
if (m_query.m_verbose) {
std::cout << "Parsed query '" << m_query.m_query << "' to " << query.get_description() << std::endl;
}
delete queryParser;

auto enquire = std::unique_ptr<Xapian::Enquire>(new Xapian::Enquire(mp_internalDb->m_database));

if (m_query.m_geoquery && mp_internalDb->hasValue("geo.position")) {
Xapian::GreatCircleMetric metric;
Xapian::LatLongCoord centre(m_query.m_latitude, m_query.m_longitude);
Xapian::LatLongDistancePostingSource ps(mp_internalDb->valueSlot("geo.position"), centre, metric, m_query.m_distance);
if ( m_query.m_query.empty()) {
query = Xapian::Query(&ps);
} else {
query = Xapian::Query(Xapian::Query::OP_FILTER, query, Xapian::Query(&ps));
}
auto query = mp_internalDb->parseQuery(m_query);
if (m_query.m_verbose) {
std::cout << "Parsed query '" << m_query.m_query << "' to " << query.get_description() << std::endl;
}
enquire->set_query(query);

/*
* In suggestion mode, we are searching over a separate title index. Default BM25 is not
Expand All @@ -407,8 +396,6 @@ Xapian::Enquire& Search::getEnquire() const
}
}

enquire->set_query(query);


if (m_query.m_suggestionMode && mp_internalDb->hasValue("targetPath")) {
enquire->set_collapse_key(mp_internalDb->valueSlot("targetPath"));
Expand Down
6 changes: 3 additions & 3 deletions src/search_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,15 +37,15 @@ class InternalDataBase {
bool hasValue(const std::string& valueName) const;
int valueSlot(const std::string& valueName) const;

Xapian::QueryParser& getQueryParser();
Xapian::Query parseQuery(const Query& query);

public: // data
Xapian::Database m_database; // The (main) database we will search on (wrapping other xapian databases).
std::vector<Xapian::Database> m_xapianDatabases; // The real databases.
std::vector<Archive> m_archives; // The archives we are searching on.
std::map<std::string, int> m_valuesmap; // The valuesmap associated with the database.
std::string m_prefixes; // The prefix stored in the database.
bool m_hasNewSuggestionFormat; // If the database has new suggestion format.
std::string m_prefix; // The prefix to search on.
unsigned m_flags; // Flags to use to parse the query.
Xapian::QueryParser m_queryParser;
};

Expand Down

0 comments on commit 29fddde

Please sign in to comment.