From 5ad6f89f88e54dcfc963cb7d8d6a1c41528d8302 Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Tue, 6 Apr 2021 20:36:27 +0200 Subject: [PATCH] Move query parsing into InternalDataBase. Parsing the query can be done entirely in the database, so let's move it there. --- src/search.cpp | 118 ++++++++++++++++++------------------------ src/search_internal.h | 6 +-- 2 files changed, 54 insertions(+), 70 deletions(-) diff --git a/src/search.cpp b/src/search.cpp index dcaf718c2..ba256ac78 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -74,48 +74,24 @@ std::map read_valuesmap(const std::string &s) { } return result; } - - - - -/* - * subquery_phrase: selects documents that have the terms in the order of the query - * within a specified window. - * subquery_and: selects documents that have all the terms in the query. - * subquery_phrase by itself is quite exclusive. To include more "similar" docs, - * we combine it with subquery_and using OP_OR operator. If a perticular document - * has a weight of A in subquery_phrase and B in subquery_and, the net weight of - * that document becomes A+B. So the documents closer to the query gets a higher. 
- */ -Xapian::Query parse_query(Xapian::QueryParser* query_parser, std::string qs, int flags, std::string prefix, bool suggestion_mode) { - Xapian::Query query, subquery_and; - query = subquery_and = query_parser->parse_query(qs, flags, prefix); - - if (suggestion_mode) { - query_parser->set_default_op(Xapian::Query::op::OP_PHRASE); - Xapian::Query subquery_phrase = query_parser->parse_query(qs); - subquery_phrase = Xapian::Query(Xapian::Query::OP_PHRASE, subquery_phrase.get_terms_begin(), subquery_phrase.get_terms_end(), subquery_phrase.get_length()); - query = Xapian::Query(Xapian::Query::OP_OR, subquery_phrase, subquery_and); - } - - return query; -} - } InternalDataBase::InternalDataBase(const std::vector& archives, bool suggestionMode) - : m_hasNewSuggestionFormat(false) { bool first = true; + bool hasNewSuggestionFormat = false; m_queryParser.set_database(m_database); - m_queryParser.set_default_op(Xapian::Query::op::OP_AND); + m_flags = Xapian::QueryParser::FLAG_DEFAULT; + if (suggestionMode) { + m_flags |= Xapian::QueryParser::FLAG_PARTIAL; + } for(auto& archive: archives) { auto impl = archive.getImpl(); FileImpl::FindxResult r; if (suggestionMode) { r = impl->findx('X', "title/xapian"); if (r.first) { - m_hasNewSuggestionFormat = true; + hasNewSuggestionFormat = true; } } if (!r.first) { @@ -178,7 +154,7 @@ InternalDataBase::InternalDataBase(const std::vector& archives, bool su Xapian::Stem stemmer = Xapian::Stem(languageLocale.getLanguage()); m_queryParser.set_stemmer(stemmer); m_queryParser.set_stemming_strategy( - m_hasNewSuggestionFormat ? Xapian::QueryParser::STEM_SOME : Xapian::QueryParser::STEM_ALL); + hasNewSuggestionFormat ? Xapian::QueryParser::STEM_SOME : Xapian::QueryParser::STEM_ALL); } catch (...) 
{ std::cout << "No steemming for language '" << languageLocale.getLanguage() << "'" << std::endl; } @@ -194,7 +170,10 @@ InternalDataBase::InternalDataBase(const std::vector& archives, bool su stopper->release(); m_queryParser.set_stopper(stopper); } - m_prefixes = database.get_metadata("prefixes"); + auto prefixes = database.get_metadata("prefixes"); + if ( hasNewSuggestionFormat && prefixes.find("S") != std::string::npos ) { /* NOTE(review): the Search::getEnquire code this replaces tested !m_hasNewSuggestionFormat, and only in suggestion mode; the negation is gone here - confirm this is intended. */ + m_prefix = "S"; + } } else { std::map valuesmap = read_valuesmap(database.get_metadata("valuesmap")); if (m_valuesmap != valuesmap ) { @@ -228,9 +207,43 @@ int InternalDataBase::valueSlot(const std::string& valueName) const return m_valuesmap.at(valueName); } -Xapian::QueryParser& InternalDataBase::getQueryParser() +/* + * Build the Xapian query for `query`. The text is first parsed with OP_AND as default operator (using m_flags and m_prefix): this selects documents that have all the terms in the query. + * In suggestion mode, a phrase query (subquery_phrase) is also built: it selects documents that have the terms in the order of the query, within a window equal to the number of terms. + * subquery_phrase by itself is quite exclusive. To include more "similar" docs, + * we combine it with the OP_AND query using the OP_OR operator. If a particular document + * has a weight of A in subquery_phrase and B in the OP_AND query, the net weight of + * that document becomes A+B, so the documents closer to the query get a higher weight. + * For a geo query on a database that has a "geo.position" value, the result is then filtered (OP_FILTER) by a distance posting source, which becomes the whole query when the text is empty. Note that this function changes the default operator of m_queryParser (left at OP_PHRASE after a suggestion-mode call). + */ +Xapian::Query InternalDataBase::parseQuery(const Query& query) { - return m_queryParser; + Xapian::Query xquery; + + m_queryParser.set_default_op(Xapian::Query::op::OP_AND); + xquery = m_queryParser.parse_query(query.m_query, m_flags, m_prefix); + + if (query.m_suggestionMode) { + m_queryParser.set_default_op(Xapian::Query::op::OP_PHRASE); + Xapian::Query subquery_phrase = m_queryParser.parse_query(query.m_query); + // Rebuild the phrase query from its own terms to force the OP_PHRASE window to be equal to the number of terms.
+ subquery_phrase = Xapian::Query(Xapian::Query::OP_PHRASE, subquery_phrase.get_terms_begin(), subquery_phrase.get_terms_end(), subquery_phrase.get_length()); + xquery = Xapian::Query(Xapian::Query::OP_OR, subquery_phrase, xquery); + } + + if (query.m_geoquery && hasValue("geo.position")) { + Xapian::GreatCircleMetric metric; + Xapian::LatLongCoord centre(query.m_latitude, query.m_longitude); + Xapian::LatLongDistancePostingSource ps(valueSlot("geo.position"), centre, metric, query.m_distance); + Xapian::Query geoQuery(&ps); /* NOTE(review): ps is a local object; presumably Xapian::Query keeps its own clone of the posting source - confirm, since the returned xquery outlives ps. */ + if (query.m_query.empty()) { + xquery = geoQuery; + } else { + xquery = Xapian::Query(Xapian::Query::OP_FILTER, xquery, geoQuery); + } + } + + return xquery; } Searcher::Searcher(const std::vector& archives) : @@ -340,40 +353,13 @@ Xapian::Enquire& Search::getEnquire() const return *mp_enquire; } - auto queryParser = mp_internalDb->getQueryParser(); - std::string prefix = ""; - unsigned flags = Xapian::QueryParser::FLAG_DEFAULT; - if (m_query.m_suggestionMode) { - if (m_query.m_verbose) { - std::cout << "Mark query as 'partial'" << std::endl; - } - flags |= Xapian::QueryParser::FLAG_PARTIAL; - if ( !mp_internalDb->m_hasNewSuggestionFormat - && mp_internalDb->m_prefixes.find("S") != std::string::npos ) { - if (m_query.m_verbose) { - std::cout << "Searching in title namespace" << std::endl; - } - prefix = "S"; - } - } - auto query = parse_query(queryParser, m_query.m_query, flags, prefix, m_query.m_suggestionMode); - if (m_query.m_verbose) { - std::cout << "Parsed query '" << m_query.m_query << "' to " << query.get_description() << std::endl; - } - delete queryParser; - auto enquire = std::unique_ptr(new Xapian::Enquire(mp_internalDb->m_database)); - if (m_query.m_geoquery && mp_internalDb->hasValue("geo.position")) { - Xapian::GreatCircleMetric metric; - Xapian::LatLongCoord centre(m_query.m_latitude, m_query.m_longitude); - Xapian::LatLongDistancePostingSource ps(mp_internalDb->valueSlot("geo.position"), centre, metric, m_query.m_distance); - 
if ( m_query.m_query.empty()) { - query = Xapian::Query(&ps); - } else { - query = Xapian::Query(Xapian::Query::OP_FILTER, query, Xapian::Query(&ps)); - } + auto query = mp_internalDb->parseQuery(m_query); + if (m_query.m_verbose) { + std::cout << "Parsed query '" << m_query.m_query << "' to " << query.get_description() << std::endl; } + enquire->set_query(query); /* * In suggestion mode, we are searching over a separate title index. Default BM25 is not @@ -391,8 +377,6 @@ Xapian::Enquire& Search::getEnquire() const } } - enquire->set_query(query); - if (m_query.m_suggestionMode && mp_internalDb->hasValue("targetPath")) { enquire->set_collapse_key(mp_internalDb->valueSlot("targetPath")); diff --git a/src/search_internal.h b/src/search_internal.h index a0f06bb07..34e5edb77 100644 --- a/src/search_internal.h +++ b/src/search_internal.h @@ -37,15 +37,15 @@ class InternalDataBase { bool hasValue(const std::string& valueName) const; int valueSlot(const std::string& valueName) const; - Xapian::QueryParser& getQueryParser(); + Xapian::Query parseQuery(const Query& query); public: // data Xapian::Database m_database; // The (main) database we will search on (wrapping other xapian databases). std::vector m_xapianDatabases; // The real databases. std::vector m_archives; // The archives we are searching on. std::map m_valuesmap; // The valuesmap associated with the database. - std::string m_prefixes; // The prefix stored in the database. - bool m_hasNewSuggestionFormat; // If the database has new suggestion format. + std::string m_prefix; // The prefix to search on. + unsigned m_flags; // Flags to use to parse the query. Xapian::QueryParser m_queryParser; };