Skip to content

Commit

Permalink
Merge pull request #526 from openzim/anchor_suggestion_search
Browse files Browse the repository at this point in the history
  • Loading branch information
mgautierfr authored Apr 7, 2021
2 parents 3bc873b + b166aca commit 4e22ac3
Show file tree
Hide file tree
Showing 4 changed files with 69 additions and 11 deletions.
20 changes: 20 additions & 0 deletions src/constants.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
/*
* Copyright (C) 2021 Maneesh P M <[email protected]>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
* warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
* NON-INFRINGEMENT. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*
*/

#ifndef ZIM_CONSTANTS_H
#define ZIM_CONSTANTS_H

/*
 * Synthetic term used to mark the beginning of an indexed title.
 *
 * The title indexer prepends it to the unaccented title before indexing, and
 * the suggestion search prepends it to the query string when building its
 * "anchored" phrase subquery. Both sides must use the exact same value.
 * The trailing space separates the term from the text it is prepended to.
 *
 * Kept as a macro (string literal) so it can be concatenated with std::string
 * via operator+ at the existing call sites.
 */
#define ANCHOR_TERM "0posanchor "

#endif // ZIM_CONSTANTS_H
29 changes: 22 additions & 7 deletions src/search.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@
#include "xapian.h"
#include <unicode/locid.h>

#include "constants.h"

#define MAX_MATCHES_TO_SORT 10000

namespace zim
Expand Down Expand Up @@ -118,21 +120,34 @@ setup_queryParser(Xapian::QueryParser* queryParser,
/*
 * Parse `qs` with `query_parser` and return the query to run.
 *
 * Outside suggestion mode, or when the parsed query is empty, the result is
 * exactly query_parser->parse_query(qs, flags, prefix).
 *
 * In suggestion mode three subqueries are built:
 *
 * subquery_and: selects documents that have all the terms in the query.
 * subquery_phrase: selects documents that have the terms in the order of the
 * query within a specified window.
 * subquery_anchored: selects documents that have the terms in the order of the
 * query within a specified window and starts from the beginning of the document.
 *
 * subquery_phrase and subquery_anchored by themselves are quite exclusive. To
 * include more "similar" docs, we combine them with subquery_and using the
 * OP_OR operator. If a particular document has a weight of A in subquery_and,
 * B in subquery_phrase and C in subquery_anchored, the net weight of that
 * document becomes A+B+C (normalised out of 100). So the documents closer to
 * the query get a higher relevance.
 */
Xapian::Query parse_query(Xapian::QueryParser* query_parser, std::string qs, int flags, std::string prefix, bool suggestion_mode) {
  Xapian::Query query, subquery_and;
  query = subquery_and = query_parser->parse_query(qs, flags, prefix);

  // An empty parsed query has no terms to build the phrase subqueries from.
  if (suggestion_mode && !query.empty()) {
    Xapian::Query subquery_phrase, subquery_anchored;

    // NOTE(review): the default operator is not restored afterwards, so the
    // parser stays in OP_PHRASE mode for any later use - confirm that callers
    // do not reuse it expecting the previous operator.
    query_parser->set_default_op(Xapian::Query::op::OP_PHRASE);

    // NOTE(review): unlike subquery_and, the two parses below do not pass
    // `flags` and `prefix` - presumably intentional; verify.
    subquery_phrase = query_parser->parse_query(qs);
    // Rebuild as one flat OP_PHRASE over the parsed terms, using the parsed
    // query's length as the window.
    subquery_phrase = Xapian::Query(Xapian::Query::OP_PHRASE, subquery_phrase.get_terms_begin(), subquery_phrase.get_terms_end(), subquery_phrase.get_length());

    // Prefix the query with the anchor term (the indexer prepends the same
    // term to titles), then build the phrase the same way as above.
    qs = ANCHOR_TERM + qs;
    subquery_anchored = query_parser->parse_query(qs);
    subquery_anchored = Xapian::Query(Xapian::Query::OP_PHRASE, subquery_anchored.get_terms_begin(), subquery_anchored.get_terms_end(), subquery_anchored.get_length());

    // `query` still holds subquery_and here; OR the two phrase queries into it.
    query = Xapian::Query(Xapian::Query::OP_OR, query, subquery_phrase);
    query = Xapian::Query(Xapian::Query::OP_OR, query, subquery_anchored);
  }

  return query;
}
Expand Down
4 changes: 3 additions & 1 deletion src/writer/xapianIndexer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include "libzim-resources.h"
#include "fs.h"
#include "tools.h"
#include "../constants.h"
#include <sstream>
#include <fstream>
#include <stdexcept>
Expand Down Expand Up @@ -132,7 +133,8 @@ void XapianIndexer::indexTitle(const std::string& path, const std::string& title
}

if (!unaccentedTitle.empty()) {
indexer.index_text(unaccentedTitle, 1);
std::string anchoredTitle = ANCHOR_TERM + unaccentedTitle;
indexer.index_text(anchoredTitle, 1);
}

/* add to the database */
Expand Down
27 changes: 24 additions & 3 deletions test/suggestion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -139,9 +139,9 @@ namespace {
std::vector<std::string> resultSet = getSuggestions(archive, "berlin", archive.getEntryCount());
std::vector<std::string> expectedResult = {
"berlin",
"berlin wall",
"hotel berlin, berlin",
"again berlin",
"berlin wall",
"not berlin"
};

Expand Down Expand Up @@ -252,11 +252,11 @@ namespace {
resultSet = getSuggestions(archive, "the", archive.getEntryCount());
expectedResult = {
"The chocolate factory",
"Hour of the wolf",
"The wolf among sheeps",
"The wolf of Shingashina",
"The wolf of Wall Street",
"The wolf of Wall Street Book",
"Hour of the wolf",
"Terma termb the wolf of wall street termc"
};

Expand All @@ -265,11 +265,11 @@ namespace {
// "the wolf"
resultSet = getSuggestions(archive, "the wolf", archive.getEntryCount());
expectedResult = {
"Hour of the wolf",
"The wolf among sheeps",
"The wolf of Shingashina",
"The wolf of Wall Street",
"The wolf of Wall Street Book",
"Hour of the wolf",
"Terma termb the wolf of wall street termc"
};

Expand Down Expand Up @@ -417,4 +417,25 @@ namespace {
};
ASSERT_EQ(resultSet, expectedResult);
}

// A title that begins with the search string is expected to be suggested
// before titles that only contain the search string further in.
TEST(Suggestion, anchorQueryToBeginning) {
  // The query phrase appears after two, zero and one leading terms.
  std::vector<std::string> entryTitles = {
    "aterm bterm this is a title cterm",
    "this is a title aterm bterm cterm",
    "aterm this is a title bterm cterm"
  };

  // Expected order: the title starting with the query comes first.
  const std::vector<std::string> expectedOrder = {
    "this is a title aterm bterm cterm",
    "aterm bterm this is a title cterm",
    "aterm this is a title bterm cterm"
  };

  TempZimArchive tza("testZim");
  const zim::Archive archive = tza.createZimFromTitles(entryTitles);

  // Ask for as many suggestions as the archive has entries.
  const std::vector<std::string> suggestions =
      getSuggestions(archive, "This is a title", archive.getEntryCount());

  ASSERT_EQ(expectedOrder, suggestions);
}
}

0 comments on commit 4e22ac3

Please sign in to comment.