Merge pull request #526 from openzim/anchor_suggestion_search

openzim · Apr 7, 2021 · 4e22ac3 · 4e22ac3
2 parents 3bc873b + b166aca
commit 4e22ac3
Show file tree

Hide file tree

Showing 4 changed files with 69 additions and 11 deletions.
diff --git a/src/constants.h b/src/constants.h
@@ -0,0 +1,20 @@
+/*
+ * Copyright (C) 2021 Maneesh P M <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#define ANCHOR_TERM "0posanchor "
diff --git a/src/search.cpp b/src/search.cpp
@@ -38,6 +38,8 @@
 #include "xapian.h"
 #include <unicode/locid.h>
 
+#include "constants.h"
+
 #define MAX_MATCHES_TO_SORT 10000
 
 namespace zim
@@ -118,21 +120,34 @@ setup_queryParser(Xapian::QueryParser* queryParser,
 /*
  * subquery_phrase: selects documents that have the terms in the order of the query
  * within a specified window.
+ * subquery_anchored: selects documents that have the terms in the order of the
+ * query within a specified window, starting from the beginning of the document.
  * subquery_and: selects documents that have all the terms in the query.
- * subquery_phrase by itself is quite exclusive. To include more "similar" docs,
- * we combine it with subquery_and using OP_OR operator. If a perticular document
- * has a weight of A in subquery_phrase and B in subquery_and, the net weight of
- * that document becomes A+B. So the documents closer to the query gets a higher.
+ *
+ * subquery_phrase and subquery_anchored by themselves are quite exclusive. To
+ * include more "similar" docs, we combine them with subquery_and using OP_OR
+ * operator. If a particular document has a weight of A in subquery_and and B
+ * in subquery_phrase and C in subquery_anchored, the net weight of that document
+ * becomes A+B+C (normalised out of 100). So the documents closer to the query
+ * get a higher relevance.
  */
 Xapian::Query parse_query(Xapian::QueryParser* query_parser, std::string qs, int flags, std::string prefix, bool suggestion_mode) {
     Xapian::Query query, subquery_and;
     query = subquery_and = query_parser->parse_query(qs, flags, prefix);
 
-    if (suggestion_mode) {
+    if (suggestion_mode && !query.empty()) {
+      Xapian::Query subquery_phrase, subquery_anchored;
       query_parser->set_default_op(Xapian::Query::op::OP_PHRASE);
-      Xapian::Query subquery_phrase = query_parser->parse_query(qs);
+
+      subquery_phrase = query_parser->parse_query(qs);
       subquery_phrase = Xapian::Query(Xapian::Query::OP_PHRASE, subquery_phrase.get_terms_begin(), subquery_phrase.get_terms_end(), subquery_phrase.get_length());
-      query = Xapian::Query(Xapian::Query::OP_OR, subquery_phrase, subquery_and);
+
+      qs = ANCHOR_TERM + qs;
+      subquery_anchored = query_parser->parse_query(qs);
+      subquery_anchored = Xapian::Query(Xapian::Query::OP_PHRASE, subquery_anchored.get_terms_begin(), subquery_anchored.get_terms_end(), subquery_anchored.get_length());
+
+      query = Xapian::Query(Xapian::Query::OP_OR, query, subquery_phrase);
+      query = Xapian::Query(Xapian::Query::OP_OR, query, subquery_anchored);
     }
 
     return query;

diff --git a/src/writer/xapianIndexer.cpp b/src/writer/xapianIndexer.cpp
@@ -21,6 +21,7 @@
 #include "libzim-resources.h"
 #include "fs.h"
 #include "tools.h"
+#include "../constants.h"
 #include <sstream>
 #include <fstream>
 #include <stdexcept>
@@ -132,7 +133,8 @@ void XapianIndexer::indexTitle(const std::string& path, const std::string& title
   }
 
   if (!unaccentedTitle.empty()) {
-    indexer.index_text(unaccentedTitle, 1);
+    std::string anchoredTitle = ANCHOR_TERM + unaccentedTitle;
+    indexer.index_text(anchoredTitle, 1);
   }
 
   /* add to the database */

diff --git a/test/suggestion.cpp b/test/suggestion.cpp
@@ -139,9 +139,9 @@ namespace {
     std::vector<std::string> resultSet = getSuggestions(archive, "berlin", archive.getEntryCount());
     std::vector<std::string> expectedResult = {
                                                 "berlin",
+                                                "berlin wall",
                                                 "hotel berlin, berlin",
                                                 "again berlin",
-                                                "berlin wall",
                                                 "not berlin"
                                               };
 
@@ -252,11 +252,11 @@ namespace {
     resultSet = getSuggestions(archive, "the", archive.getEntryCount());
     expectedResult = {
                        "The chocolate factory",
-                       "Hour of the wolf",
                        "The wolf among sheeps",
                        "The wolf of Shingashina",
                        "The wolf of Wall Street",
                        "The wolf of Wall Street Book",
+                       "Hour of the wolf",
                        "Terma termb the wolf of wall street termc"
                      };
 
@@ -265,11 +265,11 @@ namespace {
     // "the wolf"
     resultSet = getSuggestions(archive, "the wolf", archive.getEntryCount());
     expectedResult = {
-                       "Hour of the wolf",
                        "The wolf among sheeps",
                        "The wolf of Shingashina",
                        "The wolf of Wall Street",
                        "The wolf of Wall Street Book",
+                       "Hour of the wolf",
                        "Terma termb the wolf of wall street termc"
                      };
 
@@ -417,4 +417,25 @@ namespace {
                                               };
     ASSERT_EQ(resultSet, expectedResult);
   }
+
+  // Titles which begin with the search string should have higher relevance
+  TEST(Suggestion, anchorQueryToBeginning) {
+    std::vector<std::string> titles = {
+                                        "aterm bterm this is a title cterm",
+                                        "this is a title aterm bterm cterm",
+                                        "aterm this is a title bterm cterm"
+                                      };
+
+    TempZimArchive tza("testZim");
+    const zim::Archive archive = tza.createZimFromTitles(titles);
+
+    std::vector<std::string> resultSet = getSuggestions(archive, "This is a title", archive.getEntryCount());
+    std::vector<std::string> expectedResult = {
+                                                "this is a title aterm bterm cterm",
+                                                "aterm bterm this is a title cterm",
+                                                "aterm this is a title bterm cterm"
+                                              };
+
+    ASSERT_EQ(expectedResult, resultSet);
+  }
 }