Skip to content

Commit

Permalink
Remove stopwords title indexer, update unit tests
Browse files Browse the repository at this point in the history
  • Loading branch information
maneeshpm committed Mar 4, 2021
1 parent af4bb17 commit 311e598
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 16 deletions.
2 changes: 0 additions & 2 deletions src/writer/xapianIndexer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,6 @@ void XapianIndexer::indexTitle(const std::string& path, const std::string& title
indexer.set_stemmer(stemmer);
indexer.set_stemming_strategy(Xapian::TermGenerator::STEM_SOME);
} catch (...) {}
indexer.set_stopper(&stopper);
indexer.set_stopper_strategy(Xapian::TermGenerator::STOP_ALL);
Xapian::Document currentDocument;
currentDocument.clear_values();
currentDocument.set_data(path);
Expand Down
38 changes: 24 additions & 14 deletions test/suggestion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -134,11 +134,11 @@ namespace {

std::vector<std::string> resultSet = getSuggestions(archive, "berlin", archive.getEntryCount());
std::vector<std::string> expectedResult = {
"again berlin",
"berlin",
"not berlin",
"hotel berlin, berlin",
"berlin wall"
"again berlin",
"berlin wall",
"not berlin"
};

ASSERT_EQ(expectedResult , resultSet);
Expand Down Expand Up @@ -244,16 +244,23 @@ namespace {

ASSERT_EQ(expectedResult, resultSet);

// "the" which is a stopword
// "the"
resultSet = getSuggestions(archive, "the", archive.getEntryCount());
expectedResult = {};
expectedResult = {
"The chocolate factory",
"Hour of the wolf",
"The wolf among sheeps",
"The wolf of Shingashina",
"The wolf of Wall Street",
"The wolf of Wall Street Book",
"Terma termb the wolf of wall street termc"
};

ASSERT_EQ(expectedResult, resultSet);

// "the wolf" translates to "wolf"
// "the wolf"
resultSet = getSuggestions(archive, "the wolf", archive.getEntryCount());
expectedResult = {
"Wolf",
"Hour of the wolf",
"The wolf among sheeps",
"The wolf of Shingashina",
Expand All @@ -265,15 +272,18 @@ namespace {
ASSERT_EQ(expectedResult, resultSet);

// "the wolf of"
// this gives an empty result since the word "of" being a stopword is not
// included in the index, but being a partial search, it must be included as
// a `WILDCARD SYNONYM of` which is not present in the index.
resultSet = getSuggestions(archive, "the wolf of", archive.getEntryCount());
expectedResult = {};
expectedResult = {
"Hour of the wolf",
"The wolf of Shingashina",
"The wolf of Wall Street",
"The wolf of Wall Street Book",
"Terma termb the wolf of wall street termc"
};

ASSERT_EQ(expectedResult, resultSet);

// "the wolf of wall" translates to "wolf wall"
// "the wolf of wall"
resultSet = getSuggestions(archive, "the wolf of wall", archive.getEntryCount());
expectedResult = {
"The wolf of Wall Street",
Expand Down Expand Up @@ -315,10 +325,10 @@ namespace {
TempZimArchive tza("testZim");
const zim::Archive archive = tza.createZimFromTitles(titles);

// "she", "and", "the" are stopwords, hence the query is resolved to just "apples"
// "she", "and", "the" are stopwords. If stopwords are properly handled, they
// should be included in the result documents.
std::vector<std::string> resultSet = getSuggestions(archive, "she and the apple", archive.getEntryCount());
std::vector<std::string> expectedResult = {
"apple",
"she and the apple",
};
ASSERT_EQ(expectedResult, resultSet);
Expand Down

0 comments on commit 311e598

Please sign in to comment.