Skip to content

Commit

Permalink
Merge pull request #873 from openzim/xapian_exception
Browse files Browse the repository at this point in the history
  • Loading branch information
mgautierfr authored Apr 16, 2024
2 parents a34c21c + 75c47ab commit 8af2881
Show file tree
Hide file tree
Showing 5 changed files with 193 additions and 123 deletions.
111 changes: 67 additions & 44 deletions src/search.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
*
*/

#include <zim/error.h>
#include <zim/search.h>
#include <zim/archive.h>
#include <zim/item.h>
Expand Down Expand Up @@ -77,51 +78,57 @@ InternalDataBase::InternalDataBase(const std::vector<Archive>& archives, bool ve
continue;
}

if ( first ) {
m_valuesmap = read_valuesmap(database.get_metadata("valuesmap"));
auto language = database.get_metadata("language");
if (language.empty() ) {
// Database created before 2017/03 has no language metadata.
// However, term were stemmed anyway and we need to stem our
// search query the same the database was created.
// So we need a language, let's use the one of the zim.
// If zimfile has no language metadata, we can't do lot more here :/
try {
language = archive.getMetadata("Language");
} catch(...) {}
}
if (!language.empty()) {
icu::Locale languageLocale(language.c_str());
/* Configuring language base steemming */
try {
m_stemmer = Xapian::Stem(languageLocale.getLanguage());
m_queryParser.set_stemmer(m_stemmer);
m_queryParser.set_stemming_strategy(Xapian::QueryParser::STEM_ALL);
} catch (...) {
std::cout << "No stemming for language '" << languageLocale.getLanguage() << "'" << std::endl;
try {
if ( first ) {
m_valuesmap = read_valuesmap(database.get_metadata("valuesmap"));
auto language = database.get_metadata("language");
if (language.empty() ) {
// Database created before 2017/03 has no language metadata.
// However, term were stemmed anyway and we need to stem our
// search query the same the database was created.
// So we need a language, let's use the one of the zim.
// If zimfile has no language metadata, we can't do lot more here :/
try {
language = archive.getMetadata("Language");
} catch(...) {}
}
}
auto stopwords = database.get_metadata("stopwords");
if ( !stopwords.empty() ){
std::string stopWord;
std::istringstream file(stopwords);
Xapian::SimpleStopper* stopper = new Xapian::SimpleStopper();
while (std::getline(file, stopWord, '\n')) {
stopper->add(stopWord);
if (!language.empty()) {
icu::Locale languageLocale(language.c_str());
/* Configuring language base steemming */
try {
m_stemmer = Xapian::Stem(languageLocale.getLanguage());
m_queryParser.set_stemmer(m_stemmer);
m_queryParser.set_stemming_strategy(Xapian::QueryParser::STEM_ALL);
} catch (...) {
std::cout << "No stemming for language '" << languageLocale.getLanguage() << "'" << std::endl;
}
}
auto stopwords = database.get_metadata("stopwords");
if ( !stopwords.empty() ){
std::string stopWord;
std::istringstream file(stopwords);
Xapian::SimpleStopper* stopper = new Xapian::SimpleStopper();
while (std::getline(file, stopWord, '\n')) {
stopper->add(stopWord);
}
stopper->release();
m_queryParser.set_stopper(stopper);
}
} else {
std::map<std::string, int> valuesmap = read_valuesmap(database.get_metadata("valuesmap"));
if (m_valuesmap != valuesmap ) {
// [TODO] Ignore the database, raise a error ?
}
stopper->release();
m_queryParser.set_stopper(stopper);
}
} else {
std::map<std::string, int> valuesmap = read_valuesmap(database.get_metadata("valuesmap"));
if (m_valuesmap != valuesmap ) {
// [TODO] Ignore the database, raise a error ?
}
m_xapianDatabases.push_back(database);
m_database.add_database(database);
m_archives.push_back(archive);
first = false;
} catch( Xapian::DatabaseError& e ) {
// [TODO] Ignore the database or raise a error ?
// As we already ignore the database if `getDbFromAccessInfo` "detects" a DatabaseError,
// we also ignore here.
}
m_xapianDatabases.push_back(database);
m_database.add_database(database);
m_archives.push_back(archive);
first = false;
}
}

Expand Down Expand Up @@ -278,6 +285,8 @@ int Search::getEstimatedMatches() const
return mset.get_matches_estimated();
} catch(Xapian::QueryParserError& e) {
return 0;
} catch(Xapian::DatabaseError& e) {
throw zim::ZimFileFormatError(e.get_description());
}
}

Expand All @@ -288,6 +297,8 @@ const SearchResultSet Search::getResults(int start, int maxResults) const {
return SearchResultSet(mp_internalDb, std::move(mset));
} catch(Xapian::QueryParserError& e) {
return SearchResultSet(mp_internalDb);
} catch(Xapian::DatabaseError& e) {
throw zim::ZimFileFormatError(e.get_description());
}
}

Expand Down Expand Up @@ -325,23 +336,35 @@ int SearchResultSet::size() const
if (! mp_mset) {
return 0;
}
return mp_mset->size();
try {
return mp_mset->size();
} catch(Xapian::DatabaseError& e) {
throw zim::ZimFileFormatError(e.get_description());
}
}

/**
 * Build an iterator positioned on the first element of the result set.
 *
 * When `mp_mset` is null the iterator is constructed from `nullptr`.
 *
 * @throws zim::ZimFileFormatError if Xapian raises a `Xapian::DatabaseError`
 *         while the iterator is being built; the Xapian description is
 *         forwarded as the error message.
 */
SearchResultSet::iterator SearchResultSet::begin() const
{
  if ( ! mp_mset ) {
    return nullptr;
  }
  // The construction must live inside the try block: an unguarded return
  // placed before it would make the exception translation unreachable.
  try {
    return new SearchIterator::InternalData(mp_internalDb, mp_mset, mp_mset->begin());
  } catch(Xapian::DatabaseError& e) {
    throw zim::ZimFileFormatError(e.get_description());
  }
}

/**
 * Build an iterator positioned past the last element of the result set.
 *
 * When `mp_mset` is null the iterator is constructed from `nullptr`.
 *
 * @throws zim::ZimFileFormatError if Xapian raises a `Xapian::DatabaseError`
 *         while the iterator is being built; the Xapian description is
 *         forwarded as the error message.
 */
SearchResultSet::iterator SearchResultSet::end() const
{
  if ( ! mp_mset ) {
    return nullptr;
  }
  // The construction must live inside the try block: an unguarded return
  // placed before it would make the exception translation unreachable.
  try {
    return new SearchIterator::InternalData(mp_internalDb, mp_mset, mp_mset->end());
  } catch(Xapian::DatabaseError& e) {
    throw zim::ZimFileFormatError(e.get_description());
  }
}

} //namespace zim
34 changes: 23 additions & 11 deletions src/search_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -108,25 +108,37 @@ struct SearchIterator::InternalData {
{};

// Return the Xapian document the iterator currently points at.
// The document is fetched on the first call and kept in `_document`;
// `document_fetched` records that the cached copy is usable.
// A Xapian::DatabaseError raised by the fetch is rethrown as
// zim::ZimFileFormatError carrying the Xapian description.
Xapian::Document get_document() {
  try {
    if ( !document_fetched ) {
      _document = iterator().get_document();
      // Only mark as fetched once the call above succeeded.
      document_fetched = true;
    }
    return _document;
  } catch ( Xapian::DatabaseError& e) {
    throw zim::ZimFileFormatError(e.get_description());
  }
}

// Compute the index (into `mp_internalDb->m_archives`) of the database the
// current result comes from, as `(docid - 1) % number_of_archives`.
// NOTE(review): presumably relies on how Xapian interleaves docids across
// sub-databases of a multi-database -- confirm against the Xapian docs.
// A Xapian::DatabaseError raised while dereferencing the iterator is
// rethrown as zim::ZimFileFormatError.
int get_databasenumber() {
  // Everything that touches the iterator stays inside the try block so the
  // exception translation actually applies.
  try {
    Xapian::docid docid = *iterator();
    return (docid - 1) % mp_internalDb->m_archives.size();
  } catch ( Xapian::DatabaseError& e) {
    throw zim::ZimFileFormatError(e.get_description());
  }
}

// Return the archive entry matching the current result.
// The entry is resolved on first use (archive selected through
// get_databasenumber(), path taken from the document data) and cached in
// `_entry` for later calls.
// A Xapian::DatabaseError raised during the lookup is rethrown as
// zim::ZimFileFormatError.
Entry& get_entry() {
  // The lookup is done once, and only inside the try block; an unguarded
  // copy before it would let database errors escape untranslated.
  try {
    if ( !_entry ) {
      int databasenumber = get_databasenumber();
      auto archive = mp_internalDb->m_archives.at(databasenumber);
      _entry.reset(new Entry(archive.getEntryByPath(get_document().get_data())));
    }
    return *_entry.get();
  } catch ( Xapian::DatabaseError& e) {
    throw zim::ZimFileFormatError(e.get_description());
  }
}

bool operator==(const InternalData& other) const {
Expand Down
119 changes: 68 additions & 51 deletions src/search_iterator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
*
*/

#include <zim/error.h>
#define ZIM_PRIVATE

#include "xapian/myhtmlparse.h"
Expand Down Expand Up @@ -106,20 +107,24 @@ std::string SearchIterator::getPath() const {
return "";
}

std::string path = internal->get_document().get_data();
bool hasNewNamespaceScheme = internal->mp_internalDb->m_archives.at(getFileIndex()).hasNewNamespaceScheme();
try {
std::string path = internal->get_document().get_data();
bool hasNewNamespaceScheme = internal->mp_internalDb->m_archives.at(getFileIndex()).hasNewNamespaceScheme();

std::string dbDataType = internal->mp_internalDb->m_database.get_metadata("data");
if (dbDataType.empty()) {
dbDataType = "fullPath";
}
std::string dbDataType = internal->mp_internalDb->m_database.get_metadata("data");
if (dbDataType.empty()) {
dbDataType = "fullPath";
}

// If the archive has new namespace scheme and the type of its indexed data
// is `fullPath` we return only the `path` without namespace
if (hasNewNamespaceScheme && dbDataType == "fullPath") {
path = path.substr(2);
// If the archive has new namespace scheme and the type of its indexed data
// is `fullPath` we return only the `path` without namespace
if (hasNewNamespaceScheme && dbDataType == "fullPath") {
path = path.substr(2);
}
return path;
} catch (Xapian::DatabaseError& e) {
throw zim::ZimFileFormatError(e.get_description());
}
return path;
}

std::string SearchIterator::getDbData() const {
Expand Down Expand Up @@ -149,58 +154,66 @@ std::string SearchIterator::getSnippet() const {
return "";
}

// Generate full text snippet
if ( ! internal->mp_internalDb->hasValuesmap() )
{
/* This is the old legacy version. Guess and try */
std::string stored_snippet = internal->get_document().get_value(1);
if ( ! stored_snippet.empty() )
return stored_snippet;
/* Let's continue here, and see if we can genenate one */
}
else if ( internal->mp_internalDb->hasValue("snippet") )
{
return internal->get_document().get_value(internal->mp_internalDb->valueSlot("snippet"));
}

Entry& entry = internal->get_entry();
/* No reader, no snippet */
try {
/* Get the content of the item to generate a snippet.
We parse it and use the html dump to avoid remove html tags in the
content and be able to nicely cut the text at random place. */
zim::MyHtmlParser htmlParser;
std::string content = entry.getItem().getData();
// Generate full text snippet
if ( ! internal->mp_internalDb->hasValuesmap() )
{
/* This is the old legacy version. Guess and try */
std::string stored_snippet = internal->get_document().get_value(1);
if ( ! stored_snippet.empty() )
return stored_snippet;
/* Let's continue here, and see if we can genenate one */
}
else if ( internal->mp_internalDb->hasValue("snippet") )
{
return internal->get_document().get_value(internal->mp_internalDb->valueSlot("snippet"));
}

Entry& entry = internal->get_entry();
/* No reader, no snippet */
try {
htmlParser.parse_html(content, "UTF-8", true);
} catch (...) {}
return internal->mp_mset->snippet(htmlParser.dump,
/*length=*/500,
/*stemmer=*/internal->mp_internalDb->m_stemmer,
/*flags=*/0);
} catch (...) {
return "";
/* Get the content of the item to generate a snippet.
We parse it and use the html dump to avoid remove html tags in the
content and be able to nicely cut the text at random place. */
zim::MyHtmlParser htmlParser;
std::string content = entry.getItem().getData();
try {
htmlParser.parse_html(content, "UTF-8", true);
} catch (...) {}
return internal->mp_mset->snippet(htmlParser.dump,
/*length=*/500,
/*stemmer=*/internal->mp_internalDb->m_stemmer,
/*flags=*/0);
} catch (...) {
return "";
}
} catch (Xapian::DatabaseError& e) {
throw zim::ZimFileFormatError(e.get_description());
}
}

// Always reports -1: no size is computed for a search result here.
int SearchIterator::getSize() const {
  const int unknownSize = -1;
  return unknownSize;
}

int SearchIterator::getWordCount() const {
int SearchIterator::getWordCount() const {
if ( ! internal ) {
return -1;
}
if ( ! internal->mp_internalDb->hasValuesmap() )
{
/* This is the old legacy version. Guess and try */
return internal->get_document().get_value(3).empty() == true ? -1 : atoi(internal->get_document().get_value(3).c_str());
}
else if ( internal->mp_internalDb->hasValue("wordcount") )
{
return atoi(internal->get_document().get_value(internal->mp_internalDb->valueSlot("wordcount")).c_str());
try {
if ( ! internal->mp_internalDb->hasValuesmap() )
{
/* This is the old legacy version. Guess and try */
return internal->get_document().get_value(3).empty() == true ? -1 : atoi(internal->get_document().get_value(3).c_str());
}
else if ( internal->mp_internalDb->hasValue("wordcount") )
{
return atoi(internal->get_document().get_value(internal->mp_internalDb->valueSlot("wordcount")).c_str());
}
return -1;
} catch (Xapian::DatabaseError& e) {
throw zim::ZimFileFormatError(e.get_description());
}
return -1;
}

int SearchIterator::getFileIndex() const {
Expand All @@ -214,7 +227,11 @@ Uuid SearchIterator::getZimId() const {
if (! internal ) {
throw std::runtime_error("Cannot get zimId from uninitialized iterator");
}
return internal->mp_internalDb->m_archives.at(getFileIndex()).getUuid();
try {
return internal->mp_internalDb->m_archives.at(getFileIndex()).getUuid();
} catch (Xapian::DatabaseError& e) {
throw zim::ZimFileFormatError(e.get_description());
}
}

SearchIterator::reference SearchIterator::operator*() const {
Expand Down
Loading

0 comments on commit 8af2881

Please sign in to comment.