Skip to content

Commit

Permalink
Introduce getEntryByUrl to search for entry using url and fuzzy rules.
Browse files Browse the repository at this point in the history
  • Loading branch information
mgautierfr committed Nov 3, 2023
1 parent 0b74848 commit 0a1ebc1
Show file tree
Hide file tree
Showing 4 changed files with 56 additions and 0 deletions.
14 changes: 14 additions & 0 deletions include/zim/archive.h
Original file line number Diff line number Diff line change
Expand Up @@ -222,13 +222,27 @@ namespace zim
*
* Get an entry using its path.
* The path must contain the namespace.
* The path must match exactly: either an entry exists with this path or it does not.
*
* @param path The entry's path.
* @return The Entry.
* @exception EntryNotFound If no entry has the asked path.
*/
Entry getEntryByPath(const std::string& path) const;

/** Get an entry using a url.
*
* Get an entry using a full url.
* A full url is composed of a path and a querystring (separated by a '?').
* The exact path is tried first. If the path is not in the archive, libzim
* will try to find the entry using candidate paths built from the path,
* the querystring and the fuzzyRules stored in the archive; the first
* candidate that exists in the archive is returned.
*
* @param url The url (including querystring) to search for.
* @return The Entry.
* @exception EntryNotFound If neither the exact path nor any fuzzy candidate
*                          matches an entry.
*/
Entry getEntryByUrl(const std::string& url) const;

/** Get an entry using its "title" index.
*
* Use the index of the entry to get the idx'th entry
Expand Down
18 changes: 18 additions & 0 deletions src/archive.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,24 @@ namespace zim
throw EntryNotFound("Cannot find entry");
}

/**
 * Look up an entry from a full url (path and querystring).
 *
 * The url is split into a path and its query parameters. The exact path is
 * tried first; if it is not found, every candidate path produced by the
 * archive's fuzzy rules is tried in turn and the first hit is returned.
 *
 * @param url The url (including querystring) to search for.
 * @return The Entry.
 * @exception EntryNotFound If neither the exact path nor any candidate path
 *                          matches an entry.
 */
Entry Archive::getEntryByUrl(const std::string& url) const
{
  const auto [exactPath, query] = urlSplit(url);

  try {
    return getEntryByPath(exactPath);
  } catch (const EntryNotFound&) {
    // Exact lookup failed: fall through to the fuzzy candidates below.
  }

  const auto& rules = m_impl->getFuzzyRules();
  for (const auto& candidate : rules.get_fuzzy_paths(exactPath, query)) {
    try {
      return getEntryByPath(candidate);
    } catch (const EntryNotFound&) {
      // This candidate is not in the archive either; move on to the next one.
    }
  }

  throw EntryNotFound("Cannot find entry");
}

Entry Archive::getEntryByTitle(entry_index_type idx) const
{
return Entry(m_impl, entry_index_type(m_impl->getIndexByTitle(title_index_t(idx))));
Expand Down
18 changes: 18 additions & 0 deletions src/fileimpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
#include "envvalue.h"
#include "md5.h"
#include "tools.h"
#include "fuzzy_rules.h"

log_define("zim.file.impl")

Expand Down Expand Up @@ -236,6 +237,7 @@ class Grouping
m_byTitleDirentLookup.reset(new ByTitleDirentLookup(mp_titleDirentAccessor.get()));

readMimeTypes();
readFuzzyRules();
}

std::unique_ptr<IndirectDirentAccessor> FileImpl::getTitleAccessor(const std::string& path)
Expand Down Expand Up @@ -365,6 +367,22 @@ class Grouping
}
}

void FileImpl::readFuzzyRules() {
auto r = findx('M', "FuzzyRules");
if (!r.first) {
// No rules
return;
}
auto fuzzy_rule_dirent = getDirent(r.second);
if (fuzzy_rule_dirent->isRedirect()) {
std::cerr << "Error: 'M/FuzzyRules' is a redirect." << std::endl;
return;
}
auto cluster = getCluster(fuzzy_rule_dirent->getClusterNumber());
auto blob = cluster->getBlob(fuzzy_rule_dirent->getBlobNumber());
fuzzyRules = FuzzyRules(blob);
}

FileImpl::FindxResult FileImpl::findx(char ns, const std::string& url)
{
return direntLookup().find(ns, url);
Expand Down
6 changes: 6 additions & 0 deletions src/fileimpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
#include "fileheader.h"
#include "zim_types.h"
#include "direntreader.h"
#include "fuzzy_rules.h"


namespace zim
Expand Down Expand Up @@ -68,6 +69,8 @@ namespace zim
typedef std::vector<std::string> MimeTypes;
MimeTypes mimeTypes;

// Fuzzy rules of the archive; assigned by readFuzzyRules() from the
// 'M/FuzzyRules' entry when that entry exists and is not a redirect.
FuzzyRules fuzzyRules;

mutable std::vector<entry_index_type> m_articleListByCluster;
mutable std::mutex m_articleListByClusterMutex;

Expand Down Expand Up @@ -148,6 +151,8 @@ namespace zim

const std::string& getMimeType(uint16_t idx) const;

// Read-only access to the fuzzy rules held by this FileImpl.
const FuzzyRules& getFuzzyRules() const { return fuzzyRules; }

std::string getChecksum();
bool verify();
bool is_multiPart() const;
Expand All @@ -165,6 +170,7 @@ namespace zim
ClusterHandle readCluster(cluster_index_t idx);
offset_type getMimeListEndUpperLimit() const;
void readMimeTypes();
// Loads `fuzzyRules` from the 'M/FuzzyRules' entry, if the archive has one.
void readFuzzyRules();
void quickCheckForCorruptFile();

bool checkChecksum();
Expand Down

0 comments on commit 0a1ebc1

Please sign in to comment.