From 0a1ebc1dc02d6ec4e62c7758154b940b9f35f4f2 Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Fri, 3 Nov 2023 18:34:14 +0100 Subject: [PATCH] Introduce `getEntryByUrl` to search for entry using url and fuzzy rules. --- include/zim/archive.h | 14 ++++++++++++++ src/archive.cpp | 18 ++++++++++++++++++ src/fileimpl.cpp | 18 ++++++++++++++++++ src/fileimpl.h | 6 ++++++ 4 files changed, 56 insertions(+) diff --git a/include/zim/archive.h b/include/zim/archive.h index 7539b96c6..ebf57a400 100644 --- a/include/zim/archive.h +++ b/include/zim/archive.h @@ -222,6 +222,7 @@ namespace zim * * Get an entry using its path. * The path must contains the namespace. + * Path must be exact. Either an entry exists with the path or not. * * @param path The entry's path. * @return The Entry. @@ -229,6 +230,19 @@ namespace zim */ Entry getEntryByPath(const std::string& path) const; + /** Get an entry using a url. + * + * Get an entry using a full url. + * A full url is composed of a path and a querystring (separated by a '?'). + * If the path is not in the archive, libzim will try to find the entry + * using a combination of the querystring and fuzzyRules stored in the archive. + * + * @param url The url (including querystring) to search for. + * @return The Entry. + * @exception EntryNotFound If no entry has been found. + */ + Entry getEntryByUrl(const std::string& url) const; + /** Get an entry using its "title" index. 
* * Use the index of the entry to get the idx'th entry diff --git a/src/archive.cpp b/src/archive.cpp index 17f33157d..7f3980686 100644 --- a/src/archive.cpp +++ b/src/archive.cpp @@ -250,6 +250,24 @@ namespace zim throw EntryNotFound("Cannot find entry"); } + Entry Archive::getEntryByUrl(const std::string& url) const + { + const auto [path, queryParams] = urlSplit(url); + try { + return getEntryByPath(path); + } catch (const EntryNotFound& e) { + for(const auto& path_to_try: m_impl->getFuzzyRules().get_fuzzy_paths(path, queryParams)) { + try { + return getEntryByPath(path_to_try); + } catch (const EntryNotFound& e) { + continue; + } + } + } + + throw EntryNotFound("Cannot find entry"); + } + Entry Archive::getEntryByTitle(entry_index_type idx) const { return Entry(m_impl, entry_index_type(m_impl->getIndexByTitle(title_index_t(idx)))); diff --git a/src/fileimpl.cpp b/src/fileimpl.cpp index 152f98aa0..2529dd203 100644 --- a/src/fileimpl.cpp +++ b/src/fileimpl.cpp @@ -36,6 +36,7 @@ #include "envvalue.h" #include "md5.h" #include "tools.h" +#include "fuzzy_rules.h" log_define("zim.file.impl") @@ -236,6 +237,7 @@ class Grouping m_byTitleDirentLookup.reset(new ByTitleDirentLookup(mp_titleDirentAccessor.get())); readMimeTypes(); + readFuzzyRules(); } std::unique_ptr FileImpl::getTitleAccessor(const std::string& path) @@ -365,6 +367,22 @@ class Grouping } } + void FileImpl::readFuzzyRules() { + auto r = findx('M', "FuzzyRules"); + if (!r.first) { + // No rules + return; + } + auto fuzzy_rule_dirent = getDirent(r.second); + if (fuzzy_rule_dirent->isRedirect()) { + std::cerr << "Error: 'M/FuzzyRules' is a redirect." 
<< std::endl; + return; + } + auto cluster = getCluster(fuzzy_rule_dirent->getClusterNumber()); + auto blob = cluster->getBlob(fuzzy_rule_dirent->getBlobNumber()); + fuzzyRules = FuzzyRules(blob); + } + FileImpl::FindxResult FileImpl::findx(char ns, const std::string& url) { return direntLookup().find(ns, url); diff --git a/src/fileimpl.h b/src/fileimpl.h index cc85c46d6..a7d352905 100644 --- a/src/fileimpl.h +++ b/src/fileimpl.h @@ -40,6 +40,7 @@ #include "fileheader.h" #include "zim_types.h" #include "direntreader.h" +#include "fuzzy_rules.h" namespace zim @@ -68,6 +69,8 @@ namespace zim typedef std::vector MimeTypes; MimeTypes mimeTypes; + FuzzyRules fuzzyRules; + mutable std::vector m_articleListByCluster; mutable std::mutex m_articleListByClusterMutex; @@ -148,6 +151,8 @@ namespace zim const std::string& getMimeType(uint16_t idx) const; + const FuzzyRules& getFuzzyRules() const { return fuzzyRules; }; + std::string getChecksum(); bool verify(); bool is_multiPart() const; @@ -165,6 +170,7 @@ namespace zim ClusterHandle readCluster(cluster_index_t idx); offset_type getMimeListEndUpperLimit() const; void readMimeTypes(); + void readFuzzyRules(); void quickCheckForCorruptFile(); bool checkChecksum();