Skip to content

Commit

Permalink
Merge pull request #539 from openzim/counter_metadata
Browse files Browse the repository at this point in the history
Counter metadata
  • Loading branch information
kelson42 authored Apr 21, 2021
2 parents a6fee67 + cb56d5f commit 91eab3b
Show file tree
Hide file tree
Showing 8 changed files with 164 additions and 19 deletions.
3 changes: 2 additions & 1 deletion src/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,8 @@ common_sources = [
'writer/dirent.cpp',
'writer/workers.cpp',
'writer/clusterWorker.cpp',
'writer/titleListingHandler.cpp'
'writer/titleListingHandler.cpp',
'writer/counterHandler.cpp'
]

if host_machine.system() == 'windows'
Expand Down
71 changes: 71 additions & 0 deletions src/writer/counterHandler.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
/*
* Copyright 2021 Matthieu Gautier <[email protected]>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
* MA 02110-1301, USA.
*/

#include "counterHandler.h"
#include "creatordata.h"

#include <zim/writer/contentProvider.h>
#include <zim/blob.h>

#include <sstream>

using namespace zim::writer;

// Builds a handler bound to `data`. The pointer is only stored here (not
// dereferenced); it is used later by createDirent().
CounterHandler::CounterHandler(CreatorData* data)
  : mp_creatorData(data)
{
}

// Defaulted destructor, defined out of line; the handler has no cleanup
// of its own beyond destroying its members.
CounterHandler::~CounterHandler() = default;

// Start hook: intentionally a no-op, the counter needs no setup.
void CounterHandler::start()
{
}

// Stop hook: intentionally a no-op, the counts are serialized on demand
// by getContentProvider().
void CounterHandler::stop()
{
}

// Asks the creator data for the dirent that will carry the counter:
// namespace 'M', path "Counter", mimetype "text/plain".
// NOTE(review): the last (empty) argument is presumably the title - confirm
// against CreatorData::createDirent.
Dirent* CounterHandler::createDirent() const
{
  return mp_creatorData->createDirent(
      'M',
      "Counter",
      "text/plain",
      "");
}

std::unique_ptr<ContentProvider> CounterHandler::getContentProvider() const {
std::stringstream ss;
bool first = true;
for(auto pair: m_mimetypeCounter) {
if (! first) {
ss << ";";
}
ss << pair.first << "=" << pair.second;
first = false;
}
return std::unique_ptr<ContentProvider>(new StringProvider(ss.str()));
}

void CounterHandler::handle(Dirent* dirent, const Hints& hints)
{
}

// Item-based overload: increments the counter of the item's mimetype.
//
// Only dirents whose namespace is 'C' are counted, and items reporting an
// empty mimetype are ignored. The mimetype is queried only after the
// namespace check has passed.
void CounterHandler::handle(Dirent* dirent, std::shared_ptr<Item> item)
{
  if (dirent->getNamespace() == 'C') {
    const auto itemMimetype = item->getMimeType();
    if (!itemMimetype.empty()) {
      // operator[] creates a zero-initialized entry the first time a
      // mimetype is seen.
      ++m_mimetypeCounter[itemMimetype];
    }
  }
}
54 changes: 54 additions & 0 deletions src/writer/counterHandler.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
/*
* Copyright 2021 Matthieu Gautier <[email protected]>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
* MA 02110-1301, USA.
*/

#ifndef OPENZIM_LIBZIM_COUNTER_HANDLER_H
#define OPENZIM_LIBZIM_COUNTER_HANDLER_H

#include "handler.h"

#include <map>

namespace zim {
namespace writer {


class CounterHandler : public DirentHandler {
public:
typedef std::map<std::string, entry_index_type> Counter;

explicit CounterHandler(CreatorData* data);
virtual ~CounterHandler();

void start() override;
void stop() override;
bool isCompressible() override { return true; }
std::unique_ptr<ContentProvider> getContentProvider() const override;
void handle(Dirent* dirent, std::shared_ptr<Item> item) override;
void handle(Dirent* dirent, const Hints& hints) override;

private:
Dirent* createDirent() const override;
CreatorData* mp_creatorData;
Counter m_mimetypeCounter;
};

}
}

#endif // OPENZIM_LIBZIM_COUNTER_HANDLER_H
4 changes: 3 additions & 1 deletion src/writer/creator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
#include <algorithm>
#include <fstream>
#include "../md5.h"
#include "counterHandler.h"

#if defined(ENABLE_XAPIAN)
# include "xapianHandler.h"
Expand Down Expand Up @@ -244,7 +245,7 @@ namespace zim
handler->stop();
auto dirent = handler->getDirent();
auto provider = handler->getContentProvider();
data->addItemData(dirent, std::move(provider), false);
data->addItemData(dirent, std::move(provider), handler->isCompressible());
if (handler == data->mp_titleListingHandler) {
// We have to get the offset of the titleList in the cluster before
// we close the cluster. Once the cluster is closed, the offset information is dropped.
Expand Down Expand Up @@ -442,6 +443,7 @@ namespace zim
mp_titleListingHandler = std::make_shared<TitleListingHandler>(this);
m_direntHandlers.push_back(mp_titleListingHandler);
m_direntHandlers.push_back(std::make_shared<TitleListingHandlerV1>(this));
m_direntHandlers.push_back(std::make_shared<CounterHandler>(this));

for(auto& handler:m_direntHandlers) {
handler->start();
Expand Down
1 change: 1 addition & 0 deletions src/writer/handler.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ class DirentHandler {

virtual void start() = 0;
virtual void stop() = 0;
virtual bool isCompressible() = 0;
Dirent* getDirent() {
if (!mp_dirent) {
mp_dirent = createDirent();
Expand Down
1 change: 1 addition & 0 deletions src/writer/titleListingHandler.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ class TitleListingHandler : public DirentHandler {

void start() override;
void stop() override;
bool isCompressible() override { return false; }
std::unique_ptr<ContentProvider> getContentProvider() const override;
void handle(Dirent* dirent, std::shared_ptr<Item> item) override;
void handle(Dirent* dirent, const Hints& hints) override;
Expand Down
2 changes: 2 additions & 0 deletions src/writer/xapianHandler.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ class FullTextXapianHandler : public DirentHandler {

void start() override;
void stop() override;
bool isCompressible() override { return false; }
std::unique_ptr<ContentProvider> getContentProvider() const override;
void handle(Dirent* dirent, std::shared_ptr<Item> item) override;
void handle(Dirent* dirent, const Hints& hints) override;
Expand All @@ -53,6 +54,7 @@ class TitleXapianHandler : public DirentHandler {

void start() override;
void stop() override;
bool isCompressible() override { return false; }
std::unique_ptr<ContentProvider> getContentProvider() const override;
void handle(Dirent* dirent, std::shared_ptr<Item> item) override;
void handle(Dirent* dirent, const Hints& hints) override;
Expand Down
47 changes: 30 additions & 17 deletions test/creator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -114,12 +114,14 @@ TEST(ZimCreator, createEmptyZim)
header.read(*reader);
ASSERT_FALSE(header.hasMainPage());
#if defined(ENABLE_XAPIAN)
entry_index_type nb_entry = 3; // xapiantitleIndex and titleListIndexes (*2)
entry_index_type nb_entry = 4; // counter + xapiantitleIndex and titleListIndexes (*2)
int xapian_mimetype = 0;
int listing_mimetype = 1;
int plain_mimetype = 2;
#else
entry_index_type nb_entry = 2; // titleListIndexes (*2)
entry_index_type nb_entry = 3; // counter + titleListIndexes (*2)
int listing_mimetype = 0;
int plain_mimetype = 1;
#endif
ASSERT_EQ(header.getArticleCount(), nb_entry);

Expand All @@ -129,23 +131,26 @@ TEST(ZimCreator, createEmptyZim)
std::shared_ptr<const Dirent> dirent;

dirent = direntAccessor.getDirent(entry_index_t(0));
test_article_dirent(dirent, 'X', "listing/titleOrdered/v0", None, listing_mimetype, cluster_index_t(0), None);
auto v0BlobIndex = dirent->getBlobNumber();
test_article_dirent(dirent, 'M', "Counter", None, plain_mimetype, cluster_index_t(0), None);

dirent = direntAccessor.getDirent(entry_index_t(1));
test_article_dirent(dirent, 'X', "listing/titleOrdered/v1", None, listing_mimetype, cluster_index_t(0), None);
test_article_dirent(dirent, 'X', "listing/titleOrdered/v0", None, listing_mimetype, cluster_index_t(1), None);
auto v0BlobIndex = dirent->getBlobNumber();

dirent = direntAccessor.getDirent(entry_index_t(2));
test_article_dirent(dirent, 'X', "listing/titleOrdered/v1", None, listing_mimetype, cluster_index_t(1), None);
auto v1BlobIndex = dirent->getBlobNumber();

#if defined(ENABLE_XAPIAN)
dirent = direntAccessor.getDirent(entry_index_t(2));
test_article_dirent(dirent, 'X', "title/xapian", None, xapian_mimetype, cluster_index_t(0), None);
dirent = direntAccessor.getDirent(entry_index_t(3));
test_article_dirent(dirent, 'X', "title/xapian", None, xapian_mimetype, cluster_index_t(1), None);
#endif

auto clusterPtrPos = header.getClusterPtrPos();
auto clusterOffset = offset_t(reader->read_uint<offset_type>(offset_t(clusterPtrPos)));
auto clusterOffset = offset_t(reader->read_uint<offset_type>(offset_t(clusterPtrPos+8)));
auto cluster = Cluster::read(*reader, clusterOffset);
ASSERT_EQ(cluster->getCompression(), CompressionType::zimcompNone);
ASSERT_EQ(cluster->count(), blob_index_t(nb_entry));
ASSERT_EQ(cluster->count(), blob_index_t(nb_entry-1)); // 1 entry is not compressed
auto blob = cluster->getBlob(v0BlobIndex);
ASSERT_EQ(blob.size(), nb_entry*sizeof(title_index_t));
blob = cluster->getBlob(v1BlobIndex);
Expand Down Expand Up @@ -205,13 +210,13 @@ TEST(ZimCreator, createZim)
header.read(*reader);
ASSERT_TRUE(header.hasMainPage());
#if defined(ENABLE_XAPIAN)
entry_index_type nb_entry = 9; // xapiantitleIndex + xapianfulltextIndex + foo + foo2 + foo3 + Title + mainPage + titleListIndexes*2
entry_index_type nb_entry = 10; // counter + xapiantitleIndex + xapianfulltextIndex + foo + foo2 + foo3 + Title + mainPage + titleListIndexes*2
int xapian_mimetype = 0;
int listing_mimetype = 1;
int html_mimetype = 2;
int plain_mimetype = 3;
#else
entry_index_type nb_entry = 7; // foo + foo2 + foo3 + Title + mainPage + titleListIndexes*2
entry_index_type nb_entry = 8; // counter + foo + foo2 + foo3 + Title + mainPage + titleListIndexes*2
int listing_mimetype = 0;
int html_mimetype = 1;
int plain_mimetype = 2;
Expand All @@ -236,9 +241,13 @@ TEST(ZimCreator, createZim)
dirent = direntAccessor.getDirent(entry_index_t(direntIdx++));
test_redirect_dirent(dirent, 'C', "foo3", "FooRedirection", entry_index_t(0));

dirent = direntAccessor.getDirent(entry_index_t(direntIdx++));
test_article_dirent(dirent, 'M', "Counter", None, plain_mimetype, cluster_index_t(0), None);
auto counterBlobIndex = dirent->getBlobNumber();

dirent = direntAccessor.getDirent(entry_index_t(direntIdx++));
test_article_dirent(dirent, 'M', "Title", "Title", plain_mimetype, cluster_index_t(0), None);
auto metaBlobIndex = dirent->getBlobNumber();
auto titleBlobIndex = dirent->getBlobNumber();

dirent = direntAccessor.getDirent(entry_index_t(direntIdx++));
test_redirect_dirent(dirent, 'W', "mainPage", "mainPage", entry_index_t(0));
Expand Down Expand Up @@ -267,23 +276,26 @@ TEST(ZimCreator, createZim)
auto clusterOffset = offset_t(reader->read_uint<offset_type>(offset_t(clusterPtrPos)));
auto cluster = Cluster::read(*reader, clusterOffset);
ASSERT_EQ(cluster->getCompression(), CompressionType::zimcompZstd);
ASSERT_EQ(cluster->count(), blob_index_t(3));
ASSERT_EQ(cluster->count(), blob_index_t(4)); // 4 entries are compressed content

auto blob = cluster->getBlob(fooBlobIndex);
ASSERT_EQ(std::string(blob), "FooContent");

blob = cluster->getBlob(foo2BlobIndex);
ASSERT_EQ(std::string(blob), "Foo2Content");

blob = cluster->getBlob(metaBlobIndex);
blob = cluster->getBlob(titleBlobIndex);
ASSERT_EQ(std::string(blob), "This is a title");

blob = cluster->getBlob(counterBlobIndex);
ASSERT_EQ(std::string(blob), "text/html=2");


// Test listing content
clusterOffset = offset_t(reader->read_uint<offset_type>(offset_t(clusterPtrPos + 8)));
cluster = Cluster::read(*reader, clusterOffset);
ASSERT_EQ(cluster->getCompression(), CompressionType::zimcompNone);
ASSERT_EQ(cluster->count(), blob_index_t(nb_entry-5)); // 5 entries are not content entries
ASSERT_EQ(cluster->count(), blob_index_t(nb_entry-6)); // 6 entries are either compressed or redirections

blob = cluster->getBlob(v0BlobIndex);
ASSERT_EQ(blob.size(), nb_entry*sizeof(title_index_t));
Expand All @@ -295,10 +307,11 @@ TEST(ZimCreator, createZim)
3, 0, 0, 0,
4, 0, 0, 0,
5, 0, 0, 0,
6, 0, 0, 0
6, 0, 0, 0,
7, 0, 0, 0
#if defined(ENABLE_XAPIAN)
,7, 0, 0, 0
,8, 0, 0, 0
,9, 0, 0, 0
#endif
};
ASSERT_EQ(blob0Data, expectedBlob0Data);
Expand Down

0 comments on commit 91eab3b

Please sign in to comment.