Skip to content

Commit

Permalink
Bugfix/339 rados config timeout (#340)
Browse files Browse the repository at this point in the history
* #339: retry up to 10 times with a random wait between attempts. Retry simple reads (e.g. rados_config). Fail with an assert if rados_config cannot be found.
  • Loading branch information
jrse authored Aug 25, 2022
1 parent 80969e4 commit 690127d
Show file tree
Hide file tree
Showing 9 changed files with 54 additions and 22 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# Change Log

## [0.0.39](https://github.com/ceph-dovecot/dovecot-ceph-plugin/tree/0.0.39) (2022-08-25)
- #339 Fail with an assert if rados_config cannot be read due to a network/connection issue.
Retry failed Ceph read operations (object read, xattr load), e.g. after a timeout, up to 10 times with a random wait between attempts.

## [0.0.38](https://github.com/ceph-dovecot/dovecot-ceph-plugin/tree/0.0.38) (2022-06-24)
- Fix losing \r when saving mail from \n source

Expand Down
2 changes: 1 addition & 1 deletion configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
AC_PREREQ([2.59])


AC_INIT([dovecot-ceph-plugin], [0.0.38], [https://github.com/ceph-dovecot/dovecot-ceph-plugin/issues/new], ,[https://github.com/ceph-dovecot/dovecot-ceph-plugin])
AC_INIT([dovecot-ceph-plugin], [0.0.39], [https://github.com/ceph-dovecot/dovecot-ceph-plugin/issues/new], ,[https://github.com/ceph-dovecot/dovecot-ceph-plugin])


AC_CONFIG_AUX_DIR([.])
Expand Down
2 changes: 1 addition & 1 deletion rpm/dovecot-ceph-plugin.spec
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ Name: dovecot-ceph-plugin
Summary: Dovecot Ceph RADOS plugins


Version: 0.0.38
Version: 0.0.39

Release: 0%{?dist}
URL: https://github.com/ceph-dovecot/dovecot-ceph-plugin
Expand Down
18 changes: 17 additions & 1 deletion src/librmb/rados-ceph-config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include "rados-ceph-config.h"
#include <jansson.h>
#include <climits>
#include <unistd.h>

namespace librmb {

Expand Down Expand Up @@ -111,7 +112,22 @@ int RadosCephConfig::read_object(const std::string &oid, librados::bufferlist *b
if (io_ctx == nullptr) {
return -1;
}
return io_ctx->read(oid, *buffer, max, 0);
// retry max times to read the object.
int max_retry = 10;
int ret_read = -1;

for(int i = 0;i<max_retry;i++){
ret_read = io_ctx->read(oid, *buffer, max, 0);
if(ret_read >= 0 || ret_read == -ENOENT ){
// exit here if the file does not exist, or we were successful
break;
}
buffer->clear();
// wait a random time (10-50 ms) before trying again
usleep(((rand() % 5) + 1) * 10000);
}

return ret_read;
}

void RadosCephConfig::set_io_ctx_namespace(const std::string &namespace_) {
Expand Down
16 changes: 15 additions & 1 deletion src/librmb/rados-metadata-storage-ima.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include "rados-util.h"
#include <string.h>
#include <utility>
#include <unistd.h>

std::string librmb::RadosMetadataStorageIma::module_name = "ima";
std::string librmb::RadosMetadataStorageIma::keyword_key = "K";
Expand Down Expand Up @@ -65,7 +66,18 @@ int RadosMetadataStorageIma::load_metadata(RadosMail *mail) {
}

std::map<string, ceph::bufferlist> attr;
int ret = io_ctx->getxattrs(*mail->get_oid(), attr);
// retry mechanism ..
int max_retry = 10;
int ret = -1;
for(int i=0;i<max_retry;i++){
ret = io_ctx->getxattrs(*mail->get_oid(), attr);
if(ret >= 0){
break;
}
// wait a random time (10-50 ms) before trying again
usleep(((rand() % 5) + 1) * 10000);
}

if (ret < 0) {
return ret;
}
Expand Down Expand Up @@ -180,6 +192,8 @@ bool RadosMetadataStorageIma::update_metadata(const std::string &oid, std::list<
// write update
save_metadata(&write_op, &obj);
librados::AioCompletion *completion = librados::Rados::aio_create_completion();

//TODO: do we need a retry mechanism here?
int ret = io_ctx->aio_operate(oid, completion, &write_op);
completion->wait_for_complete();
completion->release();
Expand Down
1 change: 1 addition & 0 deletions src/storage-rbox/rbox-copy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,7 @@ int rbox_mail_storage_copy(struct mail_save_context *ctx, struct mail *mail) {

if (rbox_open_rados_connection(dest_mbox, alt_storage) < 0) {
FUNC_END_RET("ret == -1, connection to rados failed");
i_error("ERROR, cannot open rados connection (rbox_mail_storage_copy)");
return -1;
}

Expand Down
26 changes: 9 additions & 17 deletions src/storage-rbox/rbox-mail.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include <map>
#include <string>
#include <iostream>
#include <unistd.h>

extern "C" {

Expand Down Expand Up @@ -158,21 +159,6 @@ static int rbox_mail_metadata_get(struct rbox_mail *rmail, enum rbox_metadata_ke
//i_debug("Errorcode: process %d returned with %d cannot get x_attr(%s,%c) from rados_object: %s",getpid(), ret_load_metadata,
// metadata_key.c_str(), key, rmail->rados_mail != NULL ? rmail->rados_mail->to_string(" ").c_str() : " no rados_mail");
rbox_mail_set_expunged(rmail);
} else if(ret_load_metadata == -ETIMEDOUT) {
int max_retry = 10;
for(int i=0;i<max_retry;i++){
ret_load_metadata = r_storage->ms->get_storage()->load_metadata(rmail->rados_mail);
if(ret_load_metadata>=0){
i_error("READ TIMEOUT %d reading mail object %s ", ret_load_metadata,rmail->rados_mail != NULL ? rmail->rados_mail->to_string(" ").c_str() : " no rados_mail");
break;
}
i_warning("READ TIMEOUT retry(%d) %d reading mail object %s ",i, ret_load_metadata,rmail->rados_mail != NULL ? rmail->rados_mail->to_string(" ").c_str() : " no rados_mail");
}
if(ret_load_metadata<0){
FUNC_END();
return -1;
}

}
else {
i_error("Errorcode: process %d returned with %d cannot get x_attr(%s,%c) from rados_object: %s",getpid(), ret_load_metadata,
Expand Down Expand Up @@ -300,6 +286,7 @@ static int rbox_mail_get_save_date(struct mail *_mail, time_t *date_r) {

if (rbox_open_rados_connection(_mail->box, alt_storage) < 0) {
FUNC_END_RET("ret == -1; connection to rados failed");
i_error("ERROR, cannot open rados connection (rbox_mail_get_save_date)");
return -1;
}

Expand Down Expand Up @@ -465,6 +452,7 @@ static int read_mail_from_storage(librmb::RadosStorage *rados_storage,
read_mail->read(0, INT_MAX, rmail->rados_mail->get_mail_buffer(), &read_err);
read_mail->stat(psize, save_date, &stat_err);

//TODO: refactor to use operate instead of aio_operate.
librados::AioCompletion *completion = librados::Rados::aio_create_completion();
int ret = rados_storage->get_io_ctx().aio_operate(*rmail->rados_mail->get_oid(), completion, read_mail,
rmail->rados_mail->get_mail_buffer());
Expand All @@ -489,6 +477,7 @@ static int rbox_mail_get_stream(struct mail *_mail, bool get_body ATTR_UNUSED, s
if (data->stream == NULL) {
if (rbox_open_rados_connection(_mail->box, alt_storage) < 0) {
FUNC_END_RET("ret == -1; connection to rados failed");
i_error("ERROR, cannot open rados connection (rbox_mail_get_stream)");
return -1;
}

Expand Down Expand Up @@ -552,6 +541,8 @@ static int rbox_mail_get_stream(struct mail *_mail, bool get_body ATTR_UNUSED, s
break;
}
i_warning("READ TIMEOUT retry(%d) %d reading mail object %s ",i, ret,rmail->rados_mail != NULL ? rmail->rados_mail->to_string(" ").c_str() : " no rados_mail");
// wait a random time (10-50 ms) before trying again
usleep(((rand() % 5) + 1) * 10000);
}

if(ret <0){
Expand Down Expand Up @@ -652,8 +643,9 @@ static int rbox_get_cached_metadata(struct rbox_mail *mail, enum rbox_metadata_k
unsigned int order = 0;

string_t *str = str_new(imail->mail.data_pool, 64);
if (mail_cache_lookup_field(imail->mail.mail.transaction->cache_view, str, imail->mail.mail.seq,
ibox->cache_fields[cache_field].idx) > 0) {
if (mail_cache_lookup_field(imail->mail.mail.transaction->cache_view,
str, imail->mail.mail.seq,
ibox->cache_fields[cache_field].idx) > 0) {
if (cache_field == MAIL_CACHE_POP3_ORDER) {
i_assert(str_len(str) == sizeof(order));
memcpy(&order, str_data(str), sizeof(order));
Expand Down
4 changes: 3 additions & 1 deletion src/storage-rbox/rbox-storage.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -499,7 +499,9 @@ int rbox_open_rados_connection(struct mailbox *box, bool alt_storage) {
ret = rbox->storage->config->save_default_rados_config();
}
if (ret < 0) {
i_error("unable to read rados_config return value : %d", ret);
// connection seems to be up, but read to object store is not okay. We can only fail hard!
i_error("unrecoverable, we cannot proceed without rados_config ceph returned : %d", ret);
assert(ret == 0);
return ret;
}
rbox->storage->ms->create_metadata_storage(&rbox->storage->s->get_io_ctx(), rbox->storage->config);
Expand Down
3 changes: 3 additions & 0 deletions src/storage-rbox/rbox-sync.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <string>
#include <rados/librados.hpp>
#include <list>
#include <unistd.h>

extern "C" {
#include "dovecot-all.h"
Expand Down Expand Up @@ -474,6 +475,8 @@ static int rbox_sync_object_expunge(struct rbox_sync_context *ctx, struct expung
break;
}
i_warning("rbox_sync (retry %d) deletion failed with %d during oid (%s) deletion, mail stays in object store.",i, ret_remove, oid);
// wait a random time (10-50 ms) before trying again
usleep(((rand() % 5) + 1) * 10000);
}

}
Expand Down

0 comments on commit 690127d

Please sign in to comment.