Skip to content

Commit

Permalink
storage/engine: expose FailOnMoreRecent MVCC{Get/Scan}Option
Browse files Browse the repository at this point in the history
Relates to cockroachdb#40205.

This change introduces a new option to `MVCCGetOptions` and `MVCCScanOptions`
that causes reads to throw an error if they observe an MVCC version at a
timestamp above their read timestamp. Specifically, when the option is enabled,
a `WriteTooOldError` will be returned if a scan observes an MVCC version with a
timestamp above the read. Similarly, a `WriteIntentError` will be returned if a
scan observes another transaction's intent, even if that intent has a timestamp
above the scan's read timestamp.

This option will be used in the future by `ScanRequests` and `ReverseScanRequests`
that intend to acquire unreplicated key-level locks, which will power `SELECT FOR
UPDATE`. These scans do not want to ignore existing MVCC versions at higher timestamps
like traditional scans do. Instead, they want to throw errors and force their
transaction to increase its timestamp through either a refresh or a retry. This was
previously prototyped in 4a8e8dc.

Interestingly, this is not new logic to the MVCC layer. This behavior is exactly
the same as that of the initial key-value lookup performed during MVCC writes
(see `mvccPutInternal`). It's fitting that behavior needed for `SELECT FOR UPDATE`
would mirror that already exhibited by the read portion of read-write operations.
This also hints at an opportunity to potentially use this option to merge the two
implementations and get rid of custom logic like `mvccGetInternal` that only exists
on the write path. We'd need to be careful about doing so though, as this code is
heavily tuned.

Release note: None
  • Loading branch information
nvanbenschoten committed Jan 30, 2020
1 parent 6e7fa8f commit 964640e
Show file tree
Hide file tree
Showing 9 changed files with 314 additions and 69 deletions.
5 changes: 3 additions & 2 deletions c-deps/libroach/include/libroach.h
Original file line number Diff line number Diff line change
Expand Up @@ -358,15 +358,16 @@ typedef struct {
DBStatus status;
DBChunkedBuffer data;
DBSlice intents;
DBTimestamp write_too_old_timestamp;
DBTimestamp uncertainty_timestamp;
DBSlice resume_key;
} DBScanResults;

DBScanResults MVCCGet(DBIterator* iter, DBSlice key, DBTimestamp timestamp, DBTxn txn,
bool inconsistent, bool tombstones);
bool inconsistent, bool tombstones, bool fail_on_more_recent);
DBScanResults MVCCScan(DBIterator* iter, DBSlice start, DBSlice end, DBTimestamp timestamp,
int64_t max_keys, DBTxn txn, bool inconsistent, bool reverse,
bool tombstones);
bool tombstones, bool fail_on_more_recent);

// DBStatsResult contains various runtime stats for RocksDB.
typedef struct {
Expand Down
14 changes: 7 additions & 7 deletions c-deps/libroach/mvcc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -269,28 +269,28 @@ DBStatus MVCCFindSplitKey(DBIterator* iter, DBKey start, DBKey min_split, int64_
}

DBScanResults MVCCGet(DBIterator* iter, DBSlice key, DBTimestamp timestamp, DBTxn txn,
bool inconsistent, bool tombstones) {
bool inconsistent, bool tombstones, bool fail_on_more_recent) {
// Get is implemented as a scan where we retrieve a single key. We specify an
// empty key for the end key which will ensure we don't retrieve a key
// different than the start key. This is a bit of a hack.
const DBSlice end = {0, 0};
ScopedStats scoped_iter(iter);
mvccForwardScanner scanner(iter, key, end, timestamp, 1 /* max_keys */, txn, inconsistent,
tombstones);
tombstones, fail_on_more_recent);
return scanner.get();
}

DBScanResults MVCCScan(DBIterator* iter, DBSlice start, DBSlice end, DBTimestamp timestamp,
int64_t max_keys, DBTxn txn, bool inconsistent, bool reverse,
bool tombstones) {
bool tombstones, bool fail_on_more_recent) {
ScopedStats scoped_iter(iter);
if (reverse) {
mvccReverseScanner scanner(iter, end, start, timestamp, max_keys, txn, inconsistent,
tombstones);
mvccReverseScanner scanner(iter, end, start, timestamp, max_keys, txn, inconsistent, tombstones,
fail_on_more_recent);
return scanner.scan();
} else {
mvccForwardScanner scanner(iter, start, end, timestamp, max_keys, txn, inconsistent,
tombstones);
mvccForwardScanner scanner(iter, start, end, timestamp, max_keys, txn, inconsistent, tombstones,
fail_on_more_recent);
return scanner.scan();
}
}
46 changes: 33 additions & 13 deletions c-deps/libroach/mvcc.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ static const int kMaxItersBeforeSeek = 10;
template <bool reverse> class mvccScanner {
public:
mvccScanner(DBIterator* iter, DBSlice start, DBSlice end, DBTimestamp timestamp, int64_t max_keys,
DBTxn txn, bool inconsistent, bool tombstones)
DBTxn txn, bool inconsistent, bool tombstones, bool fail_on_more_recent)
: iter_(iter),
iter_rep_(iter->rep.get()),
start_key_(ToSlice(start)),
Expand All @@ -62,6 +62,7 @@ template <bool reverse> class mvccScanner {
txn_ignored_seqnums_(txn.ignored_seqnums),
inconsistent_(inconsistent),
tombstones_(tombstones),
fail_on_more_recent_(fail_on_more_recent),
check_uncertainty_(timestamp < txn.max_timestamp),
kvs_(new chunkedBuffer),
intents_(new rocksdb::WriteBatch),
Expand Down Expand Up @@ -254,6 +255,13 @@ template <bool reverse> class mvccScanner {
return true;
}

bool writeTooOldError(DBTimestamp ts) {
results_.write_too_old_timestamp = ts;
kvs_->Clear();
intents_->Clear();
return false;
}

bool uncertaintyError(DBTimestamp ts) {
results_.uncertainty_timestamp = ts;
kvs_->Clear();
Expand All @@ -276,8 +284,15 @@ template <bool reverse> class mvccScanner {
return addAndAdvance(cur_value_);
}

if (fail_on_more_recent_) {
// 2. Our txn's read timestamp is less than the most recent
// version's timestamp and the scanner has been configured
// to throw a write too old error on more recent versions.
return writeTooOldError(cur_timestamp_);
}

if (check_uncertainty_) {
// 2. Our txn's read timestamp is less than the max timestamp
// 3. Our txn's read timestamp is less than the max timestamp
// seen by the txn. We need to check for clock uncertainty
// errors.
if (txn_max_timestamp_ >= cur_timestamp_) {
Expand All @@ -288,7 +303,7 @@ template <bool reverse> class mvccScanner {
return seekVersion(txn_max_timestamp_, true);
}

// 3. Our txn's read timestamp is greater than or equal to the
// 4. Our txn's read timestamp is greater than or equal to the
// max timestamp seen by the txn so clock uncertainty checks are
// unnecessary. We need to seek to the desired version of the
// value (i.e. one with a timestamp earlier than our read
Expand All @@ -305,7 +320,7 @@ template <bool reverse> class mvccScanner {
}

if (meta_.has_raw_bytes()) {
// 4. Emit immediately if the value is inline.
// 5. Emit immediately if the value is inline.
return addAndAdvance(meta_.raw_bytes());
}

Expand All @@ -326,8 +341,12 @@ template <bool reverse> class mvccScanner {
// Intents for other transactions are visible at or below:
// max(txn.max_timestamp, read_timestamp)
const DBTimestamp max_visible_timestamp = check_uncertainty_ ? txn_max_timestamp_ : timestamp_;
if (max_visible_timestamp < meta_timestamp && !own_intent) {
// 5. The key contains an intent, but we're reading before the
// ... unless we're intending on failing on more recent writes,
// in which case other transaction's intents are always visible.
const bool other_intent_visible =
max_visible_timestamp >= meta_timestamp || fail_on_more_recent_;
if (!own_intent && !other_intent_visible) {
// 6. The key contains an intent, but we're reading before the
// intent. Seek to the desired version. Note that if we own the
// intent (i.e. we're reading transactionally) we want to read
// the intent regardless of our read timestamp and fall into
Expand All @@ -336,7 +355,7 @@ template <bool reverse> class mvccScanner {
}

if (inconsistent_) {
// 6. The key contains an intent and we're doing an inconsistent
// 7. The key contains an intent and we're doing an inconsistent
// read at a timestamp newer than the intent. We ignore the
// intent by insisting that the timestamp we're reading at is a
// historical timestamp < the intent timestamp. However, we
Expand All @@ -354,7 +373,7 @@ template <bool reverse> class mvccScanner {
}

if (!own_intent) {
// 7. The key contains an intent which was not written by our
// 8. The key contains an intent which was not written by our
// transaction and our read timestamp is newer than that of the
// intent. Note that this will trigger an error on the Go
// side. We continue scanning so that we can return all of the
Expand All @@ -365,13 +384,13 @@ template <bool reverse> class mvccScanner {

if (txn_epoch_ == meta_.txn().epoch()) {
if (txn_sequence_ >= meta_.txn().sequence() && !seqNumIsIgnored(meta_.txn().sequence())) {
// 8. We're reading our own txn's intent at an equal or higher sequence.
// 9. We're reading our own txn's intent at an equal or higher sequence.
// Note that we read at the intent timestamp, not at our read timestamp
// as the intent timestamp may have been pushed forward by another
// transaction. Txn's always need to read their own writes.
return seekVersion(meta_timestamp, false);
} else {
// 9. We're reading our own txn's intent at a lower sequence than is
// 10. We're reading our own txn's intent at a lower sequence than is
// currently present in the intent. This means the intent we're seeing
// was written at a higher sequence than the read and that there may or
// may not be earlier versions of the intent (with lower sequence
Expand All @@ -382,7 +401,7 @@ template <bool reverse> class mvccScanner {
if (found) {
return advanceKey();
}
// 10. If no value in the intent history has a sequence number equal to
// 11. If no value in the intent history has a sequence number equal to
// or less than the read, we must ignore the intents laid down by the
// transaction all together. We ignore the intent by insisting that the
// timestamp we're reading at is a historical timestamp < the intent
Expand All @@ -392,15 +411,15 @@ template <bool reverse> class mvccScanner {
}

if (txn_epoch_ < meta_.txn().epoch()) {
// 11. We're reading our own txn's intent but the current txn has
// 12. We're reading our own txn's intent but the current txn has
// an earlier epoch than the intent. Return an error so that the
// earlier incarnation of our transaction aborts (presumably
// this is some operation that was retried).
return setStatus(FmtStatus("failed to read with epoch %u due to a write intent with epoch %u",
txn_epoch_, meta_.txn().epoch()));
}

// 12. We're reading our own txn's intent but the current txn has a
// 13. We're reading our own txn's intent but the current txn has a
// later epoch than the intent. This can happen if the txn was
// restarted and an earlier iteration wrote the value we're now
// reading. In this case, we ignore the intent and read the
Expand Down Expand Up @@ -729,6 +748,7 @@ template <bool reverse> class mvccScanner {
const DBIgnoredSeqNums txn_ignored_seqnums_;
const bool inconsistent_;
const bool tombstones_;
const bool fail_on_more_recent_;
const bool check_uncertainty_;
DBScanResults results_;
std::unique_ptr<chunkedBuffer> kvs_;
Expand Down
11 changes: 11 additions & 0 deletions pkg/roachpb/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -588,6 +588,17 @@ func (e *WriteIntentError) message(_ *Error) string {

var _ ErrorDetailInterface = &WriteIntentError{}

// NewWriteTooOldError creates a new write too old error. The function accepts
// the timestamp of the operation that hit the error, along with the timestamp
// immediately after the existing write which had a higher timestamp and which
// caused the error.
func NewWriteTooOldError(operationTS, actualTS hlc.Timestamp) *WriteTooOldError {
return &WriteTooOldError{
Timestamp: operationTS,
ActualTimestamp: actualTS,
}
}

func (e *WriteTooOldError) Error() string {
return e.message(nil)
}
Expand Down
62 changes: 38 additions & 24 deletions pkg/storage/engine/mvcc.go
Original file line number Diff line number Diff line change
Expand Up @@ -707,9 +707,10 @@ func (b *getBuffer) release() {
// MVCCGetOptions bundles options for the MVCCGet family of functions.
type MVCCGetOptions struct {
// See the documentation for MVCCGet for information on these parameters.
Inconsistent bool
Tombstones bool
Txn *roachpb.Transaction
Inconsistent bool
Tombstones bool
FailOnMoreRecent bool
Txn *roachpb.Transaction
}

// MVCCGet returns the most recent value for the specified key whose timestamp
Expand All @@ -727,6 +728,12 @@ type MVCCGetOptions struct {
//
// Note that transactional gets must be consistent. Put another way, only
// non-transactional gets may be inconsistent.
//
// When reading in "fail on more recent" mode, a WriteTooOldError will be
// returned if the read observes a version with a timestamp above the read
// timestamp. Similarly, a WriteIntentError will be returned if the read
// observes another transaction's intent, even if it has a timestamp above
// the read timestamp.
func MVCCGet(
ctx context.Context, reader Reader, key roachpb.Key, timestamp hlc.Timestamp, opts MVCCGetOptions,
) (*roachpb.Value, *roachpb.Intent, error) {
Expand Down Expand Up @@ -760,12 +767,13 @@ func mvccGet(
// specify an empty key for the end key which will ensure we don't retrieve a
// key different than the start key. This is a bit of a hack.
*mvccScanner = pebbleMVCCScanner{
parent: iter,
start: key,
ts: timestamp,
maxKeys: 1,
inconsistent: opts.Inconsistent,
tombstones: opts.Tombstones,
parent: iter,
start: key,
ts: timestamp,
maxKeys: 1,
inconsistent: opts.Inconsistent,
tombstones: opts.Tombstones,
failOnMoreRecent: opts.FailOnMoreRecent,
}

mvccScanner.init(opts.Txn)
Expand Down Expand Up @@ -1662,9 +1670,7 @@ func mvccPutInternal(
// instead of allowing their transactions to continue and be retried
// before committing.
writeTimestamp.Forward(metaTimestamp.Next())
maybeTooOldErr = &roachpb.WriteTooOldError{
Timestamp: readTimestamp, ActualTimestamp: writeTimestamp,
}
maybeTooOldErr = roachpb.NewWriteTooOldError(readTimestamp, writeTimestamp)
// If we're in a transaction, always get the value at the orig
// timestamp.
if txn != nil {
Expand Down Expand Up @@ -2310,14 +2316,15 @@ func mvccScanToBytes(
defer pebbleMVCCScannerPool.Put(mvccScanner)

*mvccScanner = pebbleMVCCScanner{
parent: iter,
reverse: opts.Reverse,
start: key,
end: endKey,
ts: timestamp,
maxKeys: max,
inconsistent: opts.Inconsistent,
tombstones: opts.Tombstones,
parent: iter,
reverse: opts.Reverse,
start: key,
end: endKey,
ts: timestamp,
maxKeys: max,
inconsistent: opts.Inconsistent,
tombstones: opts.Tombstones,
failOnMoreRecent: opts.FailOnMoreRecent,
}

mvccScanner.init(opts.Txn)
Expand Down Expand Up @@ -2411,10 +2418,11 @@ type MVCCScanOptions struct {
// to return no results.

// See the documentation for MVCCScan for information on these parameters.
Inconsistent bool
Tombstones bool
Reverse bool
Txn *roachpb.Transaction
Inconsistent bool
Tombstones bool
Reverse bool
FailOnMoreRecent bool
Txn *roachpb.Transaction
}

// MVCCScan scans the key range [key, endKey) in the provided reader up to some
Expand Down Expand Up @@ -2448,6 +2456,12 @@ type MVCCScanOptions struct {
//
// Note that transactional scans must be consistent. Put another way, only
// non-transactional scans may be inconsistent.
//
// When scanning in "fail on more recent" mode, a WriteTooOldError will be
// returned if the scan observes a version with a timestamp above the read
// timestamp. Similarly, a WriteIntentError will be returned if the scan
// observes another transaction's intent, even if it has a timestamp above
// the read timestamp.
func MVCCScan(
ctx context.Context,
reader Reader,
Expand Down
10 changes: 8 additions & 2 deletions pkg/storage/engine/mvcc_history_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,10 @@ import (
//
// cput [t=<name>] [ts=<int>[,<int>]] [resolve [status=<txnstatus>]] k=<key> v=<string> [raw] [cond=<string>]
// del [t=<name>] [ts=<int>[,<int>]] [resolve [status=<txnstatus>]] k=<key>
// get [t=<name>] [ts=<int>[,<int>]] [resolve [status=<txnstatus>]] k=<key> [inconsistent] [tombstones]
// get [t=<name>] [ts=<int>[,<int>]] [resolve [status=<txnstatus>]] k=<key> [inconsistent] [tombstones] [failOnMoreRecent]
// increment [t=<name>] [ts=<int>[,<int>]] [resolve [status=<txnstatus>]] k=<key> [inc=<val>]
// put [t=<name>] [ts=<int>[,<int>]] [resolve [status=<txnstatus>]] k=<key> v=<string> [raw]
// scan [t=<name>] [ts=<int>[,<int>]] [resolve [status=<txnstatus>]] k=<key> [end=<key>] [inconsistent] [tombstones] [reverse]
// scan [t=<name>] [ts=<int>[,<int>]] [resolve [status=<txnstatus>]] k=<key> [end=<key>] [inconsistent] [tombstones] [reverse] [failOnMoreRecent]
//
// merge [ts=<int>[,<int>]] k=<key> v=<string> [raw]
//
Expand Down Expand Up @@ -598,6 +598,9 @@ func cmdGet(e *evalCtx) error {
if e.hasArg("tombstones") {
opts.Tombstones = true
}
if e.hasArg("failOnMoreRecent") {
opts.FailOnMoreRecent = true
}
val, intent, err := MVCCGet(e.ctx, e.engine, key, ts, opts)
if err != nil {
return err
Expand Down Expand Up @@ -691,6 +694,9 @@ func cmdScan(e *evalCtx) error {
if e.hasArg("reverse") {
opts.Reverse = true
}
if e.hasArg("failOnMoreRecent") {
opts.FailOnMoreRecent = true
}
max := int64(-1)
if e.hasArg("max") {
var imax int
Expand Down
Loading

0 comments on commit 964640e

Please sign in to comment.