From 036d3e72b8911441610a7c619b3d9c74092dd6f7 Mon Sep 17 00:00:00 2001 From: Ibrahim Jarif Date: Tue, 23 Apr 2019 21:29:04 +0530 Subject: [PATCH] Set readTs in write batch to latest done readTs so we can discard older versions of keys during compactions. Every Transaction stores the latest value of `readTs` it is aware of. When the transaction is discarded (which happens even when we commit), the global value of `readMark` is updated. https://github.com/dgraph-io/badger/blob/1fcc96ecdb66d221df85cddec186b6ac7b6dab4b/txn.go#L501-L503 Previously, the `readTs` of transaction inside the write batch struct was set to 0. So the global value of `readMark` would also be set to 0 (unless someone ran a transaction after using write batch). Due to the 0 value of the global `readMark`, the compaction algorithm would skip all the values which were inserted in the write batch call. See https://github.com/dgraph-io/badger/blob/1fcc96ecdb66d221df85cddec186b6ac7b6dab4b/levels.go#L480-L484 and https://github.com/dgraph-io/badger/blob/1fcc96ecdb66d221df85cddec186b6ac7b6dab4b/txn.go#L138-L145 The `o.readMark.DoneUntil()` call would always return `0` and so the compaction wouldn't compact the newer values. With this commit, the compaction algorithm works as expected with key-values inserted via Transaction API or via the Write Batch API. See https://github.com/dgraph-io/badger/issues/767 --- batch.go | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/batch.go b/batch.go index 2c26d4b07..2187f9c6d 100644 --- a/batch.go +++ b/batch.go @@ -36,7 +36,14 @@ type WriteBatch struct { // creating and committing transactions. Due to the nature of SSI guaratees provided by Badger, // blind writes can never encounter transaction conflicts (ErrConflict). func (db *DB) NewWriteBatch() *WriteBatch { - return &WriteBatch{db: db, txn: db.newTransaction(true, true)} + txn := db.newTransaction(true, true) + // If we let it stay at zero, compactions would not allow older key versions to be deleted, + // because the read timestamps of pending txns, would be zero. Therefore, we set it to the + // maximum read timestamp that's done. This allows compactions to discard key versions below + // this read timestamp, while also not blocking on pending txns to finish before starting this + // one. + txn.readTs = db.orc.readMark.DoneUntil() + return &WriteBatch{db: db, txn: txn} } // Cancel function must be called if there's a chance that Flush might not get @@ -128,7 +135,8 @@ func (wb *WriteBatch) commit() error { wb.wg.Add(1) wb.txn.CommitWith(wb.callback) wb.txn = wb.db.newTransaction(true, true) - wb.txn.readTs = 0 // We're not reading anything. + // See comment about readTs in NewWriteBatch. + wb.txn.readTs = wb.db.orc.readMark.DoneUntil() return wb.err }