Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(baseapp): fix race condition in state #11102

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 16 additions & 12 deletions baseapp/abci.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,17 +48,18 @@ func (app *BaseApp) InitChain(req abci.RequestInitChain) (res abci.ResponseInitC
// done after the deliver state and context have been set as it's persisted
// to state.
if req.ConsensusParams != nil {
app.StoreConsensusParams(app.deliverState.ctx, req.ConsensusParams)
app.StoreConsensusParams(app.deliverState.Context(), req.ConsensusParams)
}

if app.initChainer == nil {
return
}

// add block gas meter for any genesis transactions (allow infinite gas)
app.deliverState.ctx = app.deliverState.ctx.WithBlockGasMeter(sdk.NewInfiniteGasMeter())
ctx := app.deliverState.Context().WithBlockGasMeter(sdk.NewInfiniteGasMeter())
app.deliverState.WithContext(ctx)

res = app.initChainer(app.deliverState.ctx, req)
res = app.initChainer(app.deliverState.Context(), req)

// sanity check
if len(req.Validators) > 0 {
Expand Down Expand Up @@ -154,44 +155,47 @@ func (app *BaseApp) BeginBlock(req abci.RequestBeginBlock) (res abci.ResponseBeg
} else {
// In the first block, app.deliverState.ctx will already be initialized
// by InitChain. Context is now updated with Header information.
app.deliverState.ctx = app.deliverState.ctx.
ctx := app.deliverState.Context().
WithBlockHeader(req.Header).
WithBlockHeight(req.Header.Height)
app.deliverState.WithContext(ctx)
}

// add block gas meter
var gasMeter sdk.GasMeter
if maxGas := app.getMaximumBlockGas(app.deliverState.ctx); maxGas > 0 {
if maxGas := app.getMaximumBlockGas(app.deliverState.Context()); maxGas > 0 {
gasMeter = sdk.NewGasMeter(maxGas)
} else {
gasMeter = sdk.NewInfiniteGasMeter()
}

// NOTE: header hash is not set in NewContext, so we manually set it here

app.deliverState.ctx = app.deliverState.ctx.
ctx := app.deliverState.Context().
WithBlockGasMeter(gasMeter).
WithHeaderHash(req.Hash).
WithConsensusParams(app.GetConsensusParams(app.deliverState.ctx))
app.deliverState.WithContext(ctx)

// we also set block gas meter to checkState in case the application needs to
// verify gas consumption during (Re)CheckTx
if app.checkState != nil {
app.checkState.ctx = app.checkState.ctx.
ctx := app.checkState.Context().
WithBlockGasMeter(gasMeter).
WithHeaderHash(req.Hash)
app.checkState.WithContext(ctx)
}

if app.beginBlocker != nil {
res = app.beginBlocker(app.deliverState.ctx, req)
res = app.beginBlocker(app.deliverState.Context(), req)
res.Events = sdk.MarkEventsToIndex(res.Events, app.indexEvents)
}
// set the signed validators for addition to context in deliverTx
app.voteInfos = req.LastCommitInfo.GetVotes()

// call the hooks with the BeginBlock messages
for _, streamingListener := range app.abciListeners {
if err := streamingListener.ListenBeginBlock(app.deliverState.ctx, req, res); err != nil {
if err := streamingListener.ListenBeginBlock(app.deliverState.Context(), req, res); err != nil {
app.logger.Error("BeginBlock listening hook failed", "height", req.Header.Height, "err", err)
}
}
Expand Down Expand Up @@ -301,7 +305,7 @@ func (app *BaseApp) DeliverTx(req abci.RequestDeliverTx) abci.ResponseDeliverTx
// height.
func (app *BaseApp) Commit() (res abci.ResponseCommit) {

header := app.deliverState.ctx.BlockHeader()
header := app.deliverState.Context().BlockHeader()
retainHeight := app.GetBlockRetentionHeight(header.Height)

// Write the DeliverTx state into branched storage and commit the MultiStore.
Expand Down Expand Up @@ -648,7 +652,7 @@ func (app *BaseApp) createQueryContext(height int64, prove bool) (sdk.Context, e

// branch the commit-multistore for safety
ctx := sdk.NewContext(
cacheMS, app.checkState.ctx.BlockHeader(), true, app.logger,
cacheMS, app.checkState.Context().BlockHeader(), true, app.logger,
).WithMinGasPrices(app.minGasPrices).WithBlockHeight(height)

return ctx, nil
Expand Down Expand Up @@ -704,7 +708,7 @@ func (app *BaseApp) GetBlockRetentionHeight(commitHeight int64) int64 {
// evidence parameters instead of computing an estimated number of blocks based
// on the unbonding period and block commitment time as the two should be
// equivalent.
cp := app.GetConsensusParams(app.deliverState.ctx)
cp := app.GetConsensusParams(app.deliverState.Context())
if cp != nil && cp.Evidence != nil && cp.Evidence.MaxAgeNumBlocks > 0 {
retentionHeight = commitHeight - cp.Evidence.MaxAgeNumBlocks
}
Expand Down
17 changes: 15 additions & 2 deletions baseapp/state.go
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
package baseapp

import (
"sync"

sdk "github.com/cosmos/cosmos-sdk/types"
)

type state struct {
ms sdk.CacheMultiStore
ctx sdk.Context
lock sync.RWMutex
ms sdk.CacheMultiStore
ctx sdk.Context
}

// CacheMultiStore calls and returns a CacheMultiStore on the state's underlying
Expand All @@ -17,5 +20,15 @@ func (st *state) CacheMultiStore() sdk.CacheMultiStore {

// Context returns the Context of the state.
func (st *state) Context() sdk.Context {
defer st.lock.RUnlock()
st.lock.RLock()
Comment on lines +23 to +24

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
defer st.lock.RUnlock()
st.lock.RLock()
st.lock.RLock()
defer st.lock.RUnlock()

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

comment why?

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You have to lock the mutex before you can unlock it :)


return st.ctx

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not actually safe, as the Context type has fields which have reference semantics.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ouch

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@peterbourgon it seems the Context type needs some sort of deep copy.
thoughts

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The more I dig into it, the more I am convinced this is a dirty hack that will eventually blow up.
Take BeginBlock, for example:
if there are two servers listening (socket and gRPC), they may (and will) call BeginBlock simultaneously.
Correct me if I'm getting it wrong.

Copy link

@peterbourgon peterbourgon Mar 30, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep! I think you're right.

As far as I can see, BaseApp's exported methods — including but not limited to BeginBlock — can absolutely be called by concurrent goroutines. This means that they must ensure that anything they read or write is synchronized. But that's not happening. BaseApp —and many, many, many other components in the SDK — permit unsynchronized reads and writes on their encapsulated values, and consequently violate Go's memory model. Many of these soundness errors remain undetected or overlooked because the current, specific execution paths happen to not trigger them most of the time.

There are a lot of pathological issues in the Cosmos SDK, as well as the sdk.Context type specifically, which make this kind of bug hard to fix.

In this case, at a high level: contexts are supposed to be request-scoped, but here — and in many other places, too — the context value is long-lived. I might be missing something, but that seems to be a clear design error. Neither a state nor a BaseApp nor anything else with a lifetime beyond an individual request should maintain a context value.

More concretely: as with most types in the SDK, methods on the sdk.Context are — probably incorrectly — defined on a value receiver. That means every method call creates and operates on a (shallow) copy of the original value. This thrashes the GC, but more importantly it makes synchronization of any field with reference semantics more or less impossible. Even if you have a mutex in the type — which the context doesn't — those mutexes would get copied, and so wouldn't actually provide mutual exclusion on the values they'd be meant to protect.

And then there are all of the basically unsolvable problems created by the SDK's endemic misuse of panic as an error-handling mechanism. But that's an entirely different discussion.

I don't see how to fix the data race without addressing these problems. Probably at least a few more, too.

--

I certainly haven't done a deep-dive on this code, and so I'm not speaking from an informed place. But based on what I do understand, it seems that the right approach to fixing this problem is to eliminate the state type altogether, including the deliverState and checkState fields in the BaseApp, in order to eliminate the long-lived context value. Then, look at whatever was writing-to and reading-from those state values to figure out what stuff they actually needed from the contexts. Capture that information specifically, in a separate and synchronized type, in the BaseApp struct.

Or, I dunno. Maybe I got it all wrong.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thanks @peterbourgon for looking over and confirming
I'll give it some thinking

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As the SDK exists today, it's not meant to or designed to be executed concurrently -- we assume Tendermint places the relevant locks in its reactors prior to executing ABCI calls, which it does. The issue arises here, at least as far as I can tell, from direct client gRPC queries being executed while the state machine is executing ABCI call(s) that can contain various writes, which is really outside the scope or domain of Tendermint.

So while I see the idea proposed here with Context(), I don't think it's the correct approach, although I do appreciate the efforts @troian!

We need to take a step back and think of a different approach to allowing direct gRPC queries while the state machine is executing ABCI calls. For simplicity's sake, forget Tendermint even exists at this point. I think there are two ways we can protect reads and writes:

  1. Either by taking a revised approach to the use of the state context as you attempted (maybe it just needs a bit more thought), OR
  2. Using a RW mutex on BaseApp itself, where most ABCI calls use a write lock and we only obtain a read lock upon Query.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. Construct a standalone app instance for querying that only shares the low-level DB handler?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That also might be an option, but I'm not sure it'll be an app. Rather, we might have to refactor BaseApp#Query.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@alexanderbez is this issue still a thing in later Cosmos SDK versions (0.47+), or has someone had the opportunity to address it?

}

// WithContext updates the context of the state.
func (st *state) WithContext(ctx sdk.Context) {

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

WithX methods typically leave the receiver unmodified and return a copy with the requested changes. Should this be e.g. SetContext?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

💯

st.lock.Lock()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's also add a changelog entry

defer st.lock.Unlock()
st.ctx = ctx
}