tikv: invalidate store's regions when send store fail #11344
Changes from all commits: 02dca27, 350a1f0, 94f8283, bd6385e, 2887079, 101483d
@@ -69,13 +69,19 @@ type Region struct {
 type RegionStore struct {
     workStoreIdx int32    // point to current work peer in meta.Peers and work store in stores(same idx)
     stores       []*Store // stores in this region
+    storeFails   []uint32 // snapshots of store's fail, need reload when `storeFails[curr] != stores[cur].fail`
 }

 // clone clones region store struct.
 func (r *RegionStore) clone() *RegionStore {
+    storeFails := make([]uint32, len(r.stores))
+    for i, e := range r.storeFails {
+        storeFails[i] = e
+    }
     return &RegionStore{
         workStoreIdx: r.workStoreIdx,
         stores:       r.stores,
+        storeFails:   storeFails,
     }
 }
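A quick aside on the clone above (the snippet below is illustrative only, not part of the PR): the new storeFails slice is copied element by element so that each published RegionStore stays immutable once readers hold a pointer to it; the builtin copy does the same thing.

```go
package main

import "fmt"

func main() {
	// Hypothetical stand-in for r.storeFails; not TiDB's actual data.
	old := []uint32{0, 2, 1}

	// Equivalent of the element-by-element loop in RegionStore.clone():
	// take a private snapshot so later writes to `old` are not observed.
	storeFails := make([]uint32, len(old))
	copy(storeFails, old)

	old[0] = 99
	fmt.Println(storeFails) // [0 2 1]: the snapshot is unaffected
}
```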
@@ -86,6 +92,7 @@ func (r *Region) init(c *RegionCache) {
     rs := &RegionStore{
         workStoreIdx: 0,
         stores:       make([]*Store, 0, len(r.meta.Peers)),
+        storeFails:   make([]uint32, 0, len(r.meta.Peers)),
     }
     for _, p := range r.meta.Peers {
         c.storeMu.RLock()
@@ -95,6 +102,7 @@ func (r *Region) init(c *RegionCache) {
             store = c.getStoreByStoreID(p.StoreId)
         }
         rs.stores = append(rs.stores, store)
+        rs.storeFails = append(rs.storeFails, atomic.LoadUint32(&store.fail))
     }
     atomic.StorePointer(&r.store, unsafe.Pointer(rs))
@@ -272,6 +280,15 @@ func (c *RegionCache) GetRPCContext(bo *Backoffer, id RegionVerID) (*RPCContext,
         return nil, nil
     }

+    storeFailEpoch := atomic.LoadUint32(&store.fail)
+    if storeFailEpoch != regionStore.storeFails[regionStore.workStoreIdx] {
+        cachedRegion.invalidate()
+        logutil.BgLogger().Info("invalidate current region, because others failed on same store",
+            zap.Uint64("region", id.GetID()),
+            zap.String("store", store.addr))
+        return nil, nil
+    }
+
     return &RPCContext{
         Region: id,
         Meta:   cachedRegion.meta,
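To see the check above in isolation, here is a minimal, self-contained sketch of the epoch-snapshot idea; the type and function names below are illustrative and are not the actual tikv client types.

```go
package main

import (
	"fmt"
	"sync/atomic"
)

// store is a stand-in for the client's Store: fail advances once per
// detected send-failure "generation" on this store.
type store struct {
	addr string
	fail uint32
}

// cachedRegion is a stand-in for a cached Region: it remembers the
// store's fail counter at the time it was cached.
type cachedRegion struct {
	store     *store
	failEpoch uint32
	valid     bool
}

func newCachedRegion(s *store) *cachedRegion {
	return &cachedRegion{store: s, failEpoch: atomic.LoadUint32(&s.fail), valid: true}
}

// usable mirrors the check added to GetRPCContext: if the live counter
// no longer matches the snapshot, the cached routing info is stale.
func (r *cachedRegion) usable() bool {
	if atomic.LoadUint32(&r.store.fail) != r.failEpoch {
		r.valid = false
	}
	return r.valid
}

func main() {
	s := &store{addr: "127.0.0.1:20160"}
	r1 := newCachedRegion(s)
	r2 := newCachedRegion(s)

	// A request through r1 fails to reach the store: bump the counter
	// once for this generation (see switchNextPeer below).
	epoch := r1.failEpoch
	atomic.CompareAndSwapUint32(&s.fail, epoch, epoch+1)

	// Both regions now see a mismatch and will be reloaded, even though
	// only r1 observed the failure directly.
	fmt.Println(r1.usable(), r2.usable()) // false false
}
```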
@@ -368,7 +385,7 @@ func (c *RegionCache) OnSendFail(bo *Backoffer, ctx *RPCContext, scheduleReload
     tikvRegionCacheCounterWithSendFail.Inc()
     r := c.getCachedRegionWithRLock(ctx.Region)
     if r != nil {
-        c.switchNextPeer(r, ctx.PeerIdx)
+        c.switchNextPeer(r, ctx.PeerIdx, err)
         if scheduleReload {
             r.scheduleReload()
         }
@@ -523,7 +540,7 @@ func (c *RegionCache) UpdateLeader(regionID RegionVerID, leaderStoreID uint64, c
     }

     if leaderStoreID == 0 {
-        c.switchNextPeer(r, currentPeerIdx)
+        c.switchNextPeer(r, currentPeerIdx, nil)
         logutil.BgLogger().Info("switch region peer to next due to NotLeader with NULL leader",
             zap.Int("currIdx", currentPeerIdx),
             zap.Uint64("regionID", regionID.GetID()))
@@ -939,15 +956,24 @@ func (c *RegionCache) switchToPeer(r *Region, targetStoreID uint64) (found bool)
     return
 }

-func (c *RegionCache) switchNextPeer(r *Region, currentPeerIdx int) {
-    regionStore := r.getStore()
-    if int(regionStore.workStoreIdx) != currentPeerIdx {
+func (c *RegionCache) switchNextPeer(r *Region, currentPeerIdx int, err error) {
+    rs := r.getStore()
+    if int(rs.workStoreIdx) != currentPeerIdx {
         return
     }
-    nextIdx := (currentPeerIdx + 1) % len(regionStore.stores)
-    newRegionStore := regionStore.clone()
+
+    if err != nil { // TODO: refine err, only do this for some errors.
+        s := rs.stores[rs.workStoreIdx]
+        epoch := rs.storeFails[rs.workStoreIdx]
(inline review thread)

Review comment: It should be …

Reply: Good catch, but this is not needed; https://github.com/pingcap/tidb/pull/11344/files#diff-708f6242b27e2b7bcf0e905e9b0eacf2R973 has a bug, and we should not +1 for it.
+        if atomic.CompareAndSwapUint32(&s.fail, epoch, epoch+1) {
+            logutil.BgLogger().Info("mark store's regions need be refill", zap.String("store", s.addr))
+        }
+    }
+
+    nextIdx := (currentPeerIdx + 1) % len(rs.stores)
+    newRegionStore := rs.clone()
     newRegionStore.workStoreIdx = int32(nextIdx)
-    r.compareAndSwapStore(regionStore, newRegionStore)
+    r.compareAndSwapStore(rs, newRegionStore)
 }

 func (c *RegionCache) getPeerStoreIndex(r *Region, id uint64) (idx int, found bool) {
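As a side note on the CompareAndSwap above, here is a standalone sketch under assumed semantics (not the PR's code): advancing the counter from the region's snapshot means that many regions failing on the same store only move the counter one step per generation, whereas a plain atomic.AddUint32 would bump it once per failed request and keep invalidating regions that had already reloaded.

```go
package main

import (
	"fmt"
	"sync/atomic"
)

func main() {
	var fail uint32

	// Epoch seen by every region that was cached before the outage.
	snapshot := atomic.LoadUint32(&fail)

	// Ten regions report send failures against the same store, but only
	// the first CAS succeeds, so the counter advances exactly once.
	for i := 0; i < 10; i++ {
		if atomic.CompareAndSwapUint32(&fail, snapshot, snapshot+1) {
			fmt.Println("mark store's regions need be refill")
		}
	}
	fmt.Println(atomic.LoadUint32(&fail)) // 1, not 10
}
```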
@@ -1000,6 +1026,7 @@ type Store struct {
     storeID      uint64     // store's id
     state        uint64     // unsafe store storeState
     resolveMutex sync.Mutex // protect pd from concurrent init requests
+    fail         uint32     // store fail count, see RegionStore.storeFails
 }

 type resolveState uint64
Review comment: I think we can create a new error type ReConnectionFailure, and only invalidate regions for such errors.

Reply: Yes, I'm looking into how Go's network stack generates errors; after that we can fix this TODO: https://github.com/pingcap/tidb/pull/11344/files#diff-708f6242b27e2b7bcf0e905e9b0eacf2R965
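Following up on the ReConnectionFailure suggestion, here is a hedged sketch of how such a classifier might look using only the standard library; the function name and the exact set of errors it accepts are assumptions, not something this PR defines.

```go
package main

import (
	"errors"
	"fmt"
	"net"
	"syscall"
)

// isConnectionFailure is a hypothetical classifier: treat only
// transport-level failures (refused/reset connections, timeouts) as
// "the store is unreachable", so the cache invalidates regions only
// for those errors rather than for every send error.
func isConnectionFailure(err error) bool {
	if err == nil {
		return false
	}
	if errors.Is(err, syscall.ECONNREFUSED) || errors.Is(err, syscall.ECONNRESET) {
		return true
	}
	var netErr net.Error
	if errors.As(err, &netErr) && netErr.Timeout() {
		return true
	}
	return false
}

func main() {
	// A dial error of the kind a failed send to a store might surface.
	err := &net.OpError{Op: "dial", Net: "tcp", Err: syscall.ECONNREFUSED}
	fmt.Println(isConnectionFailure(err)) // true
}
```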