diff --git a/arbnode/dataposter/data_poster.go b/arbnode/dataposter/data_poster.go index acbf9c4cc8..373d247696 100644 --- a/arbnode/dataposter/data_poster.go +++ b/arbnode/dataposter/data_poster.go @@ -34,7 +34,6 @@ import ( "github.com/ethereum/go-ethereum/rlp" "github.com/ethereum/go-ethereum/rpc" "github.com/ethereum/go-ethereum/signer/core/apitypes" - "github.com/go-redis/redis/v8" "github.com/holiman/uint256" "github.com/offchainlabs/nitro/arbnode/dataposter/dbstorage" "github.com/offchainlabs/nitro/arbnode/dataposter/noop" @@ -46,6 +45,7 @@ import ( "github.com/offchainlabs/nitro/util/rpcclient" "github.com/offchainlabs/nitro/util/signature" "github.com/offchainlabs/nitro/util/stopwaiter" + "github.com/redis/go-redis/v9" "github.com/spf13/pflag" redisstorage "github.com/offchainlabs/nitro/arbnode/dataposter/redis" diff --git a/arbnode/dataposter/redis/redisstorage.go b/arbnode/dataposter/redis/redisstorage.go index 8b6dcf65ac..b54abf618b 100644 --- a/arbnode/dataposter/redis/redisstorage.go +++ b/arbnode/dataposter/redis/redisstorage.go @@ -9,9 +9,9 @@ import ( "errors" "fmt" - "github.com/go-redis/redis/v8" "github.com/offchainlabs/nitro/arbnode/dataposter/storage" "github.com/offchainlabs/nitro/util/signature" + "github.com/redis/go-redis/v9" ) // Storage implements redis sorted set backed storage. It does not support @@ -196,7 +196,7 @@ func (s *Storage) Put(ctx context.Context, index uint64, prev, new *storage.Queu if err != nil { return err } - if err := pipe.ZAdd(ctx, s.key, &redis.Z{ + if err := pipe.ZAdd(ctx, s.key, redis.Z{ Score: float64(index), Member: string(signedItem), }).Err(); err != nil { diff --git a/arbnode/redislock/redis.go b/arbnode/redislock/redis.go index 7e26010cae..de9508323a 100644 --- a/arbnode/redislock/redis.go +++ b/arbnode/redislock/redis.go @@ -12,8 +12,8 @@ import ( "time" "github.com/ethereum/go-ethereum/log" - "github.com/go-redis/redis/v8" "github.com/offchainlabs/nitro/util/stopwaiter" + "github.com/redis/go-redis/v9" flag "github.com/spf13/pflag" ) diff --git a/arbnode/seq_coordinator.go b/arbnode/seq_coordinator.go index 176ace114b..d22b4f7491 100644 --- a/arbnode/seq_coordinator.go +++ b/arbnode/seq_coordinator.go @@ -14,7 +14,7 @@ import ( "sync/atomic" "time" - "github.com/go-redis/redis/v8" + "github.com/redis/go-redis/v9" flag "github.com/spf13/pflag" "github.com/ethereum/go-ethereum/log" diff --git a/cmd/seq-coordinator-manager/rediscoordinator/redis_coordinator.go b/cmd/seq-coordinator-manager/rediscoordinator/redis_coordinator.go index e963c0e96c..b897b23252 100644 --- a/cmd/seq-coordinator-manager/rediscoordinator/redis_coordinator.go +++ b/cmd/seq-coordinator-manager/rediscoordinator/redis_coordinator.go @@ -5,8 +5,8 @@ import ( "errors" "strings" - "github.com/go-redis/redis/v8" "github.com/offchainlabs/nitro/util/redisutil" + "github.com/redis/go-redis/v9" ) // RedisCoordinator builds upon RedisCoordinator of redisutil with additional functionality diff --git a/das/redis_storage_service.go b/das/redis_storage_service.go index 210d5cb2d4..e57240992c 100644 --- a/das/redis_storage_service.go +++ b/das/redis_storage_service.go @@ -12,11 +12,11 @@ import ( "golang.org/x/crypto/sha3" - "github.com/go-redis/redis/v8" "github.com/offchainlabs/nitro/arbstate/daprovider" "github.com/offchainlabs/nitro/das/dastree" "github.com/offchainlabs/nitro/util/pretty" "github.com/offchainlabs/nitro/util/redisutil" + "github.com/redis/go-redis/v9" flag "github.com/spf13/pflag" "github.com/ethereum/go-ethereum/common" diff --git a/go.mod b/go.mod index 18e3a8b02a..488d455f44 100644 --- a/go.mod +++ b/go.mod @@ -25,7 +25,6 @@ require ( github.com/ethereum/go-ethereum v1.10.26 github.com/fatih/structtag v1.2.0 github.com/gdamore/tcell/v2 v2.7.1 - github.com/go-redis/redis/v8 v8.11.5 github.com/gobwas/httphead v0.1.0 github.com/gobwas/ws v1.2.1 github.com/gobwas/ws-examples v0.0.0-20190625122829-a9e8908d9484 @@ -39,6 +38,7 @@ require ( github.com/mitchellh/mapstructure v1.4.1 github.com/pkg/errors v0.9.1 github.com/r3labs/diff/v3 v3.0.1 + github.com/redis/go-redis/v9 v9.6.1 github.com/rivo/tview v0.0.0-20240307173318-e804876934a1 github.com/spf13/pflag v1.0.5 github.com/syndtr/goleveldb v1.0.1-0.20210819022825-2ae1ddf74ef7 @@ -99,7 +99,7 @@ require ( github.com/beorn7/perks v1.0.1 // indirect github.com/bits-and-blooms/bitset v1.10.0 // indirect github.com/btcsuite/btcd/btcec/v2 v2.2.0 // indirect - github.com/cespare/xxhash/v2 v2.2.0 // indirect + github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/cockroachdb/errors v1.9.1 // indirect github.com/cockroachdb/logtags v0.0.0-20211118104740-dabe8e521a4f // indirect github.com/cockroachdb/redact v1.1.3 // indirect diff --git a/go.sum b/go.sum index f848c2aa2d..d11610724e 100644 --- a/go.sum +++ b/go.sum @@ -139,6 +139,10 @@ github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6r github.com/bgentry/speakeasy v0.1.0/go.mod h1:+zsyZBPWlz7T6j88CTgSN5bM796AkVf0kBD4zp0CCIs= github.com/bits-and-blooms/bitset v1.10.0 h1:ePXTeiPEazB5+opbv5fr8umg2R/1NlzgDsyepwsSr88= github.com/bits-and-blooms/bitset v1.10.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= +github.com/bsm/ginkgo/v2 v2.12.0 h1:Ny8MWAHyOepLGlLKYmXG4IEkioBysk6GpaRTLC8zwWs= +github.com/bsm/ginkgo/v2 v2.12.0/go.mod h1:SwYbGRRDovPVboqFv0tPTcG1sN61LM1Z4ARdbAV9g4c= +github.com/bsm/gomega v1.27.10 h1:yeMWxP2pV2fG3FgAODIY8EiRE3dy0aeFYt4l7wh6yKA= +github.com/bsm/gomega v1.27.10/go.mod h1:JyEr/xRbxbtgWNi8tIEVPUYZ5Dzef52k01W3YH0H+O0= github.com/btcsuite/btcd/btcec/v2 v2.2.0 h1:fzn1qaOt32TuLjFlkzYSsBC35Q3KUjT1SwPxiMSCF5k= github.com/btcsuite/btcd/btcec/v2 v2.2.0/go.mod h1:U7MHm051Al6XmscBQ0BoNydpOTsFAn707034b5nY8zU= github.com/btcsuite/btcd/chaincfg/chainhash v1.0.1 h1:q0rUy8C/TYNBQS1+CGKw68tLOFYSNEs0TFnxxnS9+4U= @@ -150,8 +154,8 @@ github.com/cespare/cp v0.1.0 h1:SE+dxFebS7Iik5LK0tsi1k9ZCxEaFX4AjQmoyA+1dJk= github.com/cespare/cp v0.1.0/go.mod h1:SOGHArjBr4JWaSDEVpWpo/hNg6RoKrls6Oh40hiwW+s= github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/cespare/xxhash/v2 v2.1.2/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= -github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44= -github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= +github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWRnGsAI= github.com/chzyer/logex v1.2.0/go.mod h1:9+9sk7u7pGNWYMkh0hdiL++6OeibzJccyQU4p4MedaY= github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI= @@ -281,8 +285,6 @@ github.com/go-martini/martini v0.0.0-20170121215854-22fa46961aab/go.mod h1:/P9AE github.com/go-ole/go-ole v1.2.5/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0= github.com/go-ole/go-ole v1.3.0 h1:Dt6ye7+vXGIKZ7Xtk4s6/xVdGDQynvom7xCFEdWr6uE= github.com/go-ole/go-ole v1.3.0/go.mod h1:5LS6F96DhAwUc7C+1HLexzMXY1xGRSryjyPPKW6zv78= -github.com/go-redis/redis/v8 v8.11.5 h1:AcZZR7igkdvfVmQTPnu9WE37LRrO/YrBH5zWyjDC0oI= -github.com/go-redis/redis/v8 v8.11.5/go.mod h1:gREzHqY1hg6oD9ngVRbLStwAWKhA0FEgq8Jd4h5lpwo= github.com/go-sourcemap/sourcemap v2.1.3+incompatible h1:W1iEw64niKVGogNgBN3ePyLFfuisuzeidWPMPWmECqU= github.com/go-sourcemap/sourcemap v2.1.3+incompatible/go.mod h1:F8jJfvm2KbVjc5NqelyYJmf/v5J0dwNLS2mL4sNA1Jg= github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY= @@ -582,22 +584,19 @@ github.com/nats-io/nkeys v0.1.0/go.mod h1:xpnFELMwJABBLVhffcfd1MZx6VsNRFpEugbxzi github.com/nats-io/nuid v1.0.1/go.mod h1:19wcPz3Ph3q0Jbyiqsd0kePYG7A95tJPxeL+1OSON2c= github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= github.com/npillmayer/nestext v0.1.3/go.mod h1:h2lrijH8jpicr25dFY+oAJLyzlya6jhnuG+zWp9L0Uk= +github.com/nxadm/tail v1.4.4 h1:DQuhQpB1tVlglWS2hLQ5OV6B5r8aGxSrPc5Qo6uTN78= github.com/nxadm/tail v1.4.4/go.mod h1:kenIhsEOeOJmVchQTgglprH7qJGnHDVpk1VPCcaMI8A= -github.com/nxadm/tail v1.4.8 h1:nPr65rt6Y5JFSKQO7qToXr7pePgD6Gwiw05lkbyAQTE= -github.com/nxadm/tail v1.4.8/go.mod h1:+ncqLTQzXmGhMZNUePPaPqPvBxHAIsmXswZKocGu+AU= github.com/oklog/run v1.0.0/go.mod h1:dlhp/R75TPv97u0XWUtDeV/lRKWPKSdTuV0TZvrmrQA= github.com/olekukonko/tablewriter v0.0.5 h1:P2Ga83D34wi1o9J6Wh1mRuqd4mF/x/lgBS7N7AbDhec= github.com/olekukonko/tablewriter v0.0.5/go.mod h1:hPp6KlRPjbx+hW8ykQs1w3UBbZlj6HuIJcUGPhkA7kY= github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= github.com/onsi/ginkgo v1.10.3/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= github.com/onsi/ginkgo v1.12.1/go.mod h1:zj2OWP4+oCPe1qIXoGWkgMRwljMUYCdkwsT2108oapk= +github.com/onsi/ginkgo v1.14.0 h1:2mOpI4JVVPBN+WQRa0WKH2eXR+Ey+uK4n7Zj0aYpIQA= github.com/onsi/ginkgo v1.14.0/go.mod h1:iSB4RoI2tjJc9BBv4NKIKWKya62Rps+oPG/Lv9klQyY= -github.com/onsi/ginkgo v1.16.5 h1:8xi0RTUf59SOSfEtZMvwTvXYMzG4gV23XVHOZiXNtnE= -github.com/onsi/ginkgo v1.16.5/go.mod h1:+E8gABHa3K6zRBolWtd+ROzc/U5bkGt0FwiG042wbpU= github.com/onsi/gomega v1.7.1/go.mod h1:XdKZgCCFLUoM/7CFJVPcG8C1xQ1AJ0vpAezJrB7JYyY= +github.com/onsi/gomega v1.10.1 h1:o0+MgICZLuZ7xjH7Vx6zS/zcu93/BEp1VwkIW1mEXCE= github.com/onsi/gomega v1.10.1/go.mod h1:iN09h71vgCQne3DLsj+A5owkum+a2tYe+TOCB1ybHNo= -github.com/onsi/gomega v1.18.1 h1:M1GfJqGRrBrrGGsbxzV5dqM2U2ApXefZCQpkukxYRLE= -github.com/onsi/gomega v1.18.1/go.mod h1:0q+aL8jAiMXy9hbwj2mr5GziHiwhAIQpFmmtT5hitRs= github.com/opentracing/opentracing-go v1.1.0 h1:pWlfV3Bxv7k65HYwkikxat0+s3pV4bsqf19k25Ur8rU= github.com/opentracing/opentracing-go v1.1.0/go.mod h1:UkNAQd3GIcIGf0SeVgPpRdFStlNbqXla1AfSYxPUl2o= github.com/pascaldekloe/goe v0.1.0/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc= @@ -643,6 +642,8 @@ github.com/prometheus/procfs v0.8.0 h1:ODq8ZFEaYeCaZOJlZZdJA2AbQR98dSHSM1KW/You5 github.com/prometheus/procfs v0.8.0/go.mod h1:z7EfXMXOkbkqb9IINtpCn86r/to3BnA0uaxHdg830/4= github.com/r3labs/diff/v3 v3.0.1 h1:CBKqf3XmNRHXKmdU7mZP1w7TV0pDyVCis1AUHtA4Xtg= github.com/r3labs/diff/v3 v3.0.1/go.mod h1:f1S9bourRbiM66NskseyUdo0fTmEE0qKrikYJX63dgo= +github.com/redis/go-redis/v9 v9.6.1 h1:HHDteefn6ZkTtY5fGUE8tj8uy85AHk6zP7CpzIAM0y4= +github.com/redis/go-redis/v9 v9.6.1/go.mod h1:0C0c6ycQsdpVNQpxb1njEQIqkx5UcsM8FJCQLgE9+RA= github.com/rhnvrm/simples3 v0.6.1 h1:H0DJwybR6ryQE+Odi9eqkHuzjYAeJgtGcGtuBwOhsH8= github.com/rhnvrm/simples3 v0.6.1/go.mod h1:Y+3vYm2V7Y4VijFoJHHTrja6OgPrJ2cBti8dPGkC3sA= github.com/rivo/tview v0.0.0-20240307173318-e804876934a1 h1:bWLHTRekAy497pE7+nXSuzXwwFHI0XauRzz6roUvY+s= @@ -1042,6 +1043,7 @@ golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxb golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 h1:go1bK/D/BFZV2I8cIQd1NKEZ+0owSTG1fDTci4IqFcE= golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= google.golang.org/api v0.4.0/go.mod h1:8k5glujaEP+g9n7WNsDg8QP6cUVNI86fCNMcbazEtwE= google.golang.org/api v0.7.0/go.mod h1:WtwebWUNSVBH/HAw79HIFXZNqEvBhG+Ra+ax0hx3E3M= diff --git a/pubsub/common.go b/pubsub/common.go index d7f041af15..ad36b6e622 100644 --- a/pubsub/common.go +++ b/pubsub/common.go @@ -2,12 +2,15 @@ package pubsub import ( "context" + "fmt" "strings" "github.com/ethereum/go-ethereum/log" - "github.com/go-redis/redis/v8" + "github.com/redis/go-redis/v9" ) +func ResultKeyFor(streamName, id string) string { return fmt.Sprintf("%s.%s", streamName, id) } + // CreateStream tries to create stream with given name, if it already exists // does not return an error. func CreateStream(ctx context.Context, streamName string, client redis.UniversalClient) error { diff --git a/pubsub/consumer.go b/pubsub/consumer.go index bd73e729e7..391042bd7e 100644 --- a/pubsub/consumer.go +++ b/pubsub/consumer.go @@ -5,36 +5,38 @@ import ( "encoding/json" "errors" "fmt" + "math" + "math/rand" + "strconv" "time" "github.com/ethereum/go-ethereum/log" - "github.com/go-redis/redis/v8" "github.com/google/uuid" "github.com/offchainlabs/nitro/util/stopwaiter" + "github.com/redis/go-redis/v9" "github.com/spf13/pflag" ) type ConsumerConfig struct { // Timeout of result entry in Redis. ResponseEntryTimeout time.Duration `koanf:"response-entry-timeout"` - // Duration after which consumer is considered to be dead if heartbeat - // is not updated. - KeepAliveTimeout time.Duration `koanf:"keepalive-timeout"` + // Minimum idle time after which messages will be autoclaimed + IdletimeToAutoclaim time.Duration `koanf:"idletime-to-autoclaim"` } var DefaultConsumerConfig = ConsumerConfig{ ResponseEntryTimeout: time.Hour, - KeepAliveTimeout: 5 * time.Minute, + IdletimeToAutoclaim: 5 * time.Minute, } var TestConsumerConfig = ConsumerConfig{ ResponseEntryTimeout: time.Minute, - KeepAliveTimeout: 30 * time.Millisecond, + IdletimeToAutoclaim: 30 * time.Millisecond, } func ConsumerConfigAddOptions(prefix string, f *pflag.FlagSet) { f.Duration(prefix+".response-entry-timeout", DefaultConsumerConfig.ResponseEntryTimeout, "timeout for response entry") - f.Duration(prefix+".keepalive-timeout", DefaultConsumerConfig.KeepAliveTimeout, "timeout after which consumer is considered inactive if heartbeat wasn't performed") + f.Duration(prefix+".idletime-to-autoclaim", DefaultConsumerConfig.IdletimeToAutoclaim, "After a message spends this amount of time in PEL (Pending Entries List i.e claimed by another consumer but not Acknowledged) it will be allowed to be autoclaimed by other consumers") } // Consumer implements a consumer for redis stream provides heartbeat to @@ -51,6 +53,7 @@ type Consumer[Request any, Response any] struct { type Message[Request any] struct { ID string Value Request + Ack func() } func NewConsumer[Request any, Response any](client redis.UniversalClient, streamName string, cfg *ConsumerConfig) (*Consumer[Request, Response], error) { @@ -69,12 +72,6 @@ func NewConsumer[Request any, Response any](client redis.UniversalClient, stream // Start starts the consumer to iteratively perform heartbeat in configured intervals. func (c *Consumer[Request, Response]) Start(ctx context.Context) { c.StopWaiter.Start(ctx, c) - c.StopWaiter.CallIteratively( - func(ctx context.Context) time.Duration { - c.heartBeat(ctx) - return c.cfg.KeepAliveTimeout / 10 - }, - ) } func (c *Consumer[Request, Response]) Id() string { @@ -83,11 +80,6 @@ func (c *Consumer[Request, Response]) Id() string { func (c *Consumer[Request, Response]) StopAndWait() { c.StopWaiter.StopAndWait() - c.deleteHeartBeat(c.GetParentContext()) -} - -func heartBeatKey(id string) string { - return fmt.Sprintf("consumer:%s:heartbeat", id) } func (c *Consumer[Request, Response]) RedisClient() redis.UniversalClient { @@ -98,68 +90,124 @@ func (c *Consumer[Request, Response]) StreamName() string { return c.redisStream } -func (c *Consumer[Request, Response]) heartBeatKey() string { - return heartBeatKey(c.id) -} - -// deleteHeartBeat deletes the heartbeat to indicate it is being shut down. -func (c *Consumer[Request, Response]) deleteHeartBeat(ctx context.Context) { - if err := c.client.Del(ctx, c.heartBeatKey()).Err(); err != nil { - l := log.Info - if ctx.Err() != nil { - l = log.Error - } - l("Deleting heardbeat", "consumer", c.id, "error", err) +func decrementMsgIdByOne(msgId string) string { + id, err := getUintParts(msgId) + if err != nil { + log.Error("Error decrementing start of XAutoClaim by one, defaulting to 0", "err", err) + return "0" } -} - -// heartBeat updates the heartBeat key indicating aliveness. -func (c *Consumer[Request, Response]) heartBeat(ctx context.Context) { - if err := c.client.Set(ctx, c.heartBeatKey(), time.Now().UnixMilli(), 2*c.cfg.KeepAliveTimeout).Err(); err != nil { - l := log.Info - if ctx.Err() != nil { - l = log.Error - } - l("Updating heardbeat", "consumer", c.id, "error", err) + if id[1] > 0 { + return strconv.FormatUint(id[0], 10) + "-" + strconv.FormatUint(id[1]-1, 10) + } else if id[0] > 0 { + return strconv.FormatUint(id[0]-1, 10) + "-" + strconv.FormatUint(math.MaxUint64, 10) } + return "0" } // Consumer first checks it there exists pending message that is claimed by // unresponsive consumer, if not then reads from the stream. func (c *Consumer[Request, Response]) Consume(ctx context.Context) (*Message[Request], error) { - res, err := c.client.XReadGroup(ctx, &redis.XReadGroupArgs{ - Group: c.redisGroup, - Consumer: c.id, - // Receive only messages that were never delivered to any other consumer, - // that is, only new messages. - Streams: []string{c.redisStream, ">"}, - Count: 1, - Block: time.Millisecond, // 0 seems to block the read instead of immediately returning - }).Result() - if errors.Is(err, redis.Nil) { - return nil, nil - } - if err != nil { - return nil, fmt.Errorf("reading message for consumer: %q: %w", c.id, err) + // First try to XAUTOCLAIM, with start as a random messageID from PEL with MinIdle as IdletimeToAutoclaim + // this prioritizes processing PEL messages that have been waiting for more than IdletimeToAutoclaim duration + var messages []redis.XMessage + if pendingMsgs, err := c.client.XPendingExt(ctx, &redis.XPendingExtArgs{ + Stream: c.redisStream, + Group: c.redisGroup, + Start: "-", + End: "+", + Count: 50, + Idle: c.cfg.IdletimeToAutoclaim, + }).Result(); err != nil { + if !errors.Is(err, redis.Nil) { + log.Error("Error from XpendingExt in getting PEL for auto claim", "err", err, "penindlen", len(pendingMsgs)) + } + } else if len(pendingMsgs) > 0 { + idx := rand.Intn(len(pendingMsgs)) + messages, _, err = c.client.XAutoClaim(ctx, &redis.XAutoClaimArgs{ + Group: c.redisGroup, + Consumer: c.id, + MinIdle: c.cfg.IdletimeToAutoclaim, // Minimum idle time for messages to claim (in milliseconds) + Stream: c.redisStream, + Start: decrementMsgIdByOne(pendingMsgs[idx].ID), + Count: 1, + }).Result() + if err != nil { + log.Info("error from xautoclaim", "err", err) + } } - if len(res) != 1 || len(res[0].Messages) != 1 { - return nil, fmt.Errorf("redis returned entries: %+v, for querying single message", res) + if len(messages) == 0 { + // If we fail to autoclaim then we do not retry but instead fallback to reading new messages + res, err := c.client.XReadGroup(ctx, &redis.XReadGroupArgs{ + Group: c.redisGroup, + Consumer: c.id, + // Receive only messages that were never delivered to any other consumer, + // that is, only new messages. + Streams: []string{c.redisStream, ">"}, + Count: 1, + Block: time.Millisecond, // 0 seems to block the read instead of immediately returning + }).Result() + if errors.Is(err, redis.Nil) { + return nil, nil + } + if err != nil { + return nil, fmt.Errorf("reading message for consumer: %q: %w", c.id, err) + } + if len(res) != 1 || len(res[0].Messages) != 1 { + return nil, fmt.Errorf("redis returned entries: %+v, for querying single message", res) + } + messages = res[0].Messages } + var ( - value = res[0].Messages[0].Values[messageKey] + value = messages[0].Values[messageKey] data, ok = (value).(string) ) if !ok { - return nil, fmt.Errorf("casting request to string: %w", err) + return nil, errors.New("error casting request to string") } var req Request if err := json.Unmarshal([]byte(data), &req); err != nil { return nil, fmt.Errorf("unmarshaling value: %v, error: %w", value, err) } - log.Debug("Redis stream consuming", "consumer_id", c.id, "message_id", res[0].Messages[0].ID) + ackNotifier := make(chan struct{}) + c.StopWaiter.LaunchThread(func(ctx context.Context) { + for { + // Use XClaimJustID so that we would have clear difference between invalid requests that are claimed multiple times due to xautoclaim and + // valid requests that are just being claimed in regular intervals to indicate heartbeat + if ids, err := c.client.XClaimJustID(ctx, &redis.XClaimArgs{ + Stream: c.redisStream, + Group: c.redisGroup, + Consumer: c.id, + MinIdle: 0, + Messages: []string{messages[0].ID}, + }).Result(); err != nil { + log.Error("Error claiming message, it might be possible that other consumers might pick this request", "msgID", messages[0].ID) + } else if len(ids) == 0 { + log.Warn("XClaimJustID returned empty response when indicating hearbeat", "msgID", messages[0].ID) + } else if len(ids) > 1 { + log.Error("XClaimJustID returned response with more than entry", "msgIDs", ids) + } + select { + case <-ackNotifier: + return + case <-ctx.Done(): + log.Info("Context done while claiming message to indicate hearbeat", "messageID", messages[0].ID, "error", ctx.Err().Error()) + if c.StopWaiter.GetParentContext().Err() == nil { + // Proceeding to set the Idle time of message to IdletimeToAutoclaim to allow it to be picked by other consumers + if err := c.client.Do(c.StopWaiter.GetParentContext(), "XCLAIM", c.redisStream, c.redisGroup, c.id, 0, messages[0].ID, "IDLE", c.cfg.IdletimeToAutoclaim.Milliseconds()).Err(); err != nil { + log.Error("error when trying to set the idle time of currently worked on message to IdletimeToAutoclaim", "messageID", messages[0].ID, "err", err) + } + } + return + case <-time.After(c.cfg.IdletimeToAutoclaim / 10): + } + } + }) + log.Debug("Redis stream consuming", "consumer_id", c.id, "message_id", messages[0].ID) return &Message[Request]{ - ID: res[0].Messages[0].ID, + ID: messages[0].ID, Value: req, + Ack: func() { close(ackNotifier) }, }, nil } @@ -168,14 +216,18 @@ func (c *Consumer[Request, Response]) SetResult(ctx context.Context, messageID s if err != nil { return fmt.Errorf("marshaling result: %w", err) } - log.Debug("consumer: setting result", "cid", c.id, "messageId", messageID) - acquired, err := c.client.SetNX(ctx, messageID, resp, c.cfg.ResponseEntryTimeout).Result() + resultKey := ResultKeyFor(c.StreamName(), messageID) + log.Debug("consumer: setting result", "cid", c.id, "msgIdInStream", messageID, "resultKeyInRedis", resultKey) + acquired, err := c.client.SetNX(ctx, resultKey, resp, c.cfg.ResponseEntryTimeout).Result() if err != nil || !acquired { - return fmt.Errorf("setting result for message: %v, error: %w", messageID, err) + return fmt.Errorf("setting result for message with message-id in stream: %v, error: %w", messageID, err) } log.Debug("consumer: xack", "cid", c.id, "messageId", messageID) if _, err := c.client.XAck(ctx, c.redisStream, c.redisGroup, messageID).Result(); err != nil { return fmt.Errorf("acking message: %v, error: %w", messageID, err) } + if _, err := c.client.XDel(ctx, c.redisStream, messageID).Result(); err != nil { + return fmt.Errorf("deleting message: %v, error: %w", messageID, err) + } return nil } diff --git a/pubsub/producer.go b/pubsub/producer.go index 5eec3a4b52..dacaeba7d0 100644 --- a/pubsub/producer.go +++ b/pubsub/producer.go @@ -13,17 +13,16 @@ import ( "encoding/json" "errors" "fmt" - "math" "strconv" "strings" "sync" "time" "github.com/ethereum/go-ethereum/log" - "github.com/go-redis/redis/v8" "github.com/google/uuid" "github.com/offchainlabs/nitro/util/containers" "github.com/offchainlabs/nitro/util/stopwaiter" + "github.com/redis/go-redis/v9" "github.com/spf13/pflag" ) @@ -43,50 +42,31 @@ type Producer[Request any, Response any] struct { promisesLock sync.RWMutex promises map[string]*containers.Promise[Response] - // Used for running checks for pending messages with inactive consumers - // and checking responses from consumers iteratively for the first time when - // Produce is called. + // Used for checking responses from consumers iteratively + // For the first time when Produce is called. once sync.Once } type ProducerConfig struct { - // When enabled, messages that are sent to consumers that later die before - // processing them, will be re-inserted into the stream to be proceesed by - // another consumer - EnableReproduce bool `koanf:"enable-reproduce"` - // Interval duration in which producer checks for pending messages delivered - // to the consumers that are currently inactive. - CheckPendingInterval time.Duration `koanf:"check-pending-interval"` - // Duration after which consumer is considered to be dead if heartbeat - // is not updated. - KeepAliveTimeout time.Duration `koanf:"keepalive-timeout"` // Interval duration for checking the result set by consumers. CheckResultInterval time.Duration `koanf:"check-result-interval"` - CheckPendingItems int64 `koanf:"check-pending-items"` + // RequestTimeout is a TTL for any message sent to the redis stream + RequestTimeout time.Duration `koanf:"request-timeout"` } var DefaultProducerConfig = ProducerConfig{ - EnableReproduce: true, - CheckPendingInterval: time.Second, - KeepAliveTimeout: 5 * time.Minute, - CheckResultInterval: 5 * time.Second, - CheckPendingItems: 256, + CheckResultInterval: 5 * time.Second, + RequestTimeout: 3 * time.Hour, } var TestProducerConfig = ProducerConfig{ - EnableReproduce: false, - CheckPendingInterval: 10 * time.Millisecond, - KeepAliveTimeout: 100 * time.Millisecond, - CheckResultInterval: 5 * time.Millisecond, - CheckPendingItems: 256, + CheckResultInterval: 5 * time.Millisecond, + RequestTimeout: time.Minute, } func ProducerAddConfigAddOptions(prefix string, f *pflag.FlagSet) { - f.Bool(prefix+".enable-reproduce", DefaultProducerConfig.EnableReproduce, "when enabled, messages with dead consumer will be re-inserted into the stream") - f.Duration(prefix+".check-pending-interval", DefaultProducerConfig.CheckPendingInterval, "interval in which producer checks pending messages whether consumer processing them is inactive") f.Duration(prefix+".check-result-interval", DefaultProducerConfig.CheckResultInterval, "interval in which producer checks pending messages whether consumer processing them is inactive") - f.Duration(prefix+".keepalive-timeout", DefaultProducerConfig.KeepAliveTimeout, "timeout after which consumer is considered inactive if heartbeat wasn't performed") - f.Int64(prefix+".check-pending-items", DefaultProducerConfig.CheckPendingItems, "items to screen during check-pending") + f.Duration(prefix+".request-timeout", DefaultProducerConfig.RequestTimeout, "timeout after which the message in redis stream is considered as errored, this prevents workers from working on wrong requests indefinitely") } func NewProducer[Request any, Response any](client redis.UniversalClient, streamName string, cfg *ProducerConfig) (*Producer[Request, Response], error) { @@ -106,125 +86,72 @@ func NewProducer[Request any, Response any](client redis.UniversalClient, stream }, nil } -func (p *Producer[Request, Response]) errorPromisesFor(msgIds []string) { - p.promisesLock.Lock() - defer p.promisesLock.Unlock() - for _, msg := range msgIds { - if promise, found := p.promises[msg]; found { - promise.ProduceError(fmt.Errorf("internal error, consumer died while serving the request")) - delete(p.promises, msg) - } +func getUintParts(msgId string) ([2]uint64, error) { + idParts := strings.Split(msgId, "-") + if len(idParts) != 2 { + return [2]uint64{}, fmt.Errorf("invalid i.d: %v", msgId) } -} - -// checkAndReproduce reproduce pending messages that were sent to consumers -// that are currently inactive. -func (p *Producer[Request, Response]) checkAndReproduce(ctx context.Context) time.Duration { - staleIds, err := p.checkPending(ctx) + idTimeStamp, err := strconv.ParseUint(idParts[0], 10, 64) if err != nil { - log.Error("Checking pending messages", "error", err) - return p.cfg.CheckPendingInterval - } - if len(staleIds) == 0 { - return p.cfg.CheckPendingInterval + return [2]uint64{}, fmt.Errorf("invalid i.d: %v err: %w", msgId, err) } - if p.cfg.EnableReproduce { - err = p.reproduceIds(ctx, staleIds) - if err != nil { - log.Warn("filed reproducing messages", "err", err) - } - } else { - p.errorPromisesFor(staleIds) - } - return p.cfg.CheckPendingInterval -} - -func (p *Producer[Request, Response]) reproduceIds(ctx context.Context, staleIds []string) error { - log.Info("Attempting to claim", "messages", staleIds) - claimedMsgs, err := p.client.XClaim(ctx, &redis.XClaimArgs{ - Stream: p.redisStream, - Group: p.redisGroup, - Consumer: p.id, - MinIdle: p.cfg.KeepAliveTimeout, - Messages: staleIds, - }).Result() + idSerial, err := strconv.ParseUint(idParts[1], 10, 64) if err != nil { - return fmt.Errorf("claiming ownership on messages: %v, error: %w", staleIds, err) + return [2]uint64{}, fmt.Errorf("invalid i.d serial: %v err: %w", msgId, err) } - for _, msg := range claimedMsgs { - data, ok := (msg.Values[messageKey]).(string) - if !ok { - log.Error("redis producer reproduce: message not string", "id", msg.ID, "value", msg.Values[messageKey]) - continue - } - var req Request - if err := json.Unmarshal([]byte(data), &req); err != nil { - log.Error("redis producer reproduce: message not a request", "id", msg.ID, "err", err, "value", msg.Values[messageKey]) - continue - } - if _, err := p.client.XAck(ctx, p.redisStream, p.redisGroup, msg.ID).Result(); err != nil { - log.Error("redis producer reproduce: could not ACK", "id", msg.ID, "err", err) - continue - } - // Only re-insert messages that were removed the the pending list first. - if _, err := p.reproduce(ctx, req, msg.ID); err != nil { - log.Error("redis producer reproduce: error", "err", err) - } - } - return nil + return [2]uint64{idTimeStamp, idSerial}, nil } -func setMinIdInt(min *[2]uint64, id string) error { - idParts := strings.Split(id, "-") - if len(idParts) != 2 { - return fmt.Errorf("invalid i.d: %v", id) - } - idTimeStamp, err := strconv.ParseUint(idParts[0], 10, 64) +// cmpMsgId compares two msgid's and returns (0) if equal, (-1) if msgId1 < msgId2, (1) if msgId1 > msgId2, (-2) if not comparable (or error) +func cmpMsgId(msgId1, msgId2 string) int { + id1, err := getUintParts(msgId1) if err != nil { - return fmt.Errorf("invalid i.d: %v err: %w", id, err) + log.Trace("error comparing msgIds", "msgId1", msgId1, "msgId2", msgId2) + return -2 } - if idTimeStamp > min[0] { - return nil - } - idSerial, err := strconv.ParseUint(idParts[1], 10, 64) + id2, err := getUintParts(msgId2) if err != nil { - return fmt.Errorf("invalid i.d serial: %v err: %w", id, err) + log.Trace("error comparing msgIds", "msgId1", msgId1, "msgId2", msgId2) + return -2 } - if idTimeStamp < min[0] { - min[0] = idTimeStamp - min[1] = idSerial - return nil + if id1[0] < id2[0] { + return -1 + } else if id1[0] > id2[0] { + return 1 + } else if id1[1] < id2[1] { + return -1 + } else if id1[1] > id2[1] { + return 1 } - // idTimeStamp == min[0] - if idSerial < min[1] { - min[1] = idSerial - } - return nil + return 0 } // checkResponses checks iteratively whether response for the promise is ready. func (p *Producer[Request, Response]) checkResponses(ctx context.Context) time.Duration { - minIdInt := [2]uint64{math.MaxUint64, math.MaxUint64} log.Debug("redis producer: check responses starting") p.promisesLock.Lock() defer p.promisesLock.Unlock() responded := 0 errored := 0 checked := 0 + allowedOldestID := fmt.Sprintf("%d-0", time.Now().Add(-p.cfg.RequestTimeout).UnixMilli()) for id, promise := range p.promises { if ctx.Err() != nil { return 0 } checked++ - res, err := p.client.Get(ctx, id).Result() + resultKey := ResultKeyFor(p.redisStream, id) + res, err := p.client.Get(ctx, resultKey).Result() if err != nil { - errSetId := setMinIdInt(&minIdInt, id) - if errSetId != nil { - log.Error("redis producer: error setting minId", "err", err) - return p.cfg.CheckResultInterval - } if !errors.Is(err, redis.Nil) { - log.Error("redis producer: Error reading value in redis", "key", id, "error", err) + log.Error("Error reading value in redis", "key", resultKey, "error", err) + } else if cmpMsgId(id, allowedOldestID) == -1 { + // The request this producer is waiting for has been past its TTL or is older than current PEL's lower, + // so safe to error and stop tracking this promise + promise.ProduceError(errors.New("error getting response, request has been waiting for too long")) + log.Error("error getting response, request has been waiting past its TTL") + errored++ + delete(p.promises, id) } continue } @@ -237,22 +164,50 @@ func (p *Producer[Request, Response]) checkResponses(ctx context.Context) time.D promise.Produce(resp) responded++ } - p.client.Del(ctx, id) + p.client.Del(ctx, resultKey) delete(p.promises, id) } - var trimmed int64 - var trimErr error - minId := "+" - if minIdInt[0] < math.MaxUint64 { - minId = fmt.Sprintf("%d-%d", minIdInt[0], minIdInt[1]) - trimmed, trimErr = p.client.XTrimMinID(ctx, p.redisStream, minId).Result() - } else { - trimmed, trimErr = p.client.XTrimMaxLen(ctx, p.redisStream, 0).Result() - } - log.Debug("trimming", "id", minId, "trimmed", trimmed, "responded", responded, "errored", errored, "trim-err", trimErr, "checked", checked) + log.Debug("checkResponses", "responded", responded, "errored", errored, "checked", checked) return p.cfg.CheckResultInterval } +func (p *Producer[Request, Response]) clearMessages(ctx context.Context) time.Duration { + pelData, err := p.client.XPending(ctx, p.redisStream, p.redisGroup).Result() + if err != nil { + log.Error("error getting PEL data from xpending, xtrimming is disabled", "err", err) + } + // XDEL on consumer side already deletes acked messages (mark as deleted) but doesnt claim the memory back, XTRIM helps in claiming this memory in normal conditions + // pelData might be outdated when we do the xtrim, but thats ok as the messages are also being trimmed by other producers + if pelData != nil && pelData.Lower != "" { + trimmed, trimErr := p.client.XTrimMinID(ctx, p.redisStream, pelData.Lower).Result() + log.Debug("trimming", "xTrimMinID", pelData.Lower, "trimmed", trimmed, "trim-err", trimErr) + // Check if pelData.Lower has been past its TTL and if it is then ack it to remove from PEL and delete it, once + // its taken out from PEL the producer that sent this request will handle the corresponding promise accordingly (as its past TTL) + allowedOldestID := fmt.Sprintf("%d-0", time.Now().Add(-p.cfg.RequestTimeout).UnixMilli()) + if cmpMsgId(pelData.Lower, allowedOldestID) == -1 { + if err := p.client.XClaim(ctx, &redis.XClaimArgs{ + Stream: p.redisStream, + Group: p.redisGroup, + Consumer: p.id, + MinIdle: 0, + Messages: []string{pelData.Lower}, + }).Err(); err != nil { + log.Error("error claiming PEL's lower message thats past its TTL", "msgID", pelData.Lower, "err", err) + return 5 * p.cfg.CheckResultInterval + } + if _, err := p.client.XAck(ctx, p.redisStream, p.redisGroup, pelData.Lower).Result(); err != nil { + log.Error("error acking PEL's lower message thats past its TTL", "msgID", pelData.Lower, "err", err) + return 5 * p.cfg.CheckResultInterval + } + if _, err := p.client.XDel(ctx, p.redisStream, pelData.Lower).Result(); err != nil { + log.Error("error deleting PEL's lower message thats past its TTL", "msgID", pelData.Lower, "err", err) + return 0 + } + } + } + return 5 * p.cfg.CheckResultInterval +} + func (p *Producer[Request, Response]) Start(ctx context.Context) { p.StopWaiter.Start(ctx, p) } @@ -263,101 +218,31 @@ func (p *Producer[Request, Response]) promisesLen() int { return len(p.promises) } -// reproduce is used when Producer claims ownership on the pending -// message that was sent to inactive consumer and reinserts it into the stream, -// so that seamlessly return the answer in the same promise. -func (p *Producer[Request, Response]) reproduce(ctx context.Context, value Request, oldKey string) (*containers.Promise[Response], error) { +func (p *Producer[Request, Response]) produce(ctx context.Context, value Request) (*containers.Promise[Response], error) { val, err := json.Marshal(value) if err != nil { return nil, fmt.Errorf("marshaling value: %w", err) } - // catching the promiseLock before we sendXadd makes sure promise ids will - // be always ascending + // catching the promiseLock before we sendXadd makes sure promise ids will be always ascending p.promisesLock.Lock() defer p.promisesLock.Unlock() - id, err := p.client.XAdd(ctx, &redis.XAddArgs{ + msgId, err := p.client.XAdd(ctx, &redis.XAddArgs{ Stream: p.redisStream, Values: map[string]any{messageKey: val}, }).Result() if err != nil { return nil, fmt.Errorf("adding values to redis: %w", err) } - promise := p.promises[oldKey] - if oldKey != "" && promise == nil { - // This will happen if the old consumer became inactive but then ack_d - // the message afterwards. - // don't error - log.Warn("tried reproducing a message but it wasn't found - probably got response", "oldKey", oldKey) - } - if oldKey == "" || promise == nil { - pr := containers.NewPromise[Response](nil) - promise = &pr - } - delete(p.promises, oldKey) - p.promises[id] = promise - return promise, nil + promise := containers.NewPromise[Response](nil) + p.promises[msgId] = &promise + return &promise, nil } func (p *Producer[Request, Response]) Produce(ctx context.Context, value Request) (*containers.Promise[Response], error) { log.Debug("Redis stream producing", "value", value) p.once.Do(func() { - p.StopWaiter.CallIteratively(p.checkAndReproduce) p.StopWaiter.CallIteratively(p.checkResponses) + p.StopWaiter.CallIteratively(p.clearMessages) }) - return p.reproduce(ctx, value, "") -} - -// Check if a consumer is with specified ID is alive. -func (p *Producer[Request, Response]) isConsumerAlive(ctx context.Context, consumerID string) bool { - if _, err := p.client.Get(ctx, heartBeatKey(consumerID)).Int64(); err != nil { - return false - } - return true -} - -func (p *Producer[Request, Response]) havePromiseFor(messageID string) bool { - p.promisesLock.Lock() - defer p.promisesLock.Unlock() - _, found := p.promises[messageID] - return found -} - -// returns ids of pending messages that's worker doesn't appear alive -func (p *Producer[Request, Response]) checkPending(ctx context.Context) ([]string, error) { - pendingMessages, err := p.client.XPendingExt(ctx, &redis.XPendingExtArgs{ - Stream: p.redisStream, - Group: p.redisGroup, - Start: "-", - End: "+", - Count: p.cfg.CheckPendingItems, - }).Result() - - if err != nil && !errors.Is(err, redis.Nil) { - return nil, fmt.Errorf("querying pending messages: %w", err) - } - if len(pendingMessages) == 0 { - return nil, nil - } - if len(pendingMessages) >= int(p.cfg.CheckPendingItems) { - log.Warn("redis producer: many pending items found", "stream", p.redisStream, "check-pending-items", p.cfg.CheckPendingItems) - } - // IDs of the pending messages with inactive consumers. - var ids []string - active := make(map[string]bool) - for _, msg := range pendingMessages { - // Ignore messages not produced by this producer. - if !p.havePromiseFor(msg.ID) { - continue - } - alive, found := active[msg.Consumer] - if !found { - alive = p.isConsumerAlive(ctx, msg.Consumer) - active[msg.Consumer] = alive - } - if alive { - continue - } - ids = append(ids, msg.ID) - } - return ids, nil + return p.produce(ctx, value) } diff --git a/pubsub/pubsub_test.go b/pubsub/pubsub_test.go index 9f774b6372..8bd1aed25d 100644 --- a/pubsub/pubsub_test.go +++ b/pubsub/pubsub_test.go @@ -10,11 +10,11 @@ import ( "time" "github.com/ethereum/go-ethereum/log" - "github.com/go-redis/redis/v8" "github.com/google/go-cmp/cmp" "github.com/google/uuid" "github.com/offchainlabs/nitro/util/containers" "github.com/offchainlabs/nitro/util/redisutil" + "github.com/redis/go-redis/v9" ) var ( @@ -23,7 +23,8 @@ var ( ) type testRequest struct { - Request string + Request string + IsInvalid bool } type testResponse struct { @@ -45,36 +46,21 @@ func destroyRedisGroup(ctx context.Context, t *testing.T, streamName string, cli } } -type configOpt interface { - apply(consCfg *ConsumerConfig, prodCfg *ProducerConfig) -} - -type withReproduce struct { - reproduce bool -} - -func (e *withReproduce) apply(_ *ConsumerConfig, prodCfg *ProducerConfig) { - prodCfg.EnableReproduce = e.reproduce -} - func producerCfg() *ProducerConfig { return &ProducerConfig{ - EnableReproduce: TestProducerConfig.EnableReproduce, - CheckPendingInterval: TestProducerConfig.CheckPendingInterval, - KeepAliveTimeout: TestProducerConfig.KeepAliveTimeout, - CheckResultInterval: TestProducerConfig.CheckResultInterval, - CheckPendingItems: TestProducerConfig.CheckPendingItems, + CheckResultInterval: TestProducerConfig.CheckResultInterval, + RequestTimeout: 2 * time.Second, } } func consumerCfg() *ConsumerConfig { return &ConsumerConfig{ ResponseEntryTimeout: TestConsumerConfig.ResponseEntryTimeout, - KeepAliveTimeout: TestConsumerConfig.KeepAliveTimeout, + IdletimeToAutoclaim: TestConsumerConfig.IdletimeToAutoclaim, } } -func newProducerConsumers(ctx context.Context, t *testing.T, opts ...configOpt) (redis.UniversalClient, string, *Producer[testRequest, testResponse], []*Consumer[testRequest, testResponse]) { +func newProducerConsumers(ctx context.Context, t *testing.T) (redis.UniversalClient, string, *Producer[testRequest, testResponse], []*Consumer[testRequest, testResponse]) { t.Helper() redisClient, err := redisutil.RedisClientFromURL(redisutil.CreateTestRedis(ctx, t)) if err != nil { @@ -82,9 +68,7 @@ func newProducerConsumers(ctx context.Context, t *testing.T, opts ...configOpt) } prodCfg, consCfg := producerCfg(), consumerCfg() streamName := fmt.Sprintf("stream:%s", uuid.NewString()) - for _, o := range opts { - o.apply(consCfg, prodCfg) - } + producer, err := NewProducer[testRequest, testResponse](redisClient, streamName, prodCfg) if err != nil { t.Fatalf("Error creating new producer: %v", err) @@ -102,13 +86,6 @@ func newProducerConsumers(ctx context.Context, t *testing.T, opts ...configOpt) t.Cleanup(func() { ctx := context.Background() destroyRedisGroup(ctx, t, streamName, producer.client) - var keys []string - for _, c := range consumers { - keys = append(keys, c.heartBeatKey()) - } - if _, err := producer.client.Del(ctx, keys...).Result(); err != nil { - log.Debug("Error deleting heartbeat keys", "error", err) - } }) return redisClient, streamName, producer, consumers } @@ -125,10 +102,10 @@ func msgForIndex(idx int) string { return fmt.Sprintf("msg: %d", idx) } -func wantMessages(n int) []string { +func wantMessages(n int, group string) []string { var ret []string for i := 0; i < n; i++ { - ret = append(ret, msgForIndex(i)) + ret = append(ret, group+msgForIndex(i)) } sort.Strings(ret) return ret @@ -143,10 +120,14 @@ func flatten(responses [][]string) []string { return ret } -func produceMessages(ctx context.Context, msgs []string, producer *Producer[testRequest, testResponse]) ([]*containers.Promise[testResponse], error) { +func produceMessages(ctx context.Context, msgs []string, producer *Producer[testRequest, testResponse], withInvalidEntries bool) ([]*containers.Promise[testResponse], error) { var promises []*containers.Promise[testResponse] - for i := 0; i < messagesCount; i++ { - promise, err := producer.Produce(ctx, testRequest{Request: msgs[i]}) + for i := 0; i < len(msgs); i++ { + req := testRequest{Request: msgs[i]} + if withInvalidEntries && i%50 == 0 { + req.IsInvalid = true + } + promise, err := producer.Produce(ctx, req) if err != nil { return nil, err } @@ -197,51 +178,97 @@ func consume(ctx context.Context, t *testing.T, consumers []*Consumer[testReques continue } gotMessages[idx][res.ID] = res.Value.Request - resp := fmt.Sprintf("result for: %v", res.ID) - if err := c.SetResult(ctx, res.ID, testResponse{Response: resp}); err != nil { - t.Errorf("Error setting a result: %v", err) + if !res.Value.IsInvalid { + resp := fmt.Sprintf("result for: %v", res.ID) + if err := c.SetResult(ctx, res.ID, testResponse{Response: resp}); err != nil { + t.Errorf("Error setting a result: %v", err) + } + wantResponses[idx] = append(wantResponses[idx], resp) } - wantResponses[idx] = append(wantResponses[idx], resp) + res.Ack() } }) } return wantResponses } -func TestRedisProduce(t *testing.T) { +func TestRedisProduceComplex(t *testing.T) { log.SetDefault(log.NewLogger(log.NewTerminalHandlerWithLevel(os.Stderr, log.LevelTrace, true))) t.Parallel() for _, tc := range []struct { - name string - killConsumers bool - autoRecover bool + name string + entriesCount []int + numProducers int + killConsumers bool + withInvalidEntries bool // If this is set, then every 50th entry is invalid (requests that can't be solved by any consumer) }{ { - name: "all consumers are active", - killConsumers: false, - autoRecover: false, + name: "one producer, all consumers are active", + entriesCount: []int{messagesCount}, + numProducers: 1, + }, + { + name: "two producers, all consumers are active", + entriesCount: []int{20, 20}, + numProducers: 2, }, { - name: "some consumers killed, others should take over their work", + name: "one producer, some consumers killed, others should take over their work", + entriesCount: []int{messagesCount}, + numProducers: 1, killConsumers: true, - autoRecover: true, }, + { - name: "some consumers killed, should return failure", + name: "two producers, some consumers killed, others should take over their work, unequal number of requests from producers", + entriesCount: []int{messagesCount, 2 * messagesCount}, + numProducers: 2, killConsumers: true, - autoRecover: false, + }, + { + name: "two producers, some consumers killed, others should take over their work, some invalid entries, unequal number of requests from producers", + entriesCount: []int{messagesCount, 2 * messagesCount}, + numProducers: 2, + killConsumers: true, + withInvalidEntries: true, }, } { t.Run(tc.name, func(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() - redisClient, streamName, producer, consumers := newProducerConsumers(ctx, t, &withReproduce{tc.autoRecover}) - producer.Start(ctx) - wantMsgs := wantMessages(messagesCount) - promises, err := produceMessages(ctx, wantMsgs, producer) - if err != nil { - t.Fatalf("Error producing messages: %v", err) + + var producers []*Producer[testRequest, testResponse] + redisClient, streamName, producer, consumers := newProducerConsumers(ctx, t) + producers = append(producers, producer) + if tc.numProducers == 2 { + producer, err := NewProducer[testRequest, testResponse](redisClient, streamName, producerCfg()) + if err != nil { + t.Fatalf("Error creating second producer: %v", err) + } + producers = append(producers, producer) + } + + for _, producer := range producers { + producer.Start(ctx) + } + + var entries [][]string + if tc.numProducers == 2 { + entries = append(entries, wantMessages(tc.entriesCount[0], "1.")) + entries = append(entries, wantMessages(tc.entriesCount[1], "2.")) + } else { + entries = append(entries, wantMessages(tc.entriesCount[0], "")) } + + var promises [][]*containers.Promise[testResponse] + for i := 0; i < tc.numProducers; i++ { + prs, err := produceMessages(ctx, entries[i], producers[i], tc.withInvalidEntries) + if err != nil { + t.Fatalf("Error producing messages from producer%d: %v", i, err) + } + promises = append(promises, prs) + } + gotMessages := messagesMaps(len(consumers)) if tc.killConsumers { // Consumer messages in every third consumer but don't ack them to check @@ -252,40 +279,79 @@ func TestRedisProduce(t *testing.T) { if err != nil { t.Errorf("Error consuming message: %v", err) } - if !tc.autoRecover { - gotMessages[i][req.ID] = req.Value.Request + if req == nil { + t.Error("Didn't consume any message") } + // Kills the actnotifier hence allowing XAUTOCLAIM consumers[i].StopAndWait() } } + time.Sleep(time.Second) wantResponses := consume(ctx, t, consumers, gotMessages) - gotResponses, errIndexes := awaitResponses(ctx, promises) - if len(errIndexes) != 0 && tc.autoRecover { - t.Fatalf("Error awaiting responses: %v", errIndexes) + + var gotResponses []string + for i := 0; i < tc.numProducers; i++ { + grs, errIndexes := awaitResponses(ctx, promises[i]) + if tc.withInvalidEntries { + if errIndexes[len(errIndexes)-1]+50 < len(entries[i]) { + t.Fatalf("Unexpected number of invalid requests while awaiting responses") + } + for j, idx := range errIndexes { + if idx != j*50 { + t.Fatalf("Invalid request' index mismatch want: %d got %d", j*50, idx) + } + } + } else if len(errIndexes) != 0 { + t.Fatalf("Error awaiting responses from promises %d: %v", i, errIndexes) + } + gotResponses = append(gotResponses, grs...) } - producer.StopAndWait() + for _, c := range consumers { c.StopAndWait() } - got, err := mergeValues(gotMessages) + + got, err := mergeValues(gotMessages, tc.withInvalidEntries) if err != nil { t.Fatalf("mergeMaps() unexpected error: %v", err) } + // Only when there are invalid entries got will have duplicates + if tc.withInvalidEntries { + got = removeDuplicates(got) + } + + var combinedEntries []string + for i := 0; i < tc.numProducers; i++ { + combinedEntries = append(combinedEntries, entries[i]...) + } + wantMsgs := combinedEntries if diff := cmp.Diff(wantMsgs, got); diff != "" { t.Errorf("Unexpected diff (-want +got):\n%s\n", diff) } - wantResp := flatten(wantResponses) + sort.Strings(gotResponses) + wantResp := flatten(wantResponses) if diff := cmp.Diff(wantResp, gotResponses); diff != "" { t.Errorf("Unexpected diff in responses:\n%s\n", diff) } - if cnt := producer.promisesLen(); cnt != 0 { - t.Errorf("Producer still has %d unfullfilled promises", cnt) + + // Check each producers all promises were responded to + for i := 0; i < tc.numProducers; i++ { + if cnt := producers[i].promisesLen(); cnt != 0 { + t.Errorf("Producer%d still has %d unfullfilled promises", i, cnt) + } } + // Trigger a trim - producer.checkResponses(ctx) + time.Sleep(time.Second) + for i := 0; i < tc.numProducers; i++ { + producers[i].checkResponses(ctx) + producers[i].StopAndWait() + } + + // Check that no messages remain in the stream msgs, err := redisClient.XRange(ctx, streamName, "-", "+").Result() if err != nil { t.Errorf("XRange failed: %v", err) @@ -297,14 +363,27 @@ func TestRedisProduce(t *testing.T) { } } +func removeDuplicates(list []string) []string { + capture := map[string]bool{} + var ret []string + for _, elem := range list { + if _, found := capture[elem]; !found { + ret = append(ret, elem) + capture[elem] = true + } + } + sort.Strings(ret) + return ret +} + // mergeValues merges maps from the slice and returns their values. // Returns and error if there exists duplicate key. -func mergeValues(messages []map[string]string) ([]string, error) { +func mergeValues(messages []map[string]string, withInvalidEntries bool) ([]string, error) { res := make(map[string]any) var ret []string for _, m := range messages { for k, v := range m { - if _, found := res[k]; found { + if _, found := res[k]; found && !withInvalidEntries { return nil, fmt.Errorf("duplicate key: %v", k) } res[k] = v diff --git a/system_tests/common_test.go b/system_tests/common_test.go index 1cde8fd7bc..dbb2b86f13 100644 --- a/system_tests/common_test.go +++ b/system_tests/common_test.go @@ -20,7 +20,6 @@ import ( "testing" "time" - "github.com/go-redis/redis/v8" "github.com/offchainlabs/nitro/arbos" "github.com/offchainlabs/nitro/arbos/arbostypes" "github.com/offchainlabs/nitro/arbos/util" @@ -42,6 +41,7 @@ import ( "github.com/offchainlabs/nitro/validator/server_common" "github.com/offchainlabs/nitro/validator/valnode" rediscons "github.com/offchainlabs/nitro/validator/valnode/redis" + "github.com/redis/go-redis/v9" "github.com/ethereum/go-ethereum" "github.com/ethereum/go-ethereum/accounts/abi" diff --git a/system_tests/seq_coordinator_test.go b/system_tests/seq_coordinator_test.go index 1b8926a1b9..e9b2adabe8 100644 --- a/system_tests/seq_coordinator_test.go +++ b/system_tests/seq_coordinator_test.go @@ -12,7 +12,7 @@ import ( "testing" "time" - "github.com/go-redis/redis/v8" + "github.com/redis/go-redis/v9" "github.com/ethereum/go-ethereum/common" "github.com/ethereum/go-ethereum/log" diff --git a/util/redisutil/redis_coordinator.go b/util/redisutil/redis_coordinator.go index 2c12ffec50..62cf76c0bd 100644 --- a/util/redisutil/redis_coordinator.go +++ b/util/redisutil/redis_coordinator.go @@ -6,7 +6,7 @@ import ( "fmt" "strings" - "github.com/go-redis/redis/v8" + "github.com/redis/go-redis/v9" "github.com/ethereum/go-ethereum/log" diff --git a/util/redisutil/redisutil.go b/util/redisutil/redisutil.go index f89c250e9a..01ba836d5b 100644 --- a/util/redisutil/redisutil.go +++ b/util/redisutil/redisutil.go @@ -1,6 +1,6 @@ package redisutil -import "github.com/go-redis/redis/v8" +import "github.com/redis/go-redis/v9" func RedisClientFromURL(url string) (redis.UniversalClient, error) { if url == "" { diff --git a/validator/client/redis/producer.go b/validator/client/redis/producer.go index adc2f34af5..c5726ffe8b 100644 --- a/validator/client/redis/producer.go +++ b/validator/client/redis/producer.go @@ -9,7 +9,6 @@ import ( "github.com/ethereum/go-ethereum/core/rawdb" "github.com/ethereum/go-ethereum/ethdb" "github.com/ethereum/go-ethereum/log" - "github.com/go-redis/redis/v8" "github.com/offchainlabs/nitro/pubsub" "github.com/offchainlabs/nitro/util/containers" "github.com/offchainlabs/nitro/util/redisutil" @@ -17,6 +16,7 @@ import ( "github.com/offchainlabs/nitro/validator" "github.com/offchainlabs/nitro/validator/server_api" "github.com/offchainlabs/nitro/validator/server_common" + "github.com/redis/go-redis/v9" "github.com/spf13/pflag" ) diff --git a/validator/valnode/redis/consumer.go b/validator/valnode/redis/consumer.go index e0d53ffb2e..4392a3c91e 100644 --- a/validator/valnode/redis/consumer.go +++ b/validator/valnode/redis/consumer.go @@ -161,9 +161,13 @@ func (s *ValidationServer) Start(ctx_in context.Context) { res, err := valRun.Await(ctx) if err != nil { log.Error("Error validating", "request value", work.req.Value, "error", err) + work.req.Ack() } else { log.Debug("done work", "thread", i, "workid", work.req.ID) - if err := s.consumers[work.moduleRoot].SetResult(ctx, work.req.ID, res); err != nil { + err := s.consumers[work.moduleRoot].SetResult(ctx, work.req.ID, res) + // Even in error we close ackNotifier as there's no retry mechanism here and closing it will alow other consumers to autoclaim + work.req.Ack() + if err != nil { log.Error("Error setting result for request", "id", work.req.ID, "result", res, "error", err) } log.Debug("set result", "thread", i, "workid", work.req.ID)