Skip to content

Commit

Permalink
Return GNMI API error when ZMQ operation failed. (#270)
Browse files Browse the repository at this point in the history
Return a GNMI API error when a ZMQ operation fails.

#### Why I did it
When ZMQ is full, the GNMI service crashes.

#### How I did it
Return an API error instead of panicking.

#### How to verify it
Tested manually.

#### Work item tracking
Microsoft ADO (number only): 28694022

#### Which release branch to backport (provide reason below if selected)

<!--
- Note we only backport fixes to a release branch, *not* features!
- Please also provide a reason for the backporting below.
- e.g.
- [x] 202006
-->

- [ ] 201811
- [ ] 201911
- [ ] 202006
- [ ] 202012
- [ ] 202106
- [ ] 202111

#### Description for the changelog
Return a GNMI API error when a ZMQ operation fails.

#### Link to config_db schema for YANG module changes
<!--
Provide a link to config_db schema for the table for which YANG model
is defined
Link should point to correct section on https://github.com/Azure/SONiC/wiki/Configuration.
-->

#### A picture of a cute animal (not mandatory but encouraged)
  • Loading branch information
liuh-80 authored Jul 15, 2024
1 parent a85dfc1 commit ccce9a2
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 25 deletions.
34 changes: 33 additions & 1 deletion sonic_data_client/client_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
"encoding/json"
"fmt"

"github.com/agiledragon/gomonkey/v2"
"github.com/jipanyang/gnxi/utils/xpath"
"github.com/sonic-net/sonic-gnmi/swsscommon"
"github.com/sonic-net/sonic-gnmi/test_utils"
Expand Down Expand Up @@ -557,7 +558,7 @@ func TestRetryHelper(t *testing.T) {
exeCount++
if returnError {
returnError = false
return fmt.Errorf("connection_reset")
return fmt.Errorf("zmq connection break, endpoint: tcp://127.0.0.1:2234")
}
return nil
})
Expand All @@ -574,6 +575,37 @@ func TestRetryHelper(t *testing.T) {
swsscommon.DeleteZmqServer(zmqServer)
}

// TestRetryHelperReconnect verifies the failure path of RetryHelper: when the
// action keeps failing with a ZMQ connection-break error, RetryHelper must
// retry, then remove the ZMQ client so it can be re-created, and finally
// return the error to the caller instead of panicking.
func TestRetryHelperReconnect(t *testing.T) {
	// create ZMQ server
	zmqServer := swsscommon.NewZmqServer("tcp://*:2234")
	// Deferred so the port is released even if the helper under test panics.
	defer swsscommon.DeleteZmqServer(zmqServer)

	// Replace removeZmqClient with a stub that only records the call, so the
	// test can observe that RetryHelper requested the client removal.
	zmqClientRemoved := false
	mockRemoveZmqClient := gomonkey.ApplyFunc(removeZmqClient, func(zmqClient swsscommon.ZmqClient) error {
		zmqClientRemoved = true
		return nil
	})
	defer mockRemoveZmqClient.Reset()

	// create ZMQ client side
	zmqAddress := "tcp://127.0.0.1:2234"
	zmqClient := swsscommon.NewZmqClient(zmqAddress)
	// Registered last so the client is deleted before the server, matching
	// the original teardown order.
	defer swsscommon.DeleteZmqClient(zmqClient)

	// The action always fails with a connection-break error, which forces
	// RetryHelper through its retry loop and into the give-up branch.
	exeCount := 0
	err := RetryHelper(
		zmqClient,
		func() error {
			exeCount++
			return fmt.Errorf("zmq connection break, endpoint: tcp://127.0.0.1:2234")
		})

	if err == nil {
		t.Errorf("RetryHelper does not return error when ZMQ operation keeps failing")
	}

	// The first failure always satisfies retry <= MAX_RETRY_COUNT (retry
	// starts at 0), so the action must run at least twice.
	if exeCount < 2 {
		t.Errorf("RetryHelper does not retry on ZMQ connection break, action executed %d time(s)", exeCount)
	}

	if !zmqClientRemoved {
		t.Errorf("RetryHelper does not remove ZMQ client for reconnect")
	}
}

func TestGetDpuAddress(t *testing.T) {
// prepare data according to design doc
// Design doc: https://github.com/sonic-net/SONiC/blob/master/doc/smart-switch/ip-address-assigment/smart-switch-ip-address-assignment.md?plain=1
Expand Down
44 changes: 20 additions & 24 deletions sonic_data_client/mixed_db_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -285,33 +285,33 @@ func ProducerStateTableDeleteWrapper(pt swsscommon.ProducerStateTable, key strin

type ActionNeedRetry func() error

func RetryHelper(zmqClient swsscommon.ZmqClient, action ActionNeedRetry) {
func RetryHelper(zmqClient swsscommon.ZmqClient, action ActionNeedRetry) error {
var retry uint = 0
var retry_delay = time.Duration(RETRY_DELAY_MILLISECOND) * time.Millisecond
ConnectionResetErr := "connection_reset"
ConnectionResetErr := "zmq connection break"
for {
err := action()
if err != nil {
if (err.Error() == ConnectionResetErr && retry <= MAX_RETRY_COUNT) {
log.V(6).Infof("RetryHelper: connection reset, reconnect and retry later")
time.Sleep(retry_delay)

zmqClient.Connect()
retry_delay *= time.Duration(RETRY_DELAY_FACTOR)
retry++
continue
}
if strings.Contains(err.Error(), ConnectionResetErr) {
if (retry <= MAX_RETRY_COUNT) {
log.V(6).Infof("RetryHelper: connection reset, reconnect and retry later")
time.Sleep(retry_delay)

zmqClient.Connect()
retry_delay *= time.Duration(RETRY_DELAY_FACTOR)
retry++
continue
}

// Force re-create ZMQ client
removeZmqErr := removeZmqClient(zmqClient)
if removeZmqErr != nil {
log.V(6).Infof("RetryHelper: remove ZMQ client error: %v", removeZmqErr)
// Force re-create ZMQ client when connection reset
removeZmqErr := removeZmqClient(zmqClient)
if removeZmqErr != nil {
log.V(6).Infof("RetryHelper: remove ZMQ client error: %v", removeZmqErr)
}
}

panic(err)
}

return
return err
}
}

Expand All @@ -325,24 +325,20 @@ func (c *MixedDbClient) DbSetTable(table string, key string, values map[string]s
}

pt := c.GetTable(table)
RetryHelper(
return RetryHelper(
c.zmqClient,
func () error {
return ProducerStateTableSetWrapper(pt, key, vec)
})

return nil
}

func (c *MixedDbClient) DbDelTable(table string, key string) error {
pt := c.GetTable(table)
RetryHelper(
return RetryHelper(
c.zmqClient,
func () error {
return ProducerStateTableDeleteWrapper(pt, key)
})

return nil
}

// For example, the GNMI path below points to DASH_QOS table in the DPU_APPL_DB database for dpu0:
Expand Down

0 comments on commit ccce9a2

Please sign in to comment.