Skip to content

Commit

Permalink
fix(metastore): local raft server id (#3530)
Browse files Browse the repository at this point in the history
{Suffrage:Voter ID:pyroscope-metastore-2.pyroscope-metastore-headless.pyroscope-test.svc.cluster.local. Address:pyroscope-metastore-2.pyroscope-metastore-headless.pyroscope-test.svc.cluster.local.:9099}

{Suffrage:Voter ID:pyroscope-metastore-2.pyroscope-metastore-headless.pyroscope-test.svc.cluster.local.:9099 Address:pyroscope-metastore-2.pyroscope-metastore-headless.pyroscope-test.svc.cluster.local.:9099}]"

- fix bootstrap of metastore with correct server id
- add bootstrap retries
  • Loading branch information
korniltsev authored Aug 29, 2024
1 parent 7709a22 commit b2ce6a2
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 7 deletions.
5 changes: 5 additions & 0 deletions pkg/experiment/metastore/metastore.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (

"github.com/go-kit/log"
"github.com/go-kit/log/level"
"github.com/grafana/dskit/dns"
"github.com/grafana/dskit/flagext"
"github.com/grafana/dskit/grpcclient"
"github.com/grafana/dskit/services"
Expand Down Expand Up @@ -130,6 +131,8 @@ type Metastore struct {
metrics *metastoreMetrics
client *metastoreclient.Client
readySince time.Time

dnsProvider *dns.Provider
}

// Limits declares no methods, so any value satisfies it.
// NOTE(review): presumably a placeholder for per-tenant limits — confirm against callers.
type Limits interface{}
Expand Down Expand Up @@ -226,6 +229,8 @@ func (m *Metastore) initRaft() (err error) {
if err = m.bootstrap(); err != nil {
return fmt.Errorf("failed to bootstrap cluster: %w", err)
}
} else {
_ = level.Info(m.logger).Log("msg", "restoring existing state, not bootstraping")
}

m.leaderhealth.Register(m.raft, metastoreRaftLeaderHealthServiceName)
Expand Down
41 changes: 35 additions & 6 deletions pkg/experiment/metastore/metastore_bootstrap.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"context"
"errors"
"fmt"
"github.com/grafana/dskit/backoff"
"net"
"slices"
"strings"
Expand All @@ -16,7 +17,7 @@ import (
)

func (m *Metastore) bootstrap() error {
peers, err := m.bootstrapPeers()
peers, err := m.bootstrapPeersWithRetries()
if err != nil {
return fmt.Errorf("failed to resolve peers: %w", err)
}
Expand Down Expand Up @@ -66,11 +67,13 @@ func (m *Metastore) bootstrapPeers() ([]raft.Server, error) {
if len(resolve) > 0 {
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
prov := dns.NewProvider(m.logger, m.reg, dns.MiekgdnsResolverType)
if err := prov.Resolve(ctx, resolve); err != nil {
if m.dnsProvider == nil {
m.dnsProvider = dns.NewProvider(m.logger, m.reg, dns.MiekgdnsResolverType)
}
if err := m.dnsProvider.Resolve(ctx, resolve); err != nil {
return nil, fmt.Errorf("failed to resolve bootstrap peers: %w", err)
}
resolvedPeers := prov.Addresses()
resolvedPeers := m.dnsProvider.Addresses()
if len(resolvedPeers) == 0 {
// The local node is the only one in the cluster, but peers
// were supposed to be present. Stop here to avoid bootstrapping
Expand All @@ -95,8 +98,8 @@ func (m *Metastore) bootstrapPeers() ([]raft.Server, error) {
return a.ID == b.ID
})
if len(peers) != m.config.Raft.BootstrapExpectPeers {
return nil, fmt.Errorf("expected number of bootstrap peers not reached: got %d, expected %d",
len(peers), m.config.Raft.BootstrapExpectPeers)
return nil, fmt.Errorf("expected number of bootstrap peers not reached: got %d, expected %d\n%+v",
len(peers), m.config.Raft.BootstrapExpectPeers, peers)
}
return peers, nil
}
Expand Down Expand Up @@ -127,3 +130,29 @@ func parsePeer(raw string) raft.Server {
Address: raft.ServerAddress(addr),
}
}

// bootstrapPeersWithRetries resolves the raft bootstrap peers by calling
// m.bootstrapPeers repeatedly until it succeeds or the retry budget is spent.
//
// The retry loop is driven by a dskit backoff configured with MinBackoff 1s,
// MaxBackoff 10s and MaxRetries 20, bound to context.Background(). Every
// attempt is logged at debug level; a failed attempt is additionally logged
// at error level before waiting for the next one.
//
// It returns the peers from the first successful attempt. If no attempt
// succeeds, it returns nil peers and an error wrapping the error of the last
// attempt.
func (m *Metastore) bootstrapPeersWithRetries() (peers []raft.Server, err error) {
	// Named "retries" rather than "backoff" so the imported backoff package
	// is not shadowed for the remainder of the function.
	retries := backoff.New(context.Background(), backoff.Config{
		MinBackoff: 1 * time.Second,
		MaxBackoff: 10 * time.Second,
		MaxRetries: 20,
	})
	for retries.Ongoing() {
		peers, err = m.bootstrapPeers()
		// Logging is best-effort: a logger failure must not abort bootstrap.
		_ = level.Debug(m.logger).Log("msg", "resolving bootstrap peers", "peers", fmt.Sprint(peers), "err", err)
		if err == nil {
			return peers, nil
		}
		_ = level.Error(m.logger).Log("msg", "failed to resolve bootstrap peers", "err", err)
		retries.Wait()
	}
	return nil, fmt.Errorf("failed to resolve bootstrap peers after %d retries: %w", retries.NumRetries(), err)
}
2 changes: 1 addition & 1 deletion tools/dev/experiment/values-micro-services-experiment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ pyroscope:
query-backend.address: "dns:///_grpc._tcp.pyroscope-query-worker-headless.$(NAMESPACE_FQDN):9095"
metastore.address: "dns:///_grpc._tcp.pyroscope-metastore-headless.$(NAMESPACE_FQDN):9095"
metastore.raft.bind-address: ":9099"
metastore.raft.server-id: "$(POD_NAME).pyroscope-metastore-headless.$(NAMESPACE_FQDN)"
metastore.raft.server-id: "$(POD_NAME).pyroscope-metastore-headless.$(NAMESPACE_FQDN):9099"
metastore.raft.advertise-address: "$(POD_NAME).pyroscope-metastore-headless.$(NAMESPACE_FQDN):9099"
metastore.raft.bootstrap-peers: "dnssrvnoa+_raft._tcp.pyroscope-metastore-headless.$(NAMESPACE_FQDN):9099"
metastore.raft.bootstrap-expect-peers: "3"
Expand Down

0 comments on commit b2ce6a2

Please sign in to comment.