From b296709cf4f3cb0a83c9c7482d89c5ce24ad9c71 Mon Sep 17 00:00:00 2001 From: Anton Kolesnikov Date: Wed, 3 Jul 2024 20:32:57 +0800 Subject: [PATCH] Add metastore client (#3397) --- .../index.md | 4 +- .../rendered/micro-services-hpa.yaml | 101 +++++-------- .../pyroscope/rendered/micro-services.yaml | 117 ++++++--------- .../pyroscope/rendered/single-binary.yaml | 59 +------- .../templates/deployments-statefulsets.yaml | 11 +- .../templates/service-metastore.yaml | 41 ----- .../helm/pyroscope/templates/services.yaml | 11 ++ pkg/frontend/frontend.go | 12 +- pkg/metastore/client/client.go | 48 +++++- pkg/metastore/metastore.go | 45 +++--- pkg/metastore/metastore_bootstrap.go | 142 ++++++++---------- pkg/metastore/metastore_test.go | 90 ----------- pkg/metastore/raftleader/raftleader.go | 109 ++++++++++++++ pkg/phlare/modules.go | 20 +-- pkg/phlare/phlare.go | 33 ++-- pkg/util/health/health.go | 33 ++++ 16 files changed, 422 insertions(+), 454 deletions(-) delete mode 100644 operations/pyroscope/helm/pyroscope/templates/service-metastore.yaml delete mode 100644 pkg/metastore/metastore_test.go create mode 100644 pkg/metastore/raftleader/raftleader.go create mode 100644 pkg/util/health/health.go diff --git a/docs/sources/configure-server/reference-configuration-parameters/index.md b/docs/sources/configure-server/reference-configuration-parameters/index.md index 1982aa0bfa..87ab932618 100644 --- a/docs/sources/configure-server/reference-configuration-parameters/index.md +++ b/docs/sources/configure-server/reference-configuration-parameters/index.md @@ -161,10 +161,10 @@ metastore: [bootstrap_peers: | default = []] # CLI flag: -metastore.raft.server-id - [server_id: | default = "localhost"] + [server_id: | default = "localhost:9099"] # CLI flag: -metastore.raft.bind-address - [bind_address: | default = ":9099"] + [bind_address: | default = "localhost:9099"] # CLI flag: -metastore.raft.advertise-address [advertise_address: | default = "localhost:9099"] diff --git a/operations/pyroscope/helm/pyroscope/rendered/micro-services-hpa.yaml b/operations/pyroscope/helm/pyroscope/rendered/micro-services-hpa.yaml index f2c89a6152..dae81d3e68 100644 --- a/operations/pyroscope/helm/pyroscope/rendered/micro-services-hpa.yaml +++ b/operations/pyroscope/helm/pyroscope/rendered/micro-services-hpa.yaml @@ -1720,58 +1720,6 @@ spec: # TODO: Ensure only services that offer memberlist register # pyroscope.grafana.com/memberlist: "true" --- -# Source: pyroscope/templates/service-metastore.yaml -apiVersion: v1 -kind: Service -metadata: - name: pyroscope-dev-metastore-grpc - namespace: default - labels: - helm.sh/chart: pyroscope-1.6.1 - app.kubernetes.io/name: pyroscope - app.kubernetes.io/instance: pyroscope-dev - app.kubernetes.io/version: "1.6.1" - app.kubernetes.io/managed-by: Helm -spec: - type: ClusterIP - clusterIP: None - ports: - - name: metastore-grpc - port: 9095 - protocol: TCP - targetPort: 9095 - publishNotReadyAddresses: true - selector: - app.kubernetes.io/name: pyroscope - app.kubernetes.io/instance: pyroscope-dev - app.kubernetes.io/component: "metastore" ---- -# Source: pyroscope/templates/service-metastore.yaml -apiVersion: v1 -kind: Service -metadata: - name: pyroscope-dev-metastore-raft - namespace: default - labels: - helm.sh/chart: pyroscope-1.6.1 - app.kubernetes.io/name: pyroscope - app.kubernetes.io/instance: pyroscope-dev - app.kubernetes.io/version: "1.6.1" - app.kubernetes.io/managed-by: Helm -spec: - type: ClusterIP - clusterIP: None - ports: - - name: metastore-raft - port: 9099 - protocol: TCP - targetPort: 9099 - publishNotReadyAddresses: false - selector: - app.kubernetes.io/name: pyroscope - app.kubernetes.io/instance: pyroscope-dev - app.kubernetes.io/component: "metastore" ---- # Source: pyroscope/templates/service-query-worker.yaml apiVersion: v1 kind: Service @@ -2208,8 +2156,11 @@ spec: - "-server.http-listen-port=4040" - "-memberlist.cluster-label=default-pyroscope-dev" - "-memberlist.join=dns+pyroscope-dev-memberlist.default.svc.cluster.local.:7946" - - "-metastore.address=dns:///pyroscope-dev-metastore-grpc.default.svc.cluster.local.:9095" - - "-metastore.raft.bootstrap-peers=dns:///pyroscope-dev-metastore-raft.default.svc.cluster.local.:9099" + - "-metastore.raft.bind-address=:9099" + - "-metastore.raft.advertise-address=($HOSTNAME).pyroscope-dev-metastore-headless.default.svc.cluster.local.:9099" + - "-metastore.raft.server-id=($HOSTNAME).pyroscope-dev-metastore-headless.default.svc.cluster.local.:9099" + - "-metastore.raft.bootstrap-peers=dnssrvnoa+_raft._tcp.pyroscope-dev-metastore-headless.default.svc.cluster.local.:9099" + - "-metastore.address=dns:///_grpc._tcp.pyroscope-dev-metastore-headless.default.svc.cluster.local.:9095" - "-config.file=/etc/pyroscope/config.yaml" - "-runtime-config.file=/etc/pyroscope/overrides/overrides.yaml" - "-log.level=debug" @@ -2303,8 +2254,11 @@ spec: - "-server.http-listen-port=4040" - "-memberlist.cluster-label=default-pyroscope-dev" - "-memberlist.join=dns+pyroscope-dev-memberlist.default.svc.cluster.local.:7946" - - "-metastore.address=dns:///pyroscope-dev-metastore-grpc.default.svc.cluster.local.:9095" - - "-metastore.raft.bootstrap-peers=dns:///pyroscope-dev-metastore-raft.default.svc.cluster.local.:9099" + - "-metastore.raft.bind-address=:9099" + - "-metastore.raft.advertise-address=($HOSTNAME).pyroscope-dev-metastore-headless.default.svc.cluster.local.:9099" + - "-metastore.raft.server-id=($HOSTNAME).pyroscope-dev-metastore-headless.default.svc.cluster.local.:9099" + - "-metastore.raft.bootstrap-peers=dnssrvnoa+_raft._tcp.pyroscope-dev-metastore-headless.default.svc.cluster.local.:9099" + - "-metastore.address=dns:///_grpc._tcp.pyroscope-dev-metastore-headless.default.svc.cluster.local.:9095" - "-config.file=/etc/pyroscope/config.yaml" - "-runtime-config.file=/etc/pyroscope/overrides/overrides.yaml" - "-log.level=debug" @@ -2398,8 +2352,11 @@ spec: - "-server.http-listen-port=4040" - "-memberlist.cluster-label=default-pyroscope-dev" - "-memberlist.join=dns+pyroscope-dev-memberlist.default.svc.cluster.local.:7946" - - "-metastore.address=dns:///pyroscope-dev-metastore-grpc.default.svc.cluster.local.:9095" - - "-metastore.raft.bootstrap-peers=dns:///pyroscope-dev-metastore-raft.default.svc.cluster.local.:9099" + - "-metastore.raft.bind-address=:9099" + - "-metastore.raft.advertise-address=($HOSTNAME).pyroscope-dev-metastore-headless.default.svc.cluster.local.:9099" + - "-metastore.raft.server-id=($HOSTNAME).pyroscope-dev-metastore-headless.default.svc.cluster.local.:9099" + - "-metastore.raft.bootstrap-peers=dnssrvnoa+_raft._tcp.pyroscope-dev-metastore-headless.default.svc.cluster.local.:9099" + - "-metastore.address=dns:///_grpc._tcp.pyroscope-dev-metastore-headless.default.svc.cluster.local.:9095" - "-config.file=/etc/pyroscope/config.yaml" - "-runtime-config.file=/etc/pyroscope/overrides/overrides.yaml" - "-log.level=debug" @@ -2493,8 +2450,11 @@ spec: - "-server.http-listen-port=4040" - "-memberlist.cluster-label=default-pyroscope-dev" - "-memberlist.join=dns+pyroscope-dev-memberlist.default.svc.cluster.local.:7946" - - "-metastore.address=dns:///pyroscope-dev-metastore-grpc.default.svc.cluster.local.:9095" - - "-metastore.raft.bootstrap-peers=dns:///pyroscope-dev-metastore-raft.default.svc.cluster.local.:9099" + - "-metastore.raft.bind-address=:9099" + - "-metastore.raft.advertise-address=($HOSTNAME).pyroscope-dev-metastore-headless.default.svc.cluster.local.:9099" + - "-metastore.raft.server-id=($HOSTNAME).pyroscope-dev-metastore-headless.default.svc.cluster.local.:9099" + - "-metastore.raft.bootstrap-peers=dnssrvnoa+_raft._tcp.pyroscope-dev-metastore-headless.default.svc.cluster.local.:9099" + - "-metastore.address=dns:///_grpc._tcp.pyroscope-dev-metastore-headless.default.svc.cluster.local.:9095" - "-config.file=/etc/pyroscope/config.yaml" - "-runtime-config.file=/etc/pyroscope/overrides/overrides.yaml" - "-log.level=debug" @@ -2889,8 +2849,11 @@ spec: - "-server.http-listen-port=4040" - "-memberlist.cluster-label=default-pyroscope-dev" - "-memberlist.join=dns+pyroscope-dev-memberlist.default.svc.cluster.local.:7946" - - "-metastore.address=dns:///pyroscope-dev-metastore-grpc.default.svc.cluster.local.:9095" - - "-metastore.raft.bootstrap-peers=dns:///pyroscope-dev-metastore-raft.default.svc.cluster.local.:9099" + - "-metastore.raft.bind-address=:9099" + - "-metastore.raft.advertise-address=($HOSTNAME).pyroscope-dev-metastore-headless.default.svc.cluster.local.:9099" + - "-metastore.raft.server-id=($HOSTNAME).pyroscope-dev-metastore-headless.default.svc.cluster.local.:9099" + - "-metastore.raft.bootstrap-peers=dnssrvnoa+_raft._tcp.pyroscope-dev-metastore-headless.default.svc.cluster.local.:9099" + - "-metastore.address=dns:///_grpc._tcp.pyroscope-dev-metastore-headless.default.svc.cluster.local.:9095" - "-config.file=/etc/pyroscope/config.yaml" - "-runtime-config.file=/etc/pyroscope/overrides/overrides.yaml" - "-log.level=debug" @@ -2992,8 +2955,11 @@ spec: - "-server.http-listen-port=4040" - "-memberlist.cluster-label=default-pyroscope-dev" - "-memberlist.join=dns+pyroscope-dev-memberlist.default.svc.cluster.local.:7946" - - "-metastore.address=dns:///pyroscope-dev-metastore-grpc.default.svc.cluster.local.:9095" - - "-metastore.raft.bootstrap-peers=dns:///pyroscope-dev-metastore-raft.default.svc.cluster.local.:9099" + - "-metastore.raft.bind-address=:9099" + - "-metastore.raft.advertise-address=($HOSTNAME).pyroscope-dev-metastore-headless.default.svc.cluster.local.:9099" + - "-metastore.raft.server-id=($HOSTNAME).pyroscope-dev-metastore-headless.default.svc.cluster.local.:9099" + - "-metastore.raft.bootstrap-peers=dnssrvnoa+_raft._tcp.pyroscope-dev-metastore-headless.default.svc.cluster.local.:9099" + - "-metastore.address=dns:///_grpc._tcp.pyroscope-dev-metastore-headless.default.svc.cluster.local.:9095" - "-config.file=/etc/pyroscope/config.yaml" - "-runtime-config.file=/etc/pyroscope/overrides/overrides.yaml" - "-log.level=debug" @@ -3091,8 +3057,11 @@ spec: - "-server.http-listen-port=4040" - "-memberlist.cluster-label=default-pyroscope-dev" - "-memberlist.join=dns+pyroscope-dev-memberlist.default.svc.cluster.local.:7946" - - "-metastore.address=dns:///pyroscope-dev-metastore-grpc.default.svc.cluster.local.:9095" - - "-metastore.raft.bootstrap-peers=dns:///pyroscope-dev-metastore-raft.default.svc.cluster.local.:9099" + - "-metastore.raft.bind-address=:9099" + - "-metastore.raft.advertise-address=($HOSTNAME).pyroscope-dev-metastore-headless.default.svc.cluster.local.:9099" + - "-metastore.raft.server-id=($HOSTNAME).pyroscope-dev-metastore-headless.default.svc.cluster.local.:9099" + - "-metastore.raft.bootstrap-peers=dnssrvnoa+_raft._tcp.pyroscope-dev-metastore-headless.default.svc.cluster.local.:9099" + - "-metastore.address=dns:///_grpc._tcp.pyroscope-dev-metastore-headless.default.svc.cluster.local.:9095" - "-config.file=/etc/pyroscope/config.yaml" - "-runtime-config.file=/etc/pyroscope/overrides/overrides.yaml" - "-log.level=debug" diff --git a/operations/pyroscope/helm/pyroscope/rendered/micro-services.yaml b/operations/pyroscope/helm/pyroscope/rendered/micro-services.yaml index ac4a861001..de5a300438 100644 --- a/operations/pyroscope/helm/pyroscope/rendered/micro-services.yaml +++ b/operations/pyroscope/helm/pyroscope/rendered/micro-services.yaml @@ -1741,58 +1741,6 @@ spec: # TODO: Ensure only services that offer memberlist register # pyroscope.grafana.com/memberlist: "true" --- -# Source: pyroscope/templates/service-metastore.yaml -apiVersion: v1 -kind: Service -metadata: - name: pyroscope-dev-metastore-grpc - namespace: default - labels: - helm.sh/chart: pyroscope-1.6.1 - app.kubernetes.io/name: pyroscope - app.kubernetes.io/instance: pyroscope-dev - app.kubernetes.io/version: "1.6.1" - app.kubernetes.io/managed-by: Helm -spec: - type: ClusterIP - clusterIP: None - ports: - - name: metastore-grpc - port: 9095 - protocol: TCP - targetPort: 9095 - publishNotReadyAddresses: true - selector: - app.kubernetes.io/name: pyroscope - app.kubernetes.io/instance: pyroscope-dev - app.kubernetes.io/component: "metastore" ---- -# Source: pyroscope/templates/service-metastore.yaml -apiVersion: v1 -kind: Service -metadata: - name: pyroscope-dev-metastore-raft - namespace: default - labels: - helm.sh/chart: pyroscope-1.6.1 - app.kubernetes.io/name: pyroscope - app.kubernetes.io/instance: pyroscope-dev - app.kubernetes.io/version: "1.6.1" - app.kubernetes.io/managed-by: Helm -spec: - type: ClusterIP - clusterIP: None - ports: - - name: metastore-raft - port: 9099 - protocol: TCP - targetPort: 9099 - publishNotReadyAddresses: false - selector: - app.kubernetes.io/name: pyroscope - app.kubernetes.io/instance: pyroscope-dev - app.kubernetes.io/component: "metastore" ---- # Source: pyroscope/templates/service-query-worker.yaml apiVersion: v1 kind: Service @@ -2018,6 +1966,15 @@ spec: targetPort: http2 protocol: TCP name: http2 + - name: grpc + port: 9095 + protocol: TCP + targetPort: 9095 + - name: raft + port: 9099 + protocol: TCP + targetPort: 9099 + publishNotReadyAddresses: true selector: app.kubernetes.io/name: pyroscope app.kubernetes.io/instance: pyroscope-dev @@ -2281,8 +2238,11 @@ spec: - "-server.http-listen-port=4040" - "-memberlist.cluster-label=default-pyroscope-dev" - "-memberlist.join=dns+pyroscope-dev-memberlist.default.svc.cluster.local.:7946" - - "-metastore.address=dns:///pyroscope-dev-metastore-grpc.default.svc.cluster.local.:9095" - - "-metastore.raft.bootstrap-peers=dns:///pyroscope-dev-metastore-raft.default.svc.cluster.local.:9099" + - "-metastore.raft.bind-address=:9099" + - "-metastore.raft.advertise-address=($HOSTNAME).pyroscope-dev-metastore-headless.default.svc.cluster.local.:9099" + - "-metastore.raft.server-id=($HOSTNAME).pyroscope-dev-metastore-headless.default.svc.cluster.local.:9099" + - "-metastore.raft.bootstrap-peers=dnssrvnoa+_raft._tcp.pyroscope-dev-metastore-headless.default.svc.cluster.local.:9099" + - "-metastore.address=dns:///_grpc._tcp.pyroscope-dev-metastore-headless.default.svc.cluster.local.:9095" - "-config.file=/etc/pyroscope/config.yaml" - "-runtime-config.file=/etc/pyroscope/overrides/overrides.yaml" - "-log.level=debug" @@ -2377,8 +2337,11 @@ spec: - "-server.http-listen-port=4040" - "-memberlist.cluster-label=default-pyroscope-dev" - "-memberlist.join=dns+pyroscope-dev-memberlist.default.svc.cluster.local.:7946" - - "-metastore.address=dns:///pyroscope-dev-metastore-grpc.default.svc.cluster.local.:9095" - - "-metastore.raft.bootstrap-peers=dns:///pyroscope-dev-metastore-raft.default.svc.cluster.local.:9099" + - "-metastore.raft.bind-address=:9099" + - "-metastore.raft.advertise-address=($HOSTNAME).pyroscope-dev-metastore-headless.default.svc.cluster.local.:9099" + - "-metastore.raft.server-id=($HOSTNAME).pyroscope-dev-metastore-headless.default.svc.cluster.local.:9099" + - "-metastore.raft.bootstrap-peers=dnssrvnoa+_raft._tcp.pyroscope-dev-metastore-headless.default.svc.cluster.local.:9099" + - "-metastore.address=dns:///_grpc._tcp.pyroscope-dev-metastore-headless.default.svc.cluster.local.:9095" - "-config.file=/etc/pyroscope/config.yaml" - "-runtime-config.file=/etc/pyroscope/overrides/overrides.yaml" - "-log.level=debug" @@ -2473,8 +2436,11 @@ spec: - "-server.http-listen-port=4040" - "-memberlist.cluster-label=default-pyroscope-dev" - "-memberlist.join=dns+pyroscope-dev-memberlist.default.svc.cluster.local.:7946" - - "-metastore.address=dns:///pyroscope-dev-metastore-grpc.default.svc.cluster.local.:9095" - - "-metastore.raft.bootstrap-peers=dns:///pyroscope-dev-metastore-raft.default.svc.cluster.local.:9099" + - "-metastore.raft.bind-address=:9099" + - "-metastore.raft.advertise-address=($HOSTNAME).pyroscope-dev-metastore-headless.default.svc.cluster.local.:9099" + - "-metastore.raft.server-id=($HOSTNAME).pyroscope-dev-metastore-headless.default.svc.cluster.local.:9099" + - "-metastore.raft.bootstrap-peers=dnssrvnoa+_raft._tcp.pyroscope-dev-metastore-headless.default.svc.cluster.local.:9099" + - "-metastore.address=dns:///_grpc._tcp.pyroscope-dev-metastore-headless.default.svc.cluster.local.:9095" - "-config.file=/etc/pyroscope/config.yaml" - "-runtime-config.file=/etc/pyroscope/overrides/overrides.yaml" - "-log.level=debug" @@ -2569,8 +2535,11 @@ spec: - "-server.http-listen-port=4040" - "-memberlist.cluster-label=default-pyroscope-dev" - "-memberlist.join=dns+pyroscope-dev-memberlist.default.svc.cluster.local.:7946" - - "-metastore.address=dns:///pyroscope-dev-metastore-grpc.default.svc.cluster.local.:9095" - - "-metastore.raft.bootstrap-peers=dns:///pyroscope-dev-metastore-raft.default.svc.cluster.local.:9099" + - "-metastore.raft.bind-address=:9099" + - "-metastore.raft.advertise-address=($HOSTNAME).pyroscope-dev-metastore-headless.default.svc.cluster.local.:9099" + - "-metastore.raft.server-id=($HOSTNAME).pyroscope-dev-metastore-headless.default.svc.cluster.local.:9099" + - "-metastore.raft.bootstrap-peers=dnssrvnoa+_raft._tcp.pyroscope-dev-metastore-headless.default.svc.cluster.local.:9099" + - "-metastore.address=dns:///_grpc._tcp.pyroscope-dev-metastore-headless.default.svc.cluster.local.:9095" - "-config.file=/etc/pyroscope/config.yaml" - "-runtime-config.file=/etc/pyroscope/overrides/overrides.yaml" - "-log.level=debug" @@ -2850,8 +2819,11 @@ spec: - "-server.http-listen-port=4040" - "-memberlist.cluster-label=default-pyroscope-dev" - "-memberlist.join=dns+pyroscope-dev-memberlist.default.svc.cluster.local.:7946" - - "-metastore.address=dns:///pyroscope-dev-metastore-grpc.default.svc.cluster.local.:9095" - - "-metastore.raft.bootstrap-peers=dns:///pyroscope-dev-metastore-raft.default.svc.cluster.local.:9099" + - "-metastore.raft.bind-address=:9099" + - "-metastore.raft.advertise-address=($HOSTNAME).pyroscope-dev-metastore-headless.default.svc.cluster.local.:9099" + - "-metastore.raft.server-id=($HOSTNAME).pyroscope-dev-metastore-headless.default.svc.cluster.local.:9099" + - "-metastore.raft.bootstrap-peers=dnssrvnoa+_raft._tcp.pyroscope-dev-metastore-headless.default.svc.cluster.local.:9099" + - "-metastore.address=dns:///_grpc._tcp.pyroscope-dev-metastore-headless.default.svc.cluster.local.:9095" - "-config.file=/etc/pyroscope/config.yaml" - "-runtime-config.file=/etc/pyroscope/overrides/overrides.yaml" - "-log.level=debug" @@ -2953,8 +2925,11 @@ spec: - "-server.http-listen-port=4040" - "-memberlist.cluster-label=default-pyroscope-dev" - "-memberlist.join=dns+pyroscope-dev-memberlist.default.svc.cluster.local.:7946" - - "-metastore.address=dns:///pyroscope-dev-metastore-grpc.default.svc.cluster.local.:9095" - - "-metastore.raft.bootstrap-peers=dns:///pyroscope-dev-metastore-raft.default.svc.cluster.local.:9099" + - "-metastore.raft.bind-address=:9099" + - "-metastore.raft.advertise-address=($HOSTNAME).pyroscope-dev-metastore-headless.default.svc.cluster.local.:9099" + - "-metastore.raft.server-id=($HOSTNAME).pyroscope-dev-metastore-headless.default.svc.cluster.local.:9099" + - "-metastore.raft.bootstrap-peers=dnssrvnoa+_raft._tcp.pyroscope-dev-metastore-headless.default.svc.cluster.local.:9099" + - "-metastore.address=dns:///_grpc._tcp.pyroscope-dev-metastore-headless.default.svc.cluster.local.:9095" - "-config.file=/etc/pyroscope/config.yaml" - "-runtime-config.file=/etc/pyroscope/overrides/overrides.yaml" - "-log.level=debug" @@ -3052,8 +3027,11 @@ spec: - "-server.http-listen-port=4040" - "-memberlist.cluster-label=default-pyroscope-dev" - "-memberlist.join=dns+pyroscope-dev-memberlist.default.svc.cluster.local.:7946" - - "-metastore.address=dns:///pyroscope-dev-metastore-grpc.default.svc.cluster.local.:9095" - - "-metastore.raft.bootstrap-peers=dns:///pyroscope-dev-metastore-raft.default.svc.cluster.local.:9099" + - "-metastore.raft.bind-address=:9099" + - "-metastore.raft.advertise-address=($HOSTNAME).pyroscope-dev-metastore-headless.default.svc.cluster.local.:9099" + - "-metastore.raft.server-id=($HOSTNAME).pyroscope-dev-metastore-headless.default.svc.cluster.local.:9099" + - "-metastore.raft.bootstrap-peers=dnssrvnoa+_raft._tcp.pyroscope-dev-metastore-headless.default.svc.cluster.local.:9099" + - "-metastore.address=dns:///_grpc._tcp.pyroscope-dev-metastore-headless.default.svc.cluster.local.:9095" - "-config.file=/etc/pyroscope/config.yaml" - "-runtime-config.file=/etc/pyroscope/overrides/overrides.yaml" - "-log.level=debug" @@ -3155,8 +3133,11 @@ spec: - "-server.http-listen-port=4040" - "-memberlist.cluster-label=default-pyroscope-dev" - "-memberlist.join=dns+pyroscope-dev-memberlist.default.svc.cluster.local.:7946" - - "-metastore.address=dns:///pyroscope-dev-metastore-grpc.default.svc.cluster.local.:9095" - - "-metastore.raft.bootstrap-peers=dns:///pyroscope-dev-metastore-raft.default.svc.cluster.local.:9099" + - "-metastore.raft.bind-address=:9099" + - "-metastore.raft.advertise-address=($HOSTNAME).pyroscope-dev-metastore-headless.default.svc.cluster.local.:9099" + - "-metastore.raft.server-id=($HOSTNAME).pyroscope-dev-metastore-headless.default.svc.cluster.local.:9099" + - "-metastore.raft.bootstrap-peers=dnssrvnoa+_raft._tcp.pyroscope-dev-metastore-headless.default.svc.cluster.local.:9099" + - "-metastore.address=dns:///_grpc._tcp.pyroscope-dev-metastore-headless.default.svc.cluster.local.:9095" - "-config.file=/etc/pyroscope/config.yaml" - "-runtime-config.file=/etc/pyroscope/overrides/overrides.yaml" - "-log.level=debug" diff --git a/operations/pyroscope/helm/pyroscope/rendered/single-binary.yaml b/operations/pyroscope/helm/pyroscope/rendered/single-binary.yaml index a3565938a6..0fbc1fb7e6 100644 --- a/operations/pyroscope/helm/pyroscope/rendered/single-binary.yaml +++ b/operations/pyroscope/helm/pyroscope/rendered/single-binary.yaml @@ -1178,58 +1178,6 @@ spec: # TODO: Ensure only services that offer memberlist register # pyroscope.grafana.com/memberlist: "true" --- -# Source: pyroscope/templates/service-metastore.yaml -apiVersion: v1 -kind: Service -metadata: - name: pyroscope-dev-metastore-grpc - namespace: default - labels: - helm.sh/chart: pyroscope-1.6.1 - app.kubernetes.io/name: pyroscope - app.kubernetes.io/instance: pyroscope-dev - app.kubernetes.io/version: "1.6.1" - app.kubernetes.io/managed-by: Helm -spec: - type: ClusterIP - clusterIP: None - ports: - - name: metastore-grpc - port: 9095 - protocol: TCP - targetPort: 9095 - publishNotReadyAddresses: true - selector: - app.kubernetes.io/name: pyroscope - app.kubernetes.io/instance: pyroscope-dev - app.kubernetes.io/component: "metastore" ---- -# Source: pyroscope/templates/service-metastore.yaml -apiVersion: v1 -kind: Service -metadata: - name: pyroscope-dev-metastore-raft - namespace: default - labels: - helm.sh/chart: pyroscope-1.6.1 - app.kubernetes.io/name: pyroscope - app.kubernetes.io/instance: pyroscope-dev - app.kubernetes.io/version: "1.6.1" - app.kubernetes.io/managed-by: Helm -spec: - type: ClusterIP - clusterIP: None - ports: - - name: metastore-raft - port: 9099 - protocol: TCP - targetPort: 9099 - publishNotReadyAddresses: false - selector: - app.kubernetes.io/name: pyroscope - app.kubernetes.io/instance: pyroscope-dev - app.kubernetes.io/component: "metastore" ---- # Source: pyroscope/templates/service-query-worker.yaml apiVersion: v1 kind: Service @@ -1451,8 +1399,11 @@ spec: - "-server.http-listen-port=4040" - "-memberlist.cluster-label=default-pyroscope-dev" - "-memberlist.join=dns+pyroscope-dev-memberlist.default.svc.cluster.local.:7946" - - "-metastore.address=dns:///pyroscope-dev-metastore-grpc.default.svc.cluster.local.:9095" - - "-metastore.raft.bootstrap-peers=dns:///pyroscope-dev-metastore-raft.default.svc.cluster.local.:9099" + - "-metastore.raft.bind-address=:9099" + - "-metastore.raft.advertise-address=($HOSTNAME).pyroscope-dev-metastore-headless.default.svc.cluster.local.:9099" + - "-metastore.raft.server-id=($HOSTNAME).pyroscope-dev-metastore-headless.default.svc.cluster.local.:9099" + - "-metastore.raft.bootstrap-peers=dnssrvnoa+_raft._tcp.pyroscope-dev-metastore-headless.default.svc.cluster.local.:9099" + - "-metastore.address=dns:///_grpc._tcp.pyroscope-dev-metastore-headless.default.svc.cluster.local.:9095" - "-config.file=/etc/pyroscope/config.yaml" - "-runtime-config.file=/etc/pyroscope/overrides/overrides.yaml" - "-log.level=debug" diff --git a/operations/pyroscope/helm/pyroscope/templates/deployments-statefulsets.yaml b/operations/pyroscope/helm/pyroscope/templates/deployments-statefulsets.yaml index 5cb92d06a5..5062f049d9 100644 --- a/operations/pyroscope/helm/pyroscope/templates/deployments-statefulsets.yaml +++ b/operations/pyroscope/helm/pyroscope/templates/deployments-statefulsets.yaml @@ -65,8 +65,11 @@ spec: - "-server.http-listen-port={{ $values.service.port }}" - "-memberlist.cluster-label={{ .Release.Namespace }}-{{ include "pyroscope.fullname" .}}" - "-memberlist.join=dns+{{ include "pyroscope.fullname" .}}-memberlist.{{ .Release.Namespace }}.svc{{ .Values.pyroscope.cluster_domain }}:{{ .Values.pyroscope.memberlist.port }}" - - "-metastore.address=dns:///{{ include "pyroscope.fullname" .}}-metastore-grpc.{{ .Release.Namespace }}.svc{{ .Values.pyroscope.cluster_domain }}:9095" - - "-metastore.raft.bootstrap-peers=dns:///{{ include "pyroscope.fullname" .}}-metastore-raft.{{ .Release.Namespace }}.svc{{ .Values.pyroscope.cluster_domain }}:9099" + - "-metastore.raft.bind-address=:9099" + - "-metastore.raft.advertise-address=($HOSTNAME).{{ include "pyroscope.fullname" .}}-metastore-headless.{{ .Release.Namespace }}.svc{{ .Values.pyroscope.cluster_domain }}:9099" + - "-metastore.raft.server-id=($HOSTNAME).{{ include "pyroscope.fullname" .}}-metastore-headless.{{ .Release.Namespace }}.svc{{ .Values.pyroscope.cluster_domain }}:9099" + - "-metastore.raft.bootstrap-peers=dnssrvnoa+_raft._tcp.{{ include "pyroscope.fullname" .}}-metastore-headless.{{ .Release.Namespace }}.svc{{ .Values.pyroscope.cluster_domain }}:9099" + - "-metastore.address=dns:///_grpc._tcp.{{ include "pyroscope.fullname" .}}-metastore-headless.{{ .Release.Namespace }}.svc{{ .Values.pyroscope.cluster_domain }}:9095" - "-config.file=/etc/pyroscope/config.yaml" - "-runtime-config.file=/etc/pyroscope/overrides/overrides.yaml" {{- range $key, $value := $extraArgs }} @@ -74,6 +77,10 @@ spec: {{- end }} {{- with $values.extraEnvVars }} env: + - name: HOSTNAME + valueFrom: + fieldRef: + fieldPath: metadata.name {{- range $key, $value := . }} - name: {{ $key }} {{- if kindIs "map" $value }} diff --git a/operations/pyroscope/helm/pyroscope/templates/service-metastore.yaml b/operations/pyroscope/helm/pyroscope/templates/service-metastore.yaml deleted file mode 100644 index 49bae9c3d2..0000000000 --- a/operations/pyroscope/helm/pyroscope/templates/service-metastore.yaml +++ /dev/null @@ -1,41 +0,0 @@ ---- -apiVersion: v1 -kind: Service -metadata: - name: {{ template "pyroscope.fullname" . }}-metastore-grpc - namespace: {{ .Release.Namespace }} - labels: - {{- include "pyroscope.labels" . | nindent 4 }} -spec: - type: ClusterIP - clusterIP: None - ports: - - name: metastore-grpc - port: 9095 - protocol: TCP - targetPort: 9095 - publishNotReadyAddresses: true - selector: - {{- include "pyroscope.selectorLabels" . | nindent 4 }} - app.kubernetes.io/component: "metastore" - ---- -apiVersion: v1 -kind: Service -metadata: - name: {{ template "pyroscope.fullname" . }}-metastore-raft - namespace: {{ .Release.Namespace }} - labels: - {{- include "pyroscope.labels" . | nindent 4 }} -spec: - type: ClusterIP - clusterIP: None - ports: - - name: metastore-raft - port: 9099 - protocol: TCP - targetPort: 9099 - publishNotReadyAddresses: false - selector: - {{- include "pyroscope.selectorLabels" . | nindent 4 }} - app.kubernetes.io/component: "metastore" diff --git a/operations/pyroscope/helm/pyroscope/templates/services.yaml b/operations/pyroscope/helm/pyroscope/templates/services.yaml index d20d2a5aae..28ff64b84f 100644 --- a/operations/pyroscope/helm/pyroscope/templates/services.yaml +++ b/operations/pyroscope/helm/pyroscope/templates/services.yaml @@ -45,6 +45,17 @@ spec: targetPort: {{ $values.service.port_name }} protocol: TCP name: {{ $values.service.port_name }} + {{- if eq $component "metastore" }} + - name: grpc + port: 9095 + protocol: TCP + targetPort: 9095 + - name: raft + port: 9099 + protocol: TCP + targetPort: 9099 + publishNotReadyAddresses: true + {{- end }} selector: {{- include "pyroscope.selectorLabels" . | nindent 4 }} app.kubernetes.io/component: {{ $component | quote }} diff --git a/pkg/frontend/frontend.go b/pkg/frontend/frontend.go index 03f8c7bb6c..5a2844db5e 100644 --- a/pkg/frontend/frontend.go +++ b/pkg/frontend/frontend.go @@ -21,17 +21,15 @@ import ( "github.com/grafana/dskit/grpcclient" "github.com/grafana/dskit/netutil" "github.com/grafana/dskit/services" + "github.com/grafana/dskit/tenant" "github.com/opentracing/opentracing-go" "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promauto" "go.uber.org/atomic" - "google.golang.org/grpc" - - "github.com/grafana/dskit/tenant" - metastorev1 "github.com/grafana/pyroscope/api/gen/proto/go/metastore/v1" "github.com/grafana/pyroscope/pkg/frontend/frontendpb" + metastoreclient "github.com/grafana/pyroscope/pkg/metastore/client" "github.com/grafana/pyroscope/pkg/querier/stats" "github.com/grafana/pyroscope/pkg/scheduler/schedulerdiscovery" "github.com/grafana/pyroscope/pkg/util/connectgrpc" @@ -96,7 +94,7 @@ type Frontend struct { requests *requestsInProgress frontendpb.UnimplementedFrontendForQuerierServer - metastoreclient metastorev1.MetastoreServiceClient + metastoreclient *metastoreclient.Client } type Limits interface { @@ -137,7 +135,7 @@ type enqueueResult struct { } // NewFrontend creates a new frontend. -func NewFrontend(cfg Config, limits Limits, log log.Logger, reg prometheus.Registerer, metastorecc grpc.ClientConnInterface) (*Frontend, error) { +func NewFrontend(cfg Config, limits Limits, log log.Logger, reg prometheus.Registerer, mc *metastoreclient.Client) (*Frontend, error) { requestsCh := make(chan *frontendRequest) schedulerWorkers, err := newFrontendSchedulerWorkers(cfg, fmt.Sprintf("%s:%d", cfg.Addr, cfg.Port), requestsCh, log, reg) @@ -153,7 +151,7 @@ func NewFrontend(cfg Config, limits Limits, log log.Logger, reg prometheus.Regis schedulerWorkers: schedulerWorkers, schedulerWorkersWatcher: services.NewFailureWatcher(), requests: newRequestsInProgress(), - metastoreclient: metastorev1.NewMetastoreServiceClient(metastorecc), + metastoreclient: mc, } if err != nil { diff --git a/pkg/metastore/client/client.go b/pkg/metastore/client/client.go index aa8e96d49b..10cbe70707 100644 --- a/pkg/metastore/client/client.go +++ b/pkg/metastore/client/client.go @@ -5,7 +5,10 @@ import ( "fmt" "github.com/grafana/dskit/grpcclient" + "github.com/grafana/dskit/services" "google.golang.org/grpc" + + metastorev1 "github.com/grafana/pyroscope/api/gen/proto/go/metastore/v1" ) type Config struct { @@ -25,7 +28,29 @@ func (cfg *Config) Validate() error { return cfg.GRPCClientConfig.Validate() } -func Dial(cfg Config) (*grpc.ClientConn, error) { +type Client struct { + metastorev1.MetastoreServiceClient + service services.Service + conn *grpc.ClientConn + config Config +} + +func New(config Config) (c *Client, err error) { + c = &Client{config: config} + c.conn, err = dial(c.config) + if err != nil { + return nil, err + } + c.MetastoreServiceClient = metastorev1.NewMetastoreServiceClient(c.conn) + c.service = services.NewIdleService(nil, c.stopping) + return c, nil +} + +func (c *Client) stopping(error) error { return c.conn.Close() } + +func (c *Client) Service() services.Service { return c.service } + +func dial(cfg Config) (*grpc.ClientConn, error) { if err := cfg.Validate(); err != nil { return nil, err } @@ -33,7 +58,26 @@ func Dial(cfg Config) (*grpc.ClientConn, error) { if err != nil { return nil, err } - const grpcServiceConfig = `{"loadBalancingPolicy":"round_robin"}` + // TODO: https://github.com/grpc/grpc-proto/blob/master/grpc/service_config/service_config.proto options = append(options, grpc.WithDefaultServiceConfig(grpcServiceConfig)) return grpc.Dial(cfg.MetastoreAddress, options...) + } + +const grpcServiceConfig = `{ + "healthCheckConfig": { + "serviceName": "metastore.v1.MetastoreService.RaftLeader" + }, + "loadBalancingPolicy":"round_robin", + "methodConfig": [{ + "name": [{"service": "metastore.v1.MetastoreService"}], + "waitForReady": true, + "retryPolicy": { + "MaxAttempts": 4, + "InitialBackoff": ".01s", + "MaxBackoff": ".01s", + "BackoffMultiplier": 1.0, + "RetryableStatusCodes": [ "UNAVAILABLE" ] + } + }] +}` diff --git a/pkg/metastore/metastore.go b/pkg/metastore/metastore.go index aaa451c689..838af4242f 100644 --- a/pkg/metastore/metastore.go +++ b/pkg/metastore/metastore.go @@ -20,8 +20,12 @@ import ( "github.com/prometheus/client_golang/prometheus" metastorev1 "github.com/grafana/pyroscope/api/gen/proto/go/metastore/v1" + "github.com/grafana/pyroscope/pkg/metastore/raftleader" + "github.com/grafana/pyroscope/pkg/util/health" ) +const metastoreRaftLeaderHealthServiceName = "metastore.v1.MetastoreService.RaftLeader" + type Config struct { DataDir string `yaml:"data_dir"` Raft RaftConfig `yaml:"raft"` @@ -48,13 +52,10 @@ func (cfg *RaftConfig) RegisterFlags(f *flag.FlagSet) { const prefix = "metastore.raft." f.StringVar(&cfg.Dir, prefix+"dir", "./data-metastore/raft", "") f.Var((*flagext.StringSlice)(&cfg.BootstrapPeers), prefix+"bootstrap-peers", "") - f.StringVar(&cfg.ServerID, prefix+"server-id", "localhost", "") - f.StringVar(&cfg.BindAddress, prefix+"bind-address", ":9099", "") + f.StringVar(&cfg.BindAddress, prefix+"bind-address", "localhost:9099", "") + f.StringVar(&cfg.ServerID, prefix+"server-id", "localhost:9099", "") f.StringVar(&cfg.AdvertiseAddress, prefix+"advertise-address", "localhost:9099", "") f.DurationVar(&cfg.ApplyTimeout, prefix+"apply-timeout", 5*time.Second, "") - if len(cfg.BootstrapPeers) == 0 { - cfg.BootstrapPeers = []string{cfg.AdvertiseAddress + "/" + cfg.ServerID} - } } type Metastore struct { @@ -73,10 +74,11 @@ type Metastore struct { db *boltdb // Raft module. - wal *raftwal.WAL - snapshots *raft.FileSnapshotStore - transport *raft.NetworkTransport - raft *raft.Raft + wal *raftwal.WAL + snapshots *raft.FileSnapshotStore + transport *raft.NetworkTransport + raft *raft.Raft + leaderhealth *raftleader.HealthObserver logStore raft.LogStore stableStore raft.StableStore @@ -87,7 +89,7 @@ type Metastore struct { type Limits interface{} -func New(config Config, limits Limits, logger log.Logger, reg prometheus.Registerer) (*Metastore, error) { +func New(config Config, limits Limits, logger log.Logger, reg prometheus.Registerer, hs health.Service) (*Metastore, error) { m := &Metastore{ config: config, logger: logger, @@ -95,21 +97,12 @@ func New(config Config, limits Limits, logger log.Logger, reg prometheus.Registe limits: limits, db: newDB(config, logger), } + m.leaderhealth = raftleader.NewRaftLeaderHealthObserver(hs, logger) m.state = newMetastoreState(logger, m.db) m.Service = services.NewBasicService(m.starting, m.running, m.stopping) return m, nil } -func (m *Metastore) CheckReady(_ context.Context) error { - if s := m.State(); s != services.Running && s != services.Stopping { - return fmt.Errorf("not ready: %v", s) - } - // TODO(kolesnikovae): - // On boot, get the leader commit index, and report readiness - // only when the local _apply_ index matches one. - return nil -} - func (m *Metastore) Shutdown() error { m.shutdownRaft() m.db.shutdown() @@ -168,9 +161,13 @@ func (m *Metastore) initRaft() (err error) { } config := raft.DefaultConfig() + // TODO: Wrap gokit + // config.Logger config.TrailingLogs = raftTrailingLogs config.SnapshotThreshold = raftSnapshotThreshold config.SnapshotInterval = raftSnapshotInterval + // TODO: We don't need to restore the latest snapshot + // on start, because the FSM is already disk-based. // config.NoSnapshotRestoreOnStart = true config.LocalID = raft.ServerID(m.config.Raft.ServerID) @@ -187,6 +184,7 @@ func (m *Metastore) initRaft() (err error) { } } + m.leaderhealth.Register(m.raft, metastoreRaftLeaderHealthServiceName) return nil } @@ -224,8 +222,12 @@ func (m *Metastore) createRaftDirs() (err error) { } func (m *Metastore) shutdownRaft() { - // If raft has been initialized, try to transfer leadership. if m.raft != nil { + // If raft has been initialized, try to transfer leadership. + // Only after this we remove the leader health observer and + // shutdown the raft. + // There is a chance that client will still be trying to connect + // to this instance, therefore retrying is still required. if err := m.raft.LeadershipTransfer().Error(); err != nil { switch { case errors.Is(err, raft.ErrNotLeader): @@ -236,6 +238,7 @@ func (m *Metastore) shutdownRaft() { _ = level.Error(m.logger).Log("msg", "failed to transfer leadership", "err", err) } } + m.leaderhealth.Deregister(m.raft, metastoreRaftLeaderHealthServiceName) if err := m.raft.Shutdown().Error(); err != nil { _ = level.Error(m.logger).Log("msg", "failed to shutdown raft", "err", err) } diff --git a/pkg/metastore/metastore_bootstrap.go b/pkg/metastore/metastore_bootstrap.go index 2ef632990c..d6649f7a8a 100644 --- a/pkg/metastore/metastore_bootstrap.go +++ b/pkg/metastore/metastore_bootstrap.go @@ -5,13 +5,13 @@ import ( "errors" "fmt" "net" - "net/url" "slices" - "strconv" "strings" "time" + "github.com/go-kit/log" "github.com/go-kit/log/level" + "github.com/grafana/dskit/dns" "github.com/hashicorp/raft" ) @@ -23,12 +23,15 @@ func (m *Metastore) bootstrap() error { if len(peers) == 0 { return fmt.Errorf("no peers found") } - if raft.ServerID(m.config.Raft.ServerID) != peers[0].ID { - _ = level.Info(m.logger).Log("msg", "skipping raft bootstrap", - "local", m.config.Raft.ServerID, - "peers", fmt.Sprint(peers)) + logger := log.With(m.logger, + "server_id", m.config.Raft.ServerID, + "advertise_address", m.config.Raft.AdvertiseAddress, + "peers", fmt.Sprint(peers)) + if raft.ServerAddress(m.config.Raft.AdvertiseAddress) != peers[0].Address { + _ = level.Info(logger).Log("msg", "local state found, skipping raft bootstrap") return nil } + _ = level.Info(logger).Log("msg", "bootstrapping raft") bootstrap := m.raft.BootstrapCluster(raft.Configuration{Servers: peers}) if bootstrapErr := bootstrap.Error(); bootstrapErr != nil { if !errors.Is(bootstrapErr, raft.ErrCantBootstrap) { @@ -39,89 +42,76 @@ func (m *Metastore) bootstrap() error { } func (m *Metastore) bootstrapPeers() ([]raft.Server, error) { - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) - defer cancel() + // If no bootstrap peers are provided, we bootstrap a single-node cluster. + if len(m.config.Raft.BootstrapPeers) == 0 { + return []raft.Server{{ + Suffrage: raft.Voter, + ID: raft.ServerID(m.config.Raft.ServerID), + Address: raft.ServerAddress(m.config.Raft.AdvertiseAddress), + }}, nil + } + // Otherwise, we build the list of peers, resolving the names if needed. + // + // Note that raft requires stable node IDs, therefore we're using + // the node FQDN:port for both purposes: as the identifier and as the + // address. This requires a DNS SRV record lookup without further + // resolution of A records (dnssrvnoa+). + // + // Alternatively, peers may be specified explicitly in the + // "{addr}" format, where the node is the optional node + // identifier. var peers []raft.Server - for _, addr := range m.config.Raft.BootstrapPeers { - parsed, err := peerFromAddress(ctx, addr) - if err != nil { - return nil, err + var resolve []string + for _, peer := range m.config.Raft.BootstrapPeers { + if strings.Contains(peer, "+") { + resolve = append(resolve, peer) + } else { + peers = append(peers, parsePeer(peer)) } - peers = append(peers, parsed...) - } - slices.SortFunc(peers, func(a, b raft.Server) int { - return strings.Compare(string(a.ID), string(b.ID)) - }) - return peers, nil -} - -func peerFromAddress(ctx context.Context, addr string) ([]raft.Server, error) { - if name, found := strings.CutPrefix(addr, "dns+"); found { - return lookupPeers(ctx, name) } - p, err := url.Parse(addr) - if err != nil { - return nil, err - } - if p.Scheme == "dns" { - // dns://[authority]/{host[:port]} - // DNS scheme uses the URL host for authority, and the actual target - // is in the path token, and the path (name) may have a leading slash. - return lookupPeers(ctx, strings.TrimPrefix(p.Path, "/")) + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + prov := dns.NewProvider(m.logger, m.reg, dns.GolangResolverType) + if err := prov.Resolve(ctx, resolve); err != nil { + return nil, fmt.Errorf("failed to resolve bootstrap peers: %w", err) } - return parsePeer(addr) -} - -func parsePeer(raw ...string) ([]raft.Server, error) { - peers := make([]raft.Server, 0, len(raw)) - for _, str := range raw { - // The string may be "{addr}" or "{addr}/{node}". - parts := strings.SplitN(str, "/", 2) - var addr string - var node string - if len(parts) == 2 { - addr = parts[0] - node = parts[1] - } else { - addr = str - } - host, _, err := net.SplitHostPort(addr) - if err != nil { - return nil, err - } - if node == "" { - node = host - } + for _, peer := range prov.Addresses() { peers = append(peers, raft.Server{ Suffrage: raft.Voter, - ID: raft.ServerID(node), - Address: raft.ServerAddress(addr), + ID: raft.ServerID(peer), + Address: raft.ServerAddress(peer), }) } + // Finally, we sort the peers: the first one is to boostrap the cluster. + slices.SortFunc(peers, func(a, b raft.Server) int { + return strings.Compare(string(a.ID), string(b.ID)) + }) return peers, nil } -func lookupPeers(ctx context.Context, addr string) ([]raft.Server, error) { - host, port, err := net.SplitHostPort(addr) +func parsePeer(raw string) raft.Server { + // The string may be "{addr}" or "{addr}/{node_id}". + parts := strings.SplitN(raw, "/", 2) + var addr string + var node string + if len(parts) == 2 { + addr = parts[0] + node = parts[1] + } else { + addr = raw + } + host, _, err := net.SplitHostPort(addr) if err != nil { - host, port = addr, "" + // No port specified. + host = addr } - _, recs, err := net.DefaultResolver.LookupSRV(ctx, "", "", host) - if len(recs) == 0 && err != nil { - return nil, err + if node == "" { + // No node_id specified. + node = host } - var peers []raft.Server - for _, r := range recs { - // The SRV record may have a port, but we prefer the one from the URL. - rPort := port - if rPort == "" { - rPort = strconv.Itoa(int(r.Port)) - } - peers = append(peers, raft.Server{ - Suffrage: raft.Voter, - ID: raft.ServerID(r.Target), - Address: raft.ServerAddress(net.JoinHostPort(r.Target, rPort)), - }) + return raft.Server{ + Suffrage: raft.Voter, + ID: raft.ServerID(node), + Address: raft.ServerAddress(addr), } - return peers, nil } diff --git a/pkg/metastore/metastore_test.go b/pkg/metastore/metastore_test.go deleted file mode 100644 index 208dadb3e5..0000000000 --- a/pkg/metastore/metastore_test.go +++ /dev/null @@ -1,90 +0,0 @@ -package metastore - -import ( - "context" - "math" - "os" - "path/filepath" - "strconv" - "testing" - "time" - - "github.com/go-kit/log" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" - - metastorev1 "github.com/grafana/pyroscope/api/gen/proto/go/metastore/v1" -) - -func Test_Metastore_(t *testing.T) { - peers := []string{ - "localhost:9100/localhost-0", - // "localhost:9101/localhost-1", - // "localhost:9102/localhost-2", - } - - m := newReplica(t, 0, "./testdata/", peers) - time.Sleep(2 * time.Second) - - _, err := m.AddBlock(context.Background(), &metastorev1.AddBlockRequest{ - Block: &metastorev1.BlockMeta{ - Id: "my-block-id-1", - Shard: 0xF0F0, - TenantServices: []*metastorev1.TenantService{ - {TenantId: "a-tenant", Name: "svc-1"}, - }, - }, - }) - assert.NoError(t, err) - - _, err = m.AddBlock(context.Background(), &metastorev1.AddBlockRequest{ - Block: &metastorev1.BlockMeta{ - Id: "my-block-id-2", - Shard: 0xF0F1, - TenantServices: []*metastorev1.TenantService{ - {TenantId: "a-tenant", Name: "svc-1"}, - }, - }, - }) - assert.NoError(t, err) - - t.Log(m.ListBlocksForQuery(context.Background(), &metastorev1.ListBlocksForQueryRequest{ - TenantId: []string{"a-tenant"}, - EndTime: math.MaxInt64, - Query: "{}", - })) - - t.Log("Shutdown") - assert.NoError(t, m.raft.Snapshot().Error()) - assert.NoError(t, m.Shutdown()) - - m = newReplica(t, 0, "./testdata/", peers) - time.Sleep(2 * time.Second) - t.Log(m.ListBlocksForQuery(context.Background(), &metastorev1.ListBlocksForQueryRequest{ - TenantId: []string{"a-tenant"}, - EndTime: math.MaxInt64, - Query: "{}", - })) - assert.NoError(t, m.Shutdown()) -} - -func newReplica(t *testing.T, i int, dir string, peers []string) *Metastore { - dir = filepath.Join(dir, strconv.Itoa(i)) - config := Config{ - DataDir: filepath.Join(dir, "data"), - Raft: RaftConfig{ - BootstrapPeers: peers, - BindAddress: ":910" + strconv.Itoa(i), - AdvertiseAddress: "localhost:910" + strconv.Itoa(i), - ServerID: "localhost-" + strconv.Itoa(i), - Dir: filepath.Join(dir, "raft"), - ApplyTimeout: 5 * time.Second, - }, - } - - logger := log.NewLogfmtLogger(os.Stdout) - m, err := New(config, nil, logger, nil) - require.NoError(t, err) - require.NoError(t, m.starting(context.Background())) - return m -} diff --git a/pkg/metastore/raftleader/raftleader.go b/pkg/metastore/raftleader/raftleader.go new file mode 100644 index 0000000000..2a18d5ba7c --- /dev/null +++ b/pkg/metastore/raftleader/raftleader.go @@ -0,0 +1,109 @@ +package raftleader + +import ( + "sync" + + "github.com/go-kit/log" + "github.com/go-kit/log/level" + "github.com/hashicorp/raft" + "google.golang.org/grpc/health/grpc_health_v1" + + "github.com/grafana/pyroscope/pkg/util/health" +) + +type HealthObserver struct { + server health.Service + logger log.Logger + mu sync.Mutex + registered map[serviceKey]*raftService +} + +func NewRaftLeaderHealthObserver(hs health.Service, logger log.Logger) *HealthObserver { + return &HealthObserver{ + server: hs, + logger: logger, + registered: make(map[serviceKey]*raftService), + } +} + +func (hs *HealthObserver) Register(r *raft.Raft, service string) { + hs.mu.Lock() + defer hs.mu.Unlock() + k := serviceKey{raft: r, service: service} + if _, ok := hs.registered[k]; ok { + return + } + svc := &raftService{ + server: hs.server, + logger: log.With(hs.logger, "service", service), + service: service, + raft: r, + c: make(chan raft.Observation, 1), + stop: make(chan struct{}), + done: make(chan struct{}), + } + _ = level.Debug(svc.logger).Log("msg", "registering health check") + svc.updateStatus() + go svc.run() + svc.observer = raft.NewObserver(svc.c, true, func(o *raft.Observation) bool { + _, ok := o.Data.(raft.LeaderObservation) + return ok + }) + r.RegisterObserver(svc.observer) + hs.registered[k] = svc +} + +func (hs *HealthObserver) Deregister(r *raft.Raft, service string) { + hs.mu.Lock() + k := serviceKey{raft: r, service: service} + svc, ok := hs.registered[k] + delete(hs.registered, k) + hs.mu.Unlock() + if ok { + close(svc.stop) + <-svc.done + } +} + +type serviceKey struct { + raft *raft.Raft + service string +} + +type raftService struct { + server health.Service + logger log.Logger + service string + raft *raft.Raft + observer *raft.Observer + c chan raft.Observation + stop chan struct{} + done chan struct{} +} + +func (svc *raftService) run() { + defer func() { + close(svc.done) + }() + for { + select { + case <-svc.c: + svc.updateStatus() + case <-svc.stop: + _ = level.Debug(svc.logger).Log("msg", "deregistering health check") + // We explicitly remove the service from serving when we stop observing it. + svc.server.SetServingStatus(svc.service, grpc_health_v1.HealthCheckResponse_NOT_SERVING) + svc.raft.DeregisterObserver(svc.observer) + return + } + } +} + +func (svc *raftService) updateStatus() { + status := grpc_health_v1.HealthCheckResponse_NOT_SERVING + if svc.raft.State() == raft.Leader { + status = grpc_health_v1.HealthCheckResponse_SERVING + } + _ = level.Info(svc.logger).Log("msg", "updating health status", "status", status) + svc.server.SetServingStatus(svc.service, status) +} diff --git a/pkg/phlare/modules.go b/pkg/phlare/modules.go index ee5899d35c..a921cbf8c6 100644 --- a/pkg/phlare/modules.go +++ b/pkg/phlare/modules.go @@ -28,6 +28,7 @@ import ( "golang.org/x/net/http2" "golang.org/x/net/http2/h2c" "google.golang.org/genproto/googleapis/api/httpbody" + "google.golang.org/grpc/health/grpc_health_v1" "google.golang.org/protobuf/encoding/protojson" "gopkg.in/yaml.v3" @@ -52,6 +53,7 @@ import ( "github.com/grafana/pyroscope/pkg/usagestats" "github.com/grafana/pyroscope/pkg/util" "github.com/grafana/pyroscope/pkg/util/build" + "github.com/grafana/pyroscope/pkg/util/health" "github.com/grafana/pyroscope/pkg/validation" "github.com/grafana/pyroscope/pkg/validation/exporter" ) @@ -104,7 +106,7 @@ func (f *Phlare) initQueryFrontend() (services.Service, error) { f.Cfg.Frontend.Port = f.Cfg.Server.HTTPListenPort } - frontendSvc, err := frontend.NewFrontend(f.Cfg.Frontend, f.Overrides, log.With(f.logger, "component", "frontend"), f.reg, f.MetastoreClientConn) + frontendSvc, err := frontend.NewFrontend(f.Cfg.Frontend, f.Overrides, log.With(f.logger, "component", "frontend"), f.reg, f.MetastoreClient) if err != nil { return nil, err } @@ -463,6 +465,10 @@ func (f *Phlare) initServer() (services.Service, error) { f.Server = serv + healthService := health.NewGRPCHealthService() + grpc_health_v1.RegisterHealthServer(f.Server.GRPC, healthService) + f.health = healthService + servicesToWaitFor := func() []services.Service { svs := []services.Service(nil) for m, s := range f.serviceMap { @@ -554,7 +560,7 @@ func (f *Phlare) initAdmin() (services.Service, error) { } func (f *Phlare) initMetastore() (services.Service, error) { - m, err := metastore.New(f.Cfg.Metastore, nil, f.logger, f.reg) + m, err := metastore.New(f.Cfg.Metastore, nil, log.With(f.logger, "component", "metastore"), f.reg, f.health) if err != nil { return nil, err } @@ -563,16 +569,12 @@ func (f *Phlare) initMetastore() (services.Service, error) { } func (f *Phlare) initMetastoreClient() (services.Service, error) { - cc, err := metastoreclient.Dial(f.Cfg.MetastoreClient) + mc, err := metastoreclient.New(f.Cfg.MetastoreClient) if err != nil { return nil, err } - f.MetastoreClientConn = cc - svc := services.NewIdleService( - func(_ context.Context) error { return nil }, - func(_ error) error { return cc.Close() }, - ) - return svc, nil + f.MetastoreClient = mc + return mc.Service(), nil } type statusService struct { diff --git a/pkg/phlare/phlare.go b/pkg/phlare/phlare.go index 28b9f7da4b..f2c8b65672 100644 --- a/pkg/phlare/phlare.go +++ b/pkg/phlare/phlare.go @@ -36,7 +36,6 @@ import ( "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/common/version" "github.com/samber/lo" - "google.golang.org/grpc" "github.com/grafana/pyroscope/pkg/api" apiversion "github.com/grafana/pyroscope/pkg/api/version" @@ -62,6 +61,7 @@ import ( "github.com/grafana/pyroscope/pkg/usagestats" "github.com/grafana/pyroscope/pkg/util" "github.com/grafana/pyroscope/pkg/util/cli" + "github.com/grafana/pyroscope/pkg/util/health" "github.com/grafana/pyroscope/pkg/validation" "github.com/grafana/pyroscope/pkg/validation/exporter" ) @@ -222,25 +222,25 @@ type Phlare struct { logger log.Logger reg prometheus.Registerer tracer io.Closer + health health.Service ModuleManager *modules.Manager serviceMap map[string]services.Service deps map[string][]string - API *api.API - Server *server.Server - SignalHandler *signals.Handler - MemberlistKV *memberlist.KVInitService - ring *ring.Ring - usageReport *usagestats.Reporter - RuntimeConfig *runtimeconfig.Manager - Overrides *validation.Overrides - Compactor *compactor.MultitenantCompactor - admin *operations.Admin - versions *apiversion.Service - serviceManager *services.Manager - - MetastoreClientConn *grpc.ClientConn + API *api.API + Server *server.Server + SignalHandler *signals.Handler + MemberlistKV *memberlist.KVInitService + ring *ring.Ring + MetastoreClient *metastoreclient.Client + usageReport *usagestats.Reporter + RuntimeConfig *runtimeconfig.Manager + Overrides *validation.Overrides + Compactor *compactor.MultitenantCompactor + admin *operations.Admin + versions *apiversion.Service + serviceManager *services.Manager TenantLimits validation.TenantLimits @@ -260,6 +260,7 @@ func New(cfg Config) (*Phlare, error) { Cfg: cfg, logger: logger, reg: prometheus.DefaultRegisterer, + health: health.NoOpService, } if err := cfg.Validate(); err != nil { return nil, err @@ -316,7 +317,7 @@ func (f *Phlare) setupModuleManager() error { mm.RegisterModule(TenantSettings, f.initTenantSettings) mm.RegisterModule(AdHocProfiles, f.initAdHocProfiles) mm.RegisterModule(Metastore, f.initMetastore) - mm.RegisterModule(MetastoreClient, f.initMetastoreClient) + mm.RegisterModule(MetastoreClient, f.initMetastoreClient, modules.UserInvisibleModule) // Add dependencies deps := map[string][]string{ diff --git a/pkg/util/health/health.go b/pkg/util/health/health.go new file mode 100644 index 0000000000..675596e1bf --- /dev/null +++ b/pkg/util/health/health.go @@ -0,0 +1,33 @@ +package health + +import ( + "github.com/grafana/dskit/services" + "google.golang.org/grpc/health" + "google.golang.org/grpc/health/grpc_health_v1" +) + +type Service interface { + SetServingStatus(string, grpc_health_v1.HealthCheckResponse_ServingStatus) +} + +type noopService struct{} + +var NoOpService = noopService{} + +func (noopService) SetServingStatus(string, grpc_health_v1.HealthCheckResponse_ServingStatus) {} + +func NewGRPCHealthService() *GRPCHealthService { + s := health.NewServer() + return &GRPCHealthService{ + Server: s, + Service: services.NewIdleService(nil, func(error) error { + s.Shutdown() + return nil + }), + } +} + +type GRPCHealthService struct { + services.Service + *health.Server +}