From 9a9a242c7f14213b0d5df14584a0ed1c7c4d1181 Mon Sep 17 00:00:00 2001 From: Manan Gupta Date: Tue, 24 Sep 2024 10:50:16 +0530 Subject: [PATCH 1/3] feat: add metric to show the count of errant GTIDs Signed-off-by: Manan Gupta --- go/mysql/replication/mysql56_gtid_set.go | 15 ++++++ go/mysql/replication/mysql56_gtid_set_test.go | 46 +++++++++++++++++++ go/vt/vtorc/inst/instance_dao.go | 9 ++++ 3 files changed, 70 insertions(+) diff --git a/go/mysql/replication/mysql56_gtid_set.go b/go/mysql/replication/mysql56_gtid_set.go index 348af5b5274..b11318bfa4d 100644 --- a/go/mysql/replication/mysql56_gtid_set.go +++ b/go/mysql/replication/mysql56_gtid_set.go @@ -688,3 +688,18 @@ func Subtract(lhs, rhs string) (string, error) { diffSet := lhsSet.Difference(rhsSet) return diffSet.String(), nil } + +// GTIDCount returns the number of GTIDs in a GTID set. +func GTIDCount(gtidStr string) (int64, error) { + gtidSet, err := ParseMysql56GTIDSet(gtidStr) + if err != nil { + return 0, err + } + var count int64 + for _, intervals := range gtidSet { + for _, intvl := range intervals { + count = count + intvl.end - intvl.start + 1 + } + } + return count, nil +} diff --git a/go/mysql/replication/mysql56_gtid_set_test.go b/go/mysql/replication/mysql56_gtid_set_test.go index 323baae3885..e5accc46120 100644 --- a/go/mysql/replication/mysql56_gtid_set_test.go +++ b/go/mysql/replication/mysql56_gtid_set_test.go @@ -704,3 +704,49 @@ func BenchmarkMySQL56GTIDParsing(b *testing.B) { } } } + +func TestGTIDCount(t *testing.T) { + tests := []struct { + name string + gtidStr string + wantCount int64 + wantErr string + }{ + { + name: "Empty GTID String", + gtidStr: "", + wantCount: 0, + }, { + name: "Single GTID", + gtidStr: "00010203-0405-0607-0809-0a0b0c0d0e0f:12", + wantCount: 1, + }, { + name: "Single GTID Interval", + gtidStr: "00010203-0405-0607-0809-0a0b0c0d0e0f:1-5", + wantCount: 5, + }, { + name: "Single UUID", + gtidStr: "00010203-0405-0607-0809-0a0b0c0d0e0f:1-5:11-20", + wantCount: 15, + 
}, { + name: "Multiple UUIDs", + gtidStr: "00010203-0405-0607-0809-0a0b0c0d0e0f:1-5:10-20,00010203-0405-0607-0809-0a0b0c0d0eff:1-5:50", + wantCount: 22, + }, { + name: "Parsing error", + gtidStr: "incorrect set", + wantErr: "invalid MySQL 5.6 GTID set", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + count, err := GTIDCount(tt.gtidStr) + require.EqualValues(t, tt.wantCount, count) + if tt.wantErr != "" { + require.ErrorContains(t, err, tt.wantErr) + } else { + require.NoError(t, err) + } + }) + } +} diff --git a/go/vt/vtorc/inst/instance_dao.go b/go/vt/vtorc/inst/instance_dao.go index 4e401606a95..0fafa245321 100644 --- a/go/vt/vtorc/inst/instance_dao.go +++ b/go/vt/vtorc/inst/instance_dao.go @@ -61,6 +61,7 @@ var forgetAliases *cache.Cache var ( readTopologyInstanceCounter = stats.NewCounter("InstanceReadTopology", "Number of times an instance was read from the topology") readInstanceCounter = stats.NewCounter("InstanceRead", "Number of times an instance was read") + errantGTIDCounts = stats.NewGaugesWithSingleLabel("ErrantGTIDCounts", "Number of errant GTIDs in a vttablet", "TabletAlias") backendWrites = collection.CreateOrReturnCollection("BACKEND_WRITES") writeBufferLatency = stopwatch.NewNamedStopwatch() ) @@ -378,6 +379,11 @@ Cleanup: redactedPrimaryExecutedGtidSet.RemoveUUID(instance.SourceUUID) instance.GtidErrant, err = replication.Subtract(redactedExecutedGtidSet.String(), redactedPrimaryExecutedGtidSet.String()) + if err == nil { + var gtidCount int64 + gtidCount, err = replication.GTIDCount(instance.GtidErrant) + errantGTIDCounts.Set(tabletAlias, gtidCount) + } } } } @@ -1036,6 +1042,9 @@ func ForgetInstance(tabletAlias string) error { forgetAliases.Set(tabletAlias, true, cache.DefaultExpiration) log.Infof("Forgetting: %v", tabletAlias) + // Remove this tablet from errant GTID count metric. + errantGTIDCounts.Reset(tabletAlias) + // Delete from the 'vitess_tablet' table. 
_, err := db.ExecVTOrc(` delete From 8adfbcff0c958a7c1b360146ee1f50aa4ae21460 Mon Sep 17 00:00:00 2001 From: Manan Gupta Date: Tue, 24 Sep 2024 11:12:55 +0530 Subject: [PATCH 2/3] test: add test for errant gtid counts Signed-off-by: Manan Gupta --- go/test/endtoend/vtorc/api/api_test.go | 15 +++++++++++++-- go/test/endtoend/vtorc/utils/utils.go | 20 ++++++++++---------- 2 files changed, 23 insertions(+), 12 deletions(-) diff --git a/go/test/endtoend/vtorc/api/api_test.go b/go/test/endtoend/vtorc/api/api_test.go index 174ee5ea914..0d78571f893 100644 --- a/go/test/endtoend/vtorc/api/api_test.go +++ b/go/test/endtoend/vtorc/api/api_test.go @@ -268,11 +268,13 @@ func TestAPIEndpoints(t *testing.T) { assert.Equal(t, "Filtering by shard without keyspace isn't supported\n", resp) // Also verify that the metric for errant GTIDs is reporting the correct count. - waitForErrantGTIDCount(t, vtorc, 1) + waitForErrantGTIDTabletCount(t, vtorc, 1) + // Now we check the errant GTID count for the tablet + verifyErrantGTIDCount(t, vtorc, replica.Alias, 1) }) } -func waitForErrantGTIDCount(t *testing.T, vtorc *cluster.VTOrcProcess, errantGTIDCountWanted int) { +func waitForErrantGTIDTabletCount(t *testing.T, vtorc *cluster.VTOrcProcess, errantGTIDCountWanted int) { timeout := time.After(15 * time.Second) for { select { @@ -293,3 +295,12 @@ func waitForErrantGTIDCount(t *testing.T, vtorc *cluster.VTOrcProcess, errantGTI } } } + +func verifyErrantGTIDCount(t *testing.T, vtorc *cluster.VTOrcProcess, tabletAlias string, countWanted int) { + vars := vtorc.GetVars() + errantGTIDCounts := vars["ErrantGTIDCounts"].(map[string]interface{}) + gtidCountVal, isPresent := errantGTIDCounts[tabletAlias] + require.True(t, isPresent, "Tablet %s not found in errant GTID counts", tabletAlias) + gtidCount := utils.GetIntFromValue(gtidCountVal) + require.EqualValues(t, countWanted, gtidCount, "Tablet %s has %d errant GTIDs, wanted %d", tabletAlias, gtidCount, countWanted) +} diff --git 
a/go/test/endtoend/vtorc/utils/utils.go b/go/test/endtoend/vtorc/utils/utils.go index 63500377f47..a4e3be3486e 100644 --- a/go/test/endtoend/vtorc/utils/utils.go +++ b/go/test/endtoend/vtorc/utils/utils.go @@ -998,7 +998,7 @@ func WaitForSuccessfulRecoveryCount(t *testing.T, vtorcInstance *cluster.VTOrcPr for time.Since(startTime) < timeout { vars := vtorcInstance.GetVars() successfulRecoveriesMap := vars["SuccessfulRecoveries"].(map[string]interface{}) - successCount := getIntFromValue(successfulRecoveriesMap[recoveryName]) + successCount := GetIntFromValue(successfulRecoveriesMap[recoveryName]) if successCount == countExpected { return } @@ -1006,7 +1006,7 @@ func WaitForSuccessfulRecoveryCount(t *testing.T, vtorcInstance *cluster.VTOrcPr } vars := vtorcInstance.GetVars() successfulRecoveriesMap := vars["SuccessfulRecoveries"].(map[string]interface{}) - successCount := getIntFromValue(successfulRecoveriesMap[recoveryName]) + successCount := GetIntFromValue(successfulRecoveriesMap[recoveryName]) assert.EqualValues(t, countExpected, successCount) } @@ -1019,7 +1019,7 @@ func WaitForSuccessfulPRSCount(t *testing.T, vtorcInstance *cluster.VTOrcProcess for time.Since(startTime) < timeout { vars := vtorcInstance.GetVars() prsCountsMap := vars["PlannedReparentCounts"].(map[string]interface{}) - successCount := getIntFromValue(prsCountsMap[mapKey]) + successCount := GetIntFromValue(prsCountsMap[mapKey]) if successCount == countExpected { return } @@ -1027,7 +1027,7 @@ func WaitForSuccessfulPRSCount(t *testing.T, vtorcInstance *cluster.VTOrcProcess } vars := vtorcInstance.GetVars() prsCountsMap := vars["PlannedReparentCounts"].(map[string]interface{}) - successCount := getIntFromValue(prsCountsMap[mapKey]) + successCount := GetIntFromValue(prsCountsMap[mapKey]) assert.EqualValues(t, countExpected, successCount) } @@ -1040,7 +1040,7 @@ func WaitForSuccessfulERSCount(t *testing.T, vtorcInstance *cluster.VTOrcProcess for time.Since(startTime) < timeout { vars := 
vtorcInstance.GetVars() ersCountsMap := vars["EmergencyReparentCounts"].(map[string]interface{}) - successCount := getIntFromValue(ersCountsMap[mapKey]) + successCount := GetIntFromValue(ersCountsMap[mapKey]) if successCount == countExpected { return } @@ -1048,7 +1048,7 @@ func WaitForSuccessfulERSCount(t *testing.T, vtorcInstance *cluster.VTOrcProcess } vars := vtorcInstance.GetVars() ersCountsMap := vars["EmergencyReparentCounts"].(map[string]interface{}) - successCount := getIntFromValue(ersCountsMap[mapKey]) + successCount := GetIntFromValue(ersCountsMap[mapKey]) assert.EqualValues(t, countExpected, successCount) } @@ -1067,10 +1067,10 @@ func CheckMetricExists(t *testing.T, vtorcInstance *cluster.VTOrcProcess, metric assert.Contains(t, metrics, metricName) } -// getIntFromValue is a helper function to get an integer from the given value. +// GetIntFromValue is a helper function to get an integer from the given value. // If it is convertible to a float, then we round the number to the nearest integer. // If the value is not numeric at all, we return 0. 
-func getIntFromValue(val any) int { +func GetIntFromValue(val any) int { value := reflect.ValueOf(val) if value.CanFloat() { return int(math.Round(value.Float())) @@ -1091,7 +1091,7 @@ func WaitForDetectedProblems(t *testing.T, vtorcInstance *cluster.VTOrcProcess, for time.Since(startTime) < timeout { vars := vtorcInstance.GetVars() problems := vars["DetectedProblems"].(map[string]interface{}) - actual := getIntFromValue(problems[key]) + actual := GetIntFromValue(problems[key]) if actual == expect { return } @@ -1101,7 +1101,7 @@ func WaitForDetectedProblems(t *testing.T, vtorcInstance *cluster.VTOrcProcess, vars := vtorcInstance.GetVars() problems := vars["DetectedProblems"].(map[string]interface{}) actual, ok := problems[key] - actual = getIntFromValue(actual) + actual = GetIntFromValue(actual) assert.True(t, ok, "The metric DetectedProblems[%s] should exist but does not (all problems: %+v)", From bd70d2d5ed7ed018abb4177c4bc3f7c42f3548fb Mon Sep 17 00:00:00 2001 From: Manan Gupta Date: Tue, 24 Sep 2024 11:21:37 +0530 Subject: [PATCH 3/3] feat: add a small summary section Signed-off-by: Manan Gupta --- changelog/21.0/21.0.0/summary.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/changelog/21.0/21.0.0/summary.md b/changelog/21.0/21.0.0/summary.md index a9810194514..e677db9e577 100644 --- a/changelog/21.0/21.0.0/summary.md +++ b/changelog/21.0/21.0.0/summary.md @@ -16,6 +16,7 @@ - **[VTGate Tablet Balancer](#tablet-balancer)** - **[Query Timeout Override](#query-timeout)** - **[Dynamic VReplication Configuration](#dynamic-vreplication-configuration)** + - **[Errant GTIDs Count Metric](#errant-gtid-metric)** ## Major Changes @@ -137,3 +138,6 @@ Currently many of the configuration options for VReplication Workflows are vttab requires restarts of vttablets. We now allow these to be overridden while creating a workflow or dynamically once the workflow is in progress. See https://github.com/vitessio/vitess/pull/16583 for details. 
+### <a id="errant-gtid-metric"/>Errant GTIDs Count Metric +A new metric called `ErrantGTIDCounts` has been added to the `VTOrc` component. +This metric reports the number of errant GTIDs on each tablet, keyed by tablet alias.