From 8f69eb8f69d929e6ca6f49899b889f0288c05bd8 Mon Sep 17 00:00:00 2001 From: Rawlin Peters Date: Wed, 21 Jul 2021 14:11:14 -0600 Subject: [PATCH] TM: do not populate delivery service disabled locations The disabledLocations array is an unnecessary optimization for Traffic Router, and it isn't always populated properly in certain cases, leading to service outages. Therefore, never populate it, but keep the field around for compatibility purposes. --- CHANGELOG.md | 1 + traffic_monitor/datareq/datareq_test.go | 8 +--- traffic_monitor/ds/stat.go | 9 +---- traffic_monitor/health/cache.go | 51 +++--------------------- traffic_monitor/health/cache_test.go | 11 +++++ traffic_monitor/manager/statecombiner.go | 18 --------- 6 files changed, 20 insertions(+), 78 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index efcd655521..a8ca05b54a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -63,6 +63,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - [#2471](https://github.com/apache/trafficcontrol/issues/2471) - A PR check to ensure added db migration file is the latest. - [#5609](https://github.com/apache/trafficcontrol/issues/5609) - Fixed GET /servercheck filter for an extra query param. - [#5954](https://github.com/apache/trafficcontrol/issues/5954) - Traffic Ops HTTP response write errors are ignored +- [#6048](https://github.com/apache/trafficcontrol/issues/6048) - TM sometimes sets cachegroups to unavailable even though all caches are online - [#5288](https://github.com/apache/trafficcontrol/issues/5288) - Fixed the ability to create and update a server with MTU value >= 1280. - [#5284](https://github.com/apache/trafficcontrol/issues/5284) - Fixed error message when creating a server with non-existent profile - Fixed a NullPointerException in TR when a client passes a null SNI hostname in a TLS request diff --git a/traffic_monitor/datareq/datareq_test.go b/traffic_monitor/datareq/datareq_test.go index 6adfe116e9..7ed98a2d1d 100644 --- a/traffic_monitor/datareq/datareq_test.go +++ b/traffic_monitor/datareq/datareq_test.go @@ -79,14 +79,8 @@ func getMockLastHealthTimes() map[tc.CacheName]time.Duration { } func getMockCRStatesDeliveryService() tc.CRStatesDeliveryService { - numCGs := 10 - disabledLocations := []tc.CacheGroupName{} - for i := 0; i < numCGs; i++ { - disabledLocations = append(disabledLocations, tc.CacheGroupName(randStr())) - } - return tc.CRStatesDeliveryService{ - DisabledLocations: disabledLocations, + DisabledLocations: []tc.CacheGroupName{}, IsAvailable: randBool(), } } diff --git a/traffic_monitor/ds/stat.go b/traffic_monitor/ds/stat.go index 3b6076a9e0..56c88591a3 100644 --- a/traffic_monitor/ds/stat.go +++ b/traffic_monitor/ds/stat.go @@ -130,18 +130,13 @@ func addAvailableData(dsStats *dsdata.Stats, crStates tc.CRStates, serverCachegr } // TODO move to its own func? - for dsName, ds := range crStates.DeliveryService { + for dsName := range crStates.DeliveryService { stat, ok := dsStats.DeliveryService[dsName] if !ok { log.Infof("CreateStats not adding disabledLocations for '%s': not found in Stats\n", dsName) continue // TODO log warning? Error? } - - // TODO determine if a deep copy is necessary - stat.CommonStats.CachesDisabled = make([]string, len(ds.DisabledLocations), len(ds.DisabledLocations)) - for i, v := range ds.DisabledLocations { - stat.CommonStats.CachesDisabled[i] = string(v) - } + stat.CommonStats.CachesDisabled = make([]string, 0) } } diff --git a/traffic_monitor/health/cache.go b/traffic_monitor/health/cache.go index efcab8f4b0..30889d51a3 100644 --- a/traffic_monitor/health/cache.go +++ b/traffic_monitor/health/cache.go @@ -385,15 +385,10 @@ func CalcAvailability( localCacheStatuses[result.ID] = availStatus } - calculateDeliveryServiceState(toData.DeliveryServiceServers, localStates, toData) + calculateDeliveryServiceState(toData.DeliveryServiceServers, localStates) localCacheStatusThreadsafe.Set(localCacheStatuses) } -func setErr(newResult *cache.Result, err error) { - newResult.Error = err - newResult.Available = false -} - // ExceedsThresholdMsg returns a human-readable message for why the given value exceeds the threshold. It does NOT check whether the value actually exceeds the threshold; call `InThreshold` to check first. func exceedsThresholdMsg(stat string, threshold tc.HealthThreshold, val float64) string { switch threshold.Comparator { @@ -435,52 +430,16 @@ func eventDesc(status tc.CacheStatus, message string) string { } //calculateDeliveryServiceState calculates the state of delivery services from the new cache state data `cacheState` and the CRConfig data `deliveryServiceServers` and puts the calculated state in the outparam `deliveryServiceStates` -func calculateDeliveryServiceState(deliveryServiceServers map[tc.DeliveryServiceName][]tc.CacheName, states peer.CRStatesThreadsafe, toData todata.TOData) { - cacheStates := states.GetCaches() - +func calculateDeliveryServiceState(deliveryServiceServers map[tc.DeliveryServiceName][]tc.CacheName, states peer.CRStatesThreadsafe) { deliveryServices := states.GetDeliveryServices() for deliveryServiceName, deliveryServiceState := range deliveryServices { if _, ok := deliveryServiceServers[deliveryServiceName]; !ok { log.Infof("CRConfig does not have delivery service %s, but traffic monitor poller does; skipping\n", deliveryServiceName) continue } - deliveryServiceState.DisabledLocations = getDisabledLocations(deliveryServiceName, toData.DeliveryServiceServers[deliveryServiceName], cacheStates, toData.ServerCachegroups) + // NOTE: DisabledLocations is always empty, and it's important that it isn't nil, so it serialises to the JSON `[]` instead of `null`. + // It's no longer populated due to it being an unnecessary optimization for Traffic Router, but the field is kept for compatibility. + deliveryServiceState.DisabledLocations = []tc.CacheGroupName{} states.SetDeliveryService(deliveryServiceName, deliveryServiceState) } } - -func getDisabledLocations(deliveryService tc.DeliveryServiceName, deliveryServiceServers []tc.CacheName, cacheStates map[tc.CacheName]tc.IsAvailable, serverCacheGroups map[tc.CacheName]tc.CacheGroupName) []tc.CacheGroupName { - disabledLocations := []tc.CacheGroupName{} // it's important this isn't nil, so it serialises to the JSON `[]` instead of `null` - dsCacheStates := getDeliveryServiceCacheAvailability(cacheStates, deliveryServiceServers) - dsCachegroupsAvailable := getDeliveryServiceCachegroupAvailability(dsCacheStates, serverCacheGroups) - for cg, avail := range dsCachegroupsAvailable { - if avail { - continue - } - disabledLocations = append(disabledLocations, cg) - } - return disabledLocations -} - -func getDeliveryServiceCacheAvailability(cacheStates map[tc.CacheName]tc.IsAvailable, deliveryServiceServers []tc.CacheName) map[tc.CacheName]tc.IsAvailable { - dsCacheStates := map[tc.CacheName]tc.IsAvailable{} - for _, server := range deliveryServiceServers { - dsCacheStates[server] = cacheStates[tc.CacheName(server)] - } - return dsCacheStates -} - -func getDeliveryServiceCachegroupAvailability(dsCacheStates map[tc.CacheName]tc.IsAvailable, serverCachegroups map[tc.CacheName]tc.CacheGroupName) map[tc.CacheGroupName]bool { - cgAvail := map[tc.CacheGroupName]bool{} - for cache, available := range dsCacheStates { - cg, ok := serverCachegroups[cache] - if !ok { - log.Errorf("cache %v not found in cachegroups!\n", cache) - continue - } - if _, ok := cgAvail[cg]; !ok || available.IsAvailable { - cgAvail[cg] = available.IsAvailable - } - } - return cgAvail -} diff --git a/traffic_monitor/health/cache_test.go b/traffic_monitor/health/cache_test.go index f87f7443d0..013847b8ff 100644 --- a/traffic_monitor/health/cache_test.go +++ b/traffic_monitor/health/cache_test.go @@ -417,6 +417,7 @@ func TestCalcAvailabilityThresholds(t *testing.T) { localCacheStatusThreadsafe := threadsafe.NewCacheAvailableStatus() localStates := peer.NewCRStatesThreadsafe() + localStates.SetDeliveryService("myDS", tc.CRStatesDeliveryService{}) events := NewThreadsafeEvents(200) // test that a normal stat poll over the kbps threshold marks down @@ -433,6 +434,16 @@ func TestCalcAvailabilityThresholds(t *testing.T) { CalcAvailability(results, pollerName, statResultHistory, mc, toData, localCacheStatusThreadsafe, localStates, events, config.Both) + // ensure that the DisabledLocations is an empty, non-nil slice + for _, ds := range localStates.GetDeliveryServices() { + if ds.DisabledLocations == nil { + t.Error("expected: non-nil DisabledLocations, actual: nil") + } + if len(ds.DisabledLocations) > 0 { + t.Errorf("expected: empty DisabledLocations, actual: %d", len(ds.DisabledLocations)) + } + } + localCacheStatuses := localCacheStatusThreadsafe.Get() localCacheStatus, ok := localCacheStatuses[result.ID] if !ok { diff --git a/traffic_monitor/manager/statecombiner.go b/traffic_monitor/manager/statecombiner.go index 694913942f..922b69689f 100644 --- a/traffic_monitor/manager/statecombiner.go +++ b/traffic_monitor/manager/statecombiner.go @@ -21,7 +21,6 @@ package manager import ( "fmt" - "sort" "strings" "time" @@ -148,7 +147,6 @@ func combineDSState( if localDeliveryService.IsAvailable { deliveryService.IsAvailable = true } - deliveryService.DisabledLocations = localDeliveryService.DisabledLocations for peerName, iPeerStates := range peerStates.GetCrstates() { peerDeliveryService, ok := iPeerStates.DeliveryService[deliveryServiceName] @@ -159,7 +157,6 @@ func combineDSState( if peerDeliveryService.IsAvailable { deliveryService.IsAvailable = true } - deliveryService.DisabledLocations = intersection(deliveryService.DisabledLocations, peerDeliveryService.DisabledLocations) } combinedStates.SetDeliveryService(deliveryServiceName, deliveryService) } @@ -218,18 +215,3 @@ type CacheGroupNameSlice []tc.CacheGroupName func (p CacheGroupNameSlice) Len() int { return len(p) } func (p CacheGroupNameSlice) Less(i, j int) bool { return p[i] < p[j] } func (p CacheGroupNameSlice) Swap(i, j int) { p[i], p[j] = p[j], p[i] } - -// intersection returns strings in both a and b. -// Note this modifies a and b. Specifically, it sorts them. If that isn't acceptable, pass copies of your real data. -func intersection(a []tc.CacheGroupName, b []tc.CacheGroupName) []tc.CacheGroupName { - sort.Sort(CacheGroupNameSlice(a)) - sort.Sort(CacheGroupNameSlice(b)) - c := []tc.CacheGroupName{} // important to initialize, so JSON is `[]` not `null` - for _, s := range a { - i := sort.Search(len(b), func(i int) bool { return b[i] >= s }) - if i < len(b) && b[i] == s { - c = append(c, s) - } - } - return c -}