Skip to content

Commit

Permalink
TM: do not populate delivery service disabled locations (#6051)
Browse files Browse the repository at this point in the history
The disabledLocations array is an unnecessary optimization for Traffic
Router, and it isn't always populated properly in certain cases, leading
to service outages. Therefore, never populate it, but keep the field
around for compatibility purposes.
  • Loading branch information
rawlinp authored Jul 29, 2021
1 parent 82a479d commit ed8d4af
Show file tree
Hide file tree
Showing 6 changed files with 20 additions and 78 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- [#2471](https://github.com/apache/trafficcontrol/issues/2471) - A PR check to ensure added db migration file is the latest.
- [#5609](https://github.com/apache/trafficcontrol/issues/5609) - Fixed GET /servercheck filter for an extra query param.
- [#5954](https://github.com/apache/trafficcontrol/issues/5954) - Traffic Ops HTTP response write errors are ignored
- [#6048](https://github.com/apache/trafficcontrol/issues/6048) - TM sometimes sets cachegroups to unavailable even though all caches are online
- [#5288](https://github.com/apache/trafficcontrol/issues/5288) - Fixed the ability to create and update a server with MTU value >= 1280.
- [#5284](https://github.com/apache/trafficcontrol/issues/5284) - Fixed error message when creating a server with non-existent profile
- Fixed a NullPointerException in TR when a client passes a null SNI hostname in a TLS request
Expand Down
8 changes: 1 addition & 7 deletions traffic_monitor/datareq/datareq_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,14 +79,8 @@ func getMockLastHealthTimes() map[tc.CacheName]time.Duration {
}

func getMockCRStatesDeliveryService() tc.CRStatesDeliveryService {
numCGs := 10
disabledLocations := []tc.CacheGroupName{}
for i := 0; i < numCGs; i++ {
disabledLocations = append(disabledLocations, tc.CacheGroupName(randStr()))
}

return tc.CRStatesDeliveryService{
DisabledLocations: disabledLocations,
DisabledLocations: []tc.CacheGroupName{},
IsAvailable: randBool(),
}
}
Expand Down
9 changes: 2 additions & 7 deletions traffic_monitor/ds/stat.go
Original file line number Diff line number Diff line change
Expand Up @@ -130,18 +130,13 @@ func addAvailableData(dsStats *dsdata.Stats, crStates tc.CRStates, serverCachegr
}

// TODO move to its own func?
for dsName, ds := range crStates.DeliveryService {
for dsName := range crStates.DeliveryService {
stat, ok := dsStats.DeliveryService[dsName]
if !ok {
log.Infof("CreateStats not adding disabledLocations for '%s': not found in Stats\n", dsName)
continue // TODO log warning? Error?
}

// TODO determine if a deep copy is necessary
stat.CommonStats.CachesDisabled = make([]string, len(ds.DisabledLocations), len(ds.DisabledLocations))
for i, v := range ds.DisabledLocations {
stat.CommonStats.CachesDisabled[i] = string(v)
}
stat.CommonStats.CachesDisabled = make([]string, 0)
}
}

Expand Down
51 changes: 5 additions & 46 deletions traffic_monitor/health/cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -385,15 +385,10 @@ func CalcAvailability(

localCacheStatuses[result.ID] = availStatus
}
calculateDeliveryServiceState(toData.DeliveryServiceServers, localStates, toData)
calculateDeliveryServiceState(toData.DeliveryServiceServers, localStates)
localCacheStatusThreadsafe.Set(localCacheStatuses)
}

func setErr(newResult *cache.Result, err error) {
newResult.Error = err
newResult.Available = false
}

// ExceedsThresholdMsg returns a human-readable message for why the given value exceeds the threshold. It does NOT check whether the value actually exceeds the threshold; call `InThreshold` to check first.
func exceedsThresholdMsg(stat string, threshold tc.HealthThreshold, val float64) string {
switch threshold.Comparator {
Expand Down Expand Up @@ -435,52 +430,16 @@ func eventDesc(status tc.CacheStatus, message string) string {
}

//calculateDeliveryServiceState calculates the state of delivery services from the new cache state data `cacheState` and the CRConfig data `deliveryServiceServers` and puts the calculated state in the outparam `deliveryServiceStates`
func calculateDeliveryServiceState(deliveryServiceServers map[tc.DeliveryServiceName][]tc.CacheName, states peer.CRStatesThreadsafe, toData todata.TOData) {
cacheStates := states.GetCaches()

func calculateDeliveryServiceState(deliveryServiceServers map[tc.DeliveryServiceName][]tc.CacheName, states peer.CRStatesThreadsafe) {
deliveryServices := states.GetDeliveryServices()
for deliveryServiceName, deliveryServiceState := range deliveryServices {
if _, ok := deliveryServiceServers[deliveryServiceName]; !ok {
log.Infof("CRConfig does not have delivery service %s, but traffic monitor poller does; skipping\n", deliveryServiceName)
continue
}
deliveryServiceState.DisabledLocations = getDisabledLocations(deliveryServiceName, toData.DeliveryServiceServers[deliveryServiceName], cacheStates, toData.ServerCachegroups)
// NOTE: DisabledLocations is always empty, and it's important that it isn't nil, so it serialises to the JSON `[]` instead of `null`.
// It's no longer populated due to it being an unnecessary optimization for Traffic Router, but the field is kept for compatibility.
deliveryServiceState.DisabledLocations = []tc.CacheGroupName{}
states.SetDeliveryService(deliveryServiceName, deliveryServiceState)
}
}

func getDisabledLocations(deliveryService tc.DeliveryServiceName, deliveryServiceServers []tc.CacheName, cacheStates map[tc.CacheName]tc.IsAvailable, serverCacheGroups map[tc.CacheName]tc.CacheGroupName) []tc.CacheGroupName {
disabledLocations := []tc.CacheGroupName{} // it's important this isn't nil, so it serialises to the JSON `[]` instead of `null`
dsCacheStates := getDeliveryServiceCacheAvailability(cacheStates, deliveryServiceServers)
dsCachegroupsAvailable := getDeliveryServiceCachegroupAvailability(dsCacheStates, serverCacheGroups)
for cg, avail := range dsCachegroupsAvailable {
if avail {
continue
}
disabledLocations = append(disabledLocations, cg)
}
return disabledLocations
}

func getDeliveryServiceCacheAvailability(cacheStates map[tc.CacheName]tc.IsAvailable, deliveryServiceServers []tc.CacheName) map[tc.CacheName]tc.IsAvailable {
dsCacheStates := map[tc.CacheName]tc.IsAvailable{}
for _, server := range deliveryServiceServers {
dsCacheStates[server] = cacheStates[tc.CacheName(server)]
}
return dsCacheStates
}

func getDeliveryServiceCachegroupAvailability(dsCacheStates map[tc.CacheName]tc.IsAvailable, serverCachegroups map[tc.CacheName]tc.CacheGroupName) map[tc.CacheGroupName]bool {
cgAvail := map[tc.CacheGroupName]bool{}
for cache, available := range dsCacheStates {
cg, ok := serverCachegroups[cache]
if !ok {
log.Errorf("cache %v not found in cachegroups!\n", cache)
continue
}
if _, ok := cgAvail[cg]; !ok || available.IsAvailable {
cgAvail[cg] = available.IsAvailable
}
}
return cgAvail
}
11 changes: 11 additions & 0 deletions traffic_monitor/health/cache_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -417,6 +417,7 @@ func TestCalcAvailabilityThresholds(t *testing.T) {

localCacheStatusThreadsafe := threadsafe.NewCacheAvailableStatus()
localStates := peer.NewCRStatesThreadsafe()
localStates.SetDeliveryService("myDS", tc.CRStatesDeliveryService{})
events := NewThreadsafeEvents(200)

// test that a normal stat poll over the kbps threshold marks down
Expand All @@ -433,6 +434,16 @@ func TestCalcAvailabilityThresholds(t *testing.T) {

CalcAvailability(results, pollerName, statResultHistory, mc, toData, localCacheStatusThreadsafe, localStates, events, config.Both)

// ensure that the DisabledLocations is an empty, non-nil slice
for _, ds := range localStates.GetDeliveryServices() {
if ds.DisabledLocations == nil {
t.Error("expected: non-nil DisabledLocations, actual: nil")
}
if len(ds.DisabledLocations) > 0 {
t.Errorf("expected: empty DisabledLocations, actual: %d", len(ds.DisabledLocations))
}
}

localCacheStatuses := localCacheStatusThreadsafe.Get()
localCacheStatus, ok := localCacheStatuses[result.ID]
if !ok {
Expand Down
18 changes: 0 additions & 18 deletions traffic_monitor/manager/statecombiner.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ package manager

import (
"fmt"
"sort"
"strings"
"time"

Expand Down Expand Up @@ -148,7 +147,6 @@ func combineDSState(
if localDeliveryService.IsAvailable {
deliveryService.IsAvailable = true
}
deliveryService.DisabledLocations = localDeliveryService.DisabledLocations

for peerName, iPeerStates := range peerStates.GetCrstates() {
peerDeliveryService, ok := iPeerStates.DeliveryService[deliveryServiceName]
Expand All @@ -159,7 +157,6 @@ func combineDSState(
if peerDeliveryService.IsAvailable {
deliveryService.IsAvailable = true
}
deliveryService.DisabledLocations = intersection(deliveryService.DisabledLocations, peerDeliveryService.DisabledLocations)
}
combinedStates.SetDeliveryService(deliveryServiceName, deliveryService)
}
Expand Down Expand Up @@ -218,18 +215,3 @@ type CacheGroupNameSlice []tc.CacheGroupName
func (p CacheGroupNameSlice) Len() int { return len(p) }
func (p CacheGroupNameSlice) Less(i, j int) bool { return p[i] < p[j] }
func (p CacheGroupNameSlice) Swap(i, j int) { p[i], p[j] = p[j], p[i] }

// intersection returns strings in both a and b.
// Note this modifies a and b. Specifically, it sorts them. If that isn't acceptable, pass copies of your real data.
func intersection(a []tc.CacheGroupName, b []tc.CacheGroupName) []tc.CacheGroupName {
sort.Sort(CacheGroupNameSlice(a))
sort.Sort(CacheGroupNameSlice(b))
c := []tc.CacheGroupName{} // important to initialize, so JSON is `[]` not `null`
for _, s := range a {
i := sort.Search(len(b), func(i int) bool { return b[i] >= s })
if i < len(b) && b[i] == s {
c = append(c, s)
}
}
return c
}

0 comments on commit ed8d4af

Please sign in to comment.