Scrape queue-proxy metrics in autoscaler #3149

Merged Feb 27, 2019 (49 commits)

Changes from 32 commits

Commits:
a82ed49 refactor where we use rate (yanweiguo, Jan 28, 2019)
2896a0d extract concurrencyPerPod (yanweiguo, Jan 28, 2019)
ad11d78 use approximateZero (yanweiguo, Jan 29, 2019)
de81555 use pods count from Informer (yanweiguo, Jan 30, 2019)
9895328 get pods when record (yanweiguo, Jan 30, 2019)
28ee046 merge master (yanweiguo, Jan 30, 2019)
57527b3 use 1 as min actual pod (yanweiguo, Jan 31, 2019)
23c132b add unit test (yanweiguo, Jan 31, 2019)
10f2552 add unit tests for cmd (yanweiguo, Jan 31, 2019)
29d9b52 lint (yanweiguo, Jan 31, 2019)
43844dd address comments (yanweiguo, Feb 1, 2019)
79842cf remove unuse func (yanweiguo, Feb 1, 2019)
e7672ae short locked scope (yanweiguo, Feb 1, 2019)
efa8787 address comment (yanweiguo, Feb 1, 2019)
63e678d wrap func into func (yanweiguo, Feb 1, 2019)
c50e2cd address comments (yanweiguo, Feb 6, 2019)
f1fc3aa add dot (yanweiguo, Feb 6, 2019)
9149930 change algorithm (yanweiguo, Feb 6, 2019)
6d2ca83 Merge branch 'master' into use_actual_pods (yanweiguo, Feb 6, 2019)
efd64dd revert handle file (yanweiguo, Feb 6, 2019)
a094cf8 do not send websocket metrics (yanweiguo, Feb 6, 2019)
8302310 exclude test coverage check for main files (yanweiguo, Feb 6, 2019)
c074299 remove unuse func (yanweiguo, Feb 6, 2019)
359a1af remove websocket stuff in queue (yanweiguo, Feb 6, 2019)
df2f936 bug fixed (yanweiguo, Feb 7, 2019)
8341498 clean cache in tests (yanweiguo, Feb 7, 2019)
090f573 removed cache (yanweiguo, Feb 7, 2019)
9e3cdef merge (yanweiguo, Feb 7, 2019)
6550436 use informer in scraper (yanweiguo, Feb 7, 2019)
71fc715 add unit tests (yanweiguo, Feb 7, 2019)
556fcb6 merge master (yanweiguo, Feb 8, 2019)
adde505 small fixes (yanweiguo, Feb 9, 2019)
cf31e5c change to global average (yanweiguo, Feb 11, 2019)
1eb36c4 extract a function (yanweiguo, Feb 11, 2019)
c63c1ff merge master (yanweiguo, Feb 11, 2019)
31bb24c solve comments (yanweiguo, Feb 11, 2019)
51658c0 address comments (yanweiguo, Feb 12, 2019)
6d61757 merge master (yanweiguo, Feb 21, 2019)
4e0a593 change to 35 seconds (yanweiguo, Feb 22, 2019)
9a4c449 merge master (yanweiguo, Feb 22, 2019)
7dce271 remove a log (yanweiguo, Feb 22, 2019)
be9ec33 add some comment (yanweiguo, Feb 22, 2019)
2e08244 change a test to cover more (yanweiguo, Feb 23, 2019)
e12b80a hide pods behind scraper (yanweiguo, Feb 25, 2019)
3fe1c33 address comment (yanweiguo, Feb 26, 2019)
1b48541 fix a log (yanweiguo, Feb 26, 2019)
3aa2244 merged master (yanweiguo, Feb 26, 2019)
36a41a4 drop none activator stats (yanweiguo, Feb 27, 2019)
71fcbcd fix the test (yanweiguo, Feb 27, 2019)
12 changes: 9 additions & 3 deletions cmd/autoscaler/main.go
@@ -80,6 +80,7 @@ func main() {

// set up signals so we handle the first shutdown signal gracefully
stopCh := signals.SetupSignalHandler()
statsCh := make(chan *autoscaler.StatMessage, statsBufferLen)

cfg, err := clientcmd.BuildConfigFromFlags(*masterURL, *kubeconfig)
if err != nil {
@@ -139,7 +140,8 @@ func main() {
hpaInformer := kubeInformerFactory.Autoscaling().V1().HorizontalPodAutoscalers()

// uniScalerFactory depends on endpointsInformer being set.
multiScaler := autoscaler.NewMultiScaler(dynConfig, stopCh, uniScalerFactoryFunc(endpointsInformer), logger)
multiScaler := autoscaler.NewMultiScaler(
dynConfig, stopCh, statsCh, uniScalerFactoryFunc(endpointsInformer), statsScraperFactoryFunc(endpointsInformer), logger)
kpaScaler := kpa.NewKPAScaler(servingClientSet, scaleClient, logger, configMapWatcher)
kpaCtl := kpa.NewController(&opt, paInformer, endpointsInformer, multiScaler, kpaScaler, dynConfig)
hpaCtl := hpa.NewController(&opt, paInformer, hpaInformer)
@@ -171,8 +173,6 @@ func main() {
return hpaCtl.Run(controllerThreads, stopCh)
})

statsCh := make(chan *autoscaler.StatMessage, statsBufferLen)

statsServer := statserver.New(statsServerAddr, statsCh, logger)
eg.Go(func() error {
return statsServer.ListenAndServe()
@@ -237,6 +237,12 @@ func uniScalerFactoryFunc(endpointsInformer corev1informers.EndpointsInformer) f
}
}

func statsScraperFactoryFunc(endpointsInformer corev1informers.EndpointsInformer) func(metric *autoscaler.Metric, config *autoscaler.DynamicConfig) (autoscaler.StatsScraper, error) {
return func(metric *autoscaler.Metric, config *autoscaler.DynamicConfig) (autoscaler.StatsScraper, error) {
return autoscaler.NewServiceScraper(metric, config, endpointsInformer)
}
}

func labelValueOrEmpty(metric *autoscaler.Metric, labelKey string) string {
if metric.Labels != nil {
if value, ok := metric.Labels[labelKey]; ok {
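Taken together, the cmd/autoscaler changes move the statsCh construction ahead of the MultiScaler so that push-based stats (from the websocket stats server) and pull-based stats (from the scrapers created by statsScraperFactoryFunc) can land on the same channel. The snippet below is a hypothetical sketch of what such a per-revision scrape loop could look like; the StatsScraper interface, StatMessage type, and the one-second scrape period are simplified stand-ins, not the actual knative definitions.

package stats

import "time"

// StatMessage and StatsScraper are simplified stand-ins for the real
// autoscaler types; only the shape needed by the loop is shown.
type StatMessage struct {
	Key string // namespace/name of the revision the stats belong to
}

type StatsScraper interface {
	Scrape() (*StatMessage, error)
}

// scrapeLoop polls the scraper on every tick and forwards samples to the
// shared stats channel until stopCh is closed.
func scrapeLoop(scraper StatsScraper, statsCh chan<- *StatMessage, stopCh <-chan struct{}) {
	ticker := time.NewTicker(time.Second) // assumed scrape period
	defer ticker.Stop()
	for {
		select {
		case <-ticker.C:
			sm, err := scraper.Scrape()
			if err != nil || sm == nil {
				continue // tolerate a missed sample; the next tick retries
			}
			statsCh <- sm
		case <-stopCh:
			return
		}
	}
}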
45 changes: 2 additions & 43 deletions cmd/queue/main.go
@@ -18,7 +18,6 @@ package main

import (
"context"
"errors"
"flag"
"fmt"
"net/http"
@@ -31,7 +30,6 @@ import (
"github.com/knative/pkg/signals"

"github.com/knative/pkg/logging/logkey"
"github.com/knative/pkg/websocket"
"github.com/knative/serving/cmd/util"
activatorutil "github.com/knative/serving/pkg/activator/util"
"github.com/knative/serving/pkg/apis/serving/v1alpha1"
@@ -40,7 +38,6 @@ import (
"github.com/knative/serving/pkg/logging"
"github.com/knative/serving/pkg/queue"
"github.com/knative/serving/pkg/queue/health"
"github.com/knative/serving/pkg/utils"
"go.opencensus.io/exporter/prometheus"
"go.opencensus.io/stats/view"
"go.uber.org/zap"
@@ -61,10 +58,6 @@ const (
// from its configuration and propagate that to all istio-proxies
// in the mesh.
quitSleepDuration = 20 * time.Second

// Only report errors about a non-existent websocket connection after
// having been up and running for this long.
startupConnectionGrace = 10 * time.Second
)

var (
@@ -82,7 +75,6 @@
revisionTimeoutSeconds int
statChan = make(chan *autoscaler.Stat, statReportingQueueLength)
reqChan = make(chan queue.ReqEvent, requestCountingQueueLength)
statSink *websocket.ManagedConnection
logger *zap.SugaredLogger
breaker *queue.Breaker

@@ -121,35 +113,12 @@ func initEnv() {
func statReporter() {
for {
s := <-statChan
if err := sendStat(s); err != nil {
// Hide "not-established" errors until the startupConnectionGrace has passed.
if err != websocket.ErrConnectionNotEstablished || time.Since(startupTime) > startupConnectionGrace {
logger.Errorw("Error while sending stat", zap.Error(err))
}
if err := reporter.Report(float64(s.RequestCount), s.AverageConcurrentRequests); err != nil {
logger.Errorw("Error while sending stat", zap.Error(err))
}
}
}

// sendStat sends a single StatMessage to the autoscaler.
func sendStat(s *autoscaler.Stat) error {
if statSink == nil {
return errors.New("stat sink not (yet) connected")
}
reporter.Report(
float64(s.RequestCount),
float64(s.AverageConcurrentRequests),
)
if healthState.IsShuttingDown() {
// Do not send metrics if the pods is shutting down.
return nil
}
sm := autoscaler.StatMessage{
Stat: *s,
Key: servingRevisionKey,
}
return statSink.Send(sm)
}

func proxyForRequest(req *http.Request) *httputil.ReverseProxy {
if req.ProtoMajor == 2 {
return h2cProxy
@@ -277,10 +246,6 @@ func main() {
http.ListenAndServe(fmt.Sprintf(":%d", v1alpha1.RequestQueueMetricsPort), mux)
}()

// Open a websocket connection to the autoscaler
autoscalerEndpoint := fmt.Sprintf("ws://%s.%s.svc.%s:%d", servingAutoscaler, autoscalerNamespace, utils.GetClusterDomainName(), servingAutoscalerPort)
logger.Infof("Connecting to autoscaler at %s", autoscalerEndpoint)
statSink = websocket.NewDurableSendingConnection(autoscalerEndpoint)
go statReporter()

reportTicker := time.NewTicker(queue.ReporterReportingPeriod).C
@@ -328,11 +293,5 @@ func main() {
if err := adminServer.Shutdown(context.Background()); err != nil {
logger.Errorw("Failed to shutdown admin-server", zap.Error(err))
}

if statSink != nil {
if err := statSink.Close(); err != nil {
logger.Errorw("Failed to shutdown websocket connection", zap.Error(err))
}
}
}
}
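With the websocket sink gone, cmd/queue/main.go no longer pushes StatMessages to the autoscaler at all: statReporter only records the request count and average concurrency with the Prometheus reporter, and the autoscaler scrapes those values from the queue-proxy's metrics port. Below is a minimal sketch of that exposition side, assuming the same opencensus Prometheus exporter the file already imports; the port and namespace are illustrative, and the real code uses v1alpha1.RequestQueueMetricsPort.

package main

import (
	"fmt"
	"log"
	"net/http"

	"go.opencensus.io/exporter/prometheus"
	"go.opencensus.io/stats/view"
)

func main() {
	// Register a Prometheus exporter so recorded opencensus views become
	// scrapeable metrics.
	exporter, err := prometheus.NewExporter(prometheus.Options{Namespace: "queue"})
	if err != nil {
		log.Fatal("failed to create the Prometheus exporter: ", err)
	}
	view.RegisterExporter(exporter)

	// The exporter doubles as an http.Handler serving the metrics endpoint.
	mux := http.NewServeMux()
	mux.Handle("/metrics", exporter)
	log.Fatal(http.ListenAndServe(fmt.Sprintf(":%d", 9090), mux))
}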
135 changes: 52 additions & 83 deletions pkg/autoscaler/autoscaler.go
@@ -36,14 +36,6 @@ const (
// as defined in the metrics it sends.
ActivatorPodName string = "activator"

// If the latest received stat from a pod is in the last activeThreshold duration,
// assume the pod is still active. Otherwise, the active status of a pod is
// unknown.
activeThreshold time.Duration = time.Second

// Activator pod weight is always 1
activatorPodWeight float64 = 1

approximateZero = 1e-8
)

@@ -65,6 +57,10 @@ type Stat struct {
// Lameduck indicates this Pod has received a shutdown signal.
// Deprecated and no longer used by newly created Pods.
LameDuck bool

// Average number of requests currently being handled by all ready pods of a
// revision.
AverageRevConcurrency float64
}

// StatMessage wraps a Stat with identifying information so it can be routed
@@ -113,92 +109,68 @@ func (agg *totalAggregation) aggregate(stat Stat) {
agg.probeCount++
}

// The number of pods that are observable via stats
// Subtracts the activator pod if its not the only pod reporting stats
func (agg *totalAggregation) observedPods(now time.Time) float64 {
podCount := float64(0.0)
for _, pod := range agg.perPodAggregations {
podCount += pod.podWeight(now)
}

activatorsCount := len(agg.activatorsContained)
// Discount the activators in the pod count.
if activatorsCount > 0 {
discountedPodCount := podCount - float64(activatorsCount)
// Report a minimum of 1 pod if the activators are sending metrics.
if discountedPodCount < 1.0 {
return 1.0
}
return discountedPodCount
}
return podCount
}

// The observed concurrency of a revision (sum of all average concurrencies of
// the observed pods)
// Ignores activator sent metrics if its not the only pod reporting stats
func (agg *totalAggregation) observedConcurrency(now time.Time) float64 {
accumulatedConcurrency := float64(0)
// The observed concurrency of a revision and the observed concurrency per pod.
// Ignores activator-sent metrics if the activator is not the only pod reporting stats.
func (agg *totalAggregation) observedConcurrency() (float64, float64) {
accumulatedPodConcurrency := float64(0)
accumulatedRevConcurrency := float64(0)
activatorConcurrency := float64(0)
for podName, perPod := range agg.perPodAggregations {
if isActivator(podName) {
activatorConcurrency += perPod.calculateAverage(now)
samplePodCount := 0
for _, perPod := range agg.perPodAggregations {
if perPod.isActivator {
activatorConcurrency += perPod.averagePodConcurrency()
} else {
accumulatedConcurrency += perPod.calculateAverage(now)
accumulatedPodConcurrency += perPod.averagePodConcurrency()
accumulatedRevConcurrency += perPod.averageRevConcurrency()
samplePodCount++
}
}
if accumulatedConcurrency < approximateZero {
return activatorConcurrency
if samplePodCount != 0 {
accumulatedPodConcurrency = accumulatedPodConcurrency / float64(samplePodCount)
accumulatedRevConcurrency = accumulatedRevConcurrency / float64(samplePodCount)
}
return accumulatedConcurrency
}

// The observed concurrency per pod (sum of all average concurrencies
// distributed over the observed pods)
// Ignores activator sent metrics if its not the only pod reporting stats
func (agg *totalAggregation) observedConcurrencyPerPod(now time.Time) float64 {
return divide(agg.observedConcurrency(now), agg.observedPods(now))
if accumulatedPodConcurrency < approximateZero {
// Activator is the only pod reporting stats.
return activatorConcurrency, activatorConcurrency
}
return accumulatedRevConcurrency, accumulatedPodConcurrency
}

// Holds an aggregation per pod
type perPodAggregation struct {
accumulatedConcurrency float64
probeCount int32
window time.Duration
latestStatTime *time.Time
isActivator bool
accumulatedPodConcurrency float64
accumulatedRevConcurrency float64
probeCount int32
window time.Duration
latestStatTime *time.Time
isActivator bool
}

// Aggregates the given concurrency
func (agg *perPodAggregation) aggregate(stat Stat) {
agg.accumulatedConcurrency += stat.AverageConcurrentRequests
agg.accumulatedPodConcurrency += stat.AverageConcurrentRequests
agg.accumulatedRevConcurrency += stat.AverageRevConcurrency
agg.probeCount++
if agg.latestStatTime == nil || agg.latestStatTime.Before(*stat.Time) {
agg.latestStatTime = stat.Time
}
}

// Calculates the average concurrency over all values given
func (agg *perPodAggregation) calculateAverage(now time.Time) float64 {
// Calculates the average concurrency at the pod level over all values given.
func (agg *perPodAggregation) averagePodConcurrency() float64 {
if agg.probeCount == 0 {
return 0.0
}
return agg.accumulatedConcurrency / float64(agg.probeCount) * agg.podWeight(now)
return agg.accumulatedPodConcurrency / float64(agg.probeCount)
}

// Calculates the pod weight. Assuming the latest stat time is the point when
// pod became out of service.
func (agg *perPodAggregation) podWeight(now time.Time) float64 {
if agg.isActivator {
return activatorPodWeight
}

gapToNow := now.Sub(*agg.latestStatTime)
// Less than activeThreshold means the pod is active, give 1 weight
if gapToNow <= activeThreshold {
return 1.0
// Calculates the average concurrency at the revision level over all values given.
func (agg *perPodAggregation) averageRevConcurrency() float64 {
if agg.probeCount == 0 {
return 0.0
}
return 1.0 - (float64(gapToNow) / float64(agg.window))
return agg.accumulatedRevConcurrency / float64(agg.probeCount)
}

// Autoscaler stores current state of an instance of an autoscaler
Expand Down Expand Up @@ -283,13 +255,15 @@ func (a *Autoscaler) Scale(ctx context.Context, now time.Time) (int32, bool) {
config := a.Current()

stableData, panicData, lastStat := a.aggregateData(now, config.StableWindow, config.PanicWindow)
observedStablePods := stableData.observedPods(now)
// Do nothing when we have no data.
if observedStablePods < 1.0 {
if stableData.probeCount < 1 {
logger.Debug("No data to scale on.")
return 0, false
}

observedStableConcurrency, observedStableConcurrencyPerPod := stableData.observedConcurrency()
observedPanicConcurrency, observedPanicConcurrencyPerPod := panicData.observedConcurrency()

// Log system totals
totalCurrentQPS := int32(0)
totalCurrentConcurrency := float64(0)
@@ -299,27 +273,20 @@ func (a *Autoscaler) Scale(ctx context.Context, now time.Time) (int32, bool) {
}
logger.Debugf("Current QPS: %v Current concurrent clients: %v", totalCurrentQPS, totalCurrentConcurrency)

observedPanicPods := panicData.observedPods(now)
observedStableConcurrency := stableData.observedConcurrency(now)
observedPanicConcurrency := panicData.observedConcurrency(now)
observedStableConcurrencyPerPod := stableData.observedConcurrencyPerPod(now)
observedPanicConcurrencyPerPod := panicData.observedConcurrencyPerPod(now)

target := a.targetConcurrency()
// Desired pod count is the observed revision concurrency over the desired (stable) concurrency per pod.
// The scale-up rate is limited to MaxScaleUpRate.
desiredStablePodCount := a.podCountLimited(observedStableConcurrency/target, readyPods)
desiredPanicPodCount := a.podCountLimited(observedPanicConcurrency/target, readyPods)

a.reporter.ReportObservedPodCount(observedStablePods)
a.reporter.ReportStableRequestConcurrency(observedStableConcurrencyPerPod)
a.reporter.ReportPanicRequestConcurrency(observedPanicConcurrencyPerPod)
a.reporter.ReportTargetRequestConcurrency(target)

logger.Debugf("STABLE: Observed average %0.3f concurrency over %v seconds over %v samples over %v pods.",
observedStableConcurrencyPerPod, config.StableWindow, stableData.probeCount, observedStablePods)
logger.Debugf("PANIC: Observed average %0.3f concurrency over %v seconds over %v samples over %v pods.",
observedPanicConcurrencyPerPod, config.PanicWindow, panicData.probeCount, observedPanicPods)
logger.Debugf("STABLE: Observed average %0.3f concurrency over %v seconds over %v samples.",
observedStableConcurrencyPerPod, config.StableWindow, stableData.probeCount)
logger.Debugf("PANIC: Observed average %0.3f concurrency over %v seconds over %v samples.",
observedPanicConcurrencyPerPod, config.PanicWindow, panicData.probeCount)

// Stop panicking after the surge has made its way into the stable metric.
if a.panicking && a.panicTime.Add(config.StableWindow).Before(now) {
@@ -330,7 +297,7 @@ func (a *Autoscaler) Scale(ctx context.Context, now time.Time) (int32, bool) {
}

// Begin panicking when we cross the 6 second concurrency threshold.
if !a.panicking && observedPanicPods > 0.0 && observedPanicConcurrencyPerPod >= (target*2) {
if !a.panicking && panicData.probeCount > 0 && observedPanicConcurrencyPerPod >= (target*2) {
logger.Info("PANICKING")
a.panicking = true
a.panicTime = &now
@@ -342,7 +309,7 @@ func (a *Autoscaler) Scale(ctx context.Context, now time.Time) (int32, bool) {
logger.Debug("Operating in panic mode.")
a.reporter.ReportPanic(1)
if desiredPanicPodCount > a.maxPanicPods {
logger.Infof("Increasing pods from %v to %v.", observedPanicPods, int(desiredPanicPodCount))
logger.Infof("Increasing pods from %v to %v.", readyPods, int(desiredPanicPodCount))
a.panicTime = &now
a.maxPanicPods = desiredPanicPodCount
}
@@ -404,6 +371,8 @@ func (a *Autoscaler) podCountLimited(desiredPodCount, currentPodCount float64) f
return math.Min(desiredPodCount, a.Current().MaxScaleUpRate*currentPodCount)
}

// readyPods returns the ready IP count in the K8s Endpoints object for a Revision,
// as obtained via the K8s Informer. This is the same as the ready Pod count.
func (a *Autoscaler) readyPods() (float64, error) {
readyPods := 0
endpoints, err := a.endpointsLister.Endpoints(a.namespace).Get(a.revisionService)
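The pkg/autoscaler rewrite replaces the time-decayed podWeight/observedPods estimate with plain averages over the pods that actually reported: observedConcurrency now returns both the revision-level and per-pod averages, and the ready pod count comes from the Endpoints informer. The shape of the scale decision is unchanged: desired pods are the observed concurrency over the per-pod target, clamped to MaxScaleUpRate times the current pod count. A small illustrative sketch of that arithmetic follows; all numbers are made up.

package main

import (
	"fmt"
	"math"
)

// podCountLimited mirrors the helper above: never scale up by more than
// maxScaleUpRate times the current pod count in a single decision.
func podCountLimited(desired, current, maxScaleUpRate float64) float64 {
	return math.Min(desired, maxScaleUpRate*current)
}

func main() {
	observedRevConcurrency := 60.0 // average concurrency across the revision
	target := 10.0                 // desired concurrency per pod
	readyPods := 4.0               // ready IPs from the Endpoints informer
	maxScaleUpRate := 2.0          // at most double the pods per decision

	desired := podCountLimited(observedRevConcurrency/target, readyPods, maxScaleUpRate)
	fmt.Printf("desired pods: %v\n", desired) // min(60/10, 2*4) = 6
}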