Introduce WebAPI Plugin Interface, an example & AWS athena plugin (fl…

…yteorg#146) * Proposal: Common plugin interface for all service plugins * Add remote plugin interface and implemention * make properties config-friendly * Rename remote.PluginContext * Simplify ResourceMeta * New task log interface and template plugin * Update spark and pytorch plugins * Proposal: Common plugin interface for all service plugins Co-authored-by: Ketan Umare <[email protected]>
eapolinario · Jan 30, 2021 · 8a49ee2 · 8a49ee2
1 parent 5f27429
commit 8a49ee2
Show file tree

Hide file tree

Showing 59 changed files with 5,714 additions and 550 deletions.
diff --git a/copilot/go.mod b/copilot/go.mod
@@ -8,7 +8,7 @@ require (
 	github.com/gogo/protobuf v1.3.1
 	github.com/golang/protobuf v1.4.2
 	github.com/imdario/mergo v0.3.9 // indirect
-	github.com/lyft/flyteidl v0.18.9
+	github.com/lyft/flyteidl v0.18.11
 	github.com/lyft/flyteplugins v0.4.4
 	github.com/lyft/flytestdlib v0.3.9
 	github.com/mitchellh/go-ps v1.0.0

diff --git a/copilot/go.sum b/copilot/go.sum
diff --git a/go.mod b/go.mod
@@ -7,6 +7,10 @@ require (
 	github.com/Masterminds/semver v1.5.0
 	github.com/aws/amazon-sagemaker-operator-for-k8s v1.0.1-0.20200410212604-780c48ecb21a
 	github.com/aws/aws-sdk-go v1.29.23
+	github.com/aws/aws-sdk-go-v2 v1.0.0
+	github.com/aws/aws-sdk-go-v2/config v1.0.0
+	github.com/aws/aws-sdk-go-v2/service/athena v1.0.0
+	github.com/aws/aws-sdk-go-v2/service/sagemaker v1.0.0 // indirect
 	github.com/coocood/freecache v1.1.0
 	github.com/fsnotify/fsnotify v1.4.9 // indirect
 	github.com/go-test/deep v1.0.5
@@ -32,7 +36,7 @@ require (
 	k8s.io/api v0.17.3
 	k8s.io/apimachinery v0.17.3
 	k8s.io/client-go v11.0.1-0.20190918222721-c0e3722d5cf0+incompatible
-	k8s.io/utils v0.0.0-20200124190032-861946025e34 // indirect
+	k8s.io/utils v0.0.0-20200124190032-861946025e34
 	sigs.k8s.io/controller-runtime v0.5.1
 )
 

diff --git a/go.sum b/go.sum
diff --git a/go/tasks/aws/config.go b/go/tasks/aws/config.go
@@ -5,8 +5,11 @@
 package aws
 
 import (
-	"time"
+	"context"
 
+	"github.com/aws/aws-sdk-go-v2/aws"
+	"github.com/aws/aws-sdk-go-v2/aws/retry"
+	awsConfig "github.com/aws/aws-sdk-go-v2/config"
 	"github.com/lyft/flytestdlib/config"
 
 	pluginsConfig "github.com/lyft/flyteplugins/go/tasks/config"
@@ -18,22 +21,19 @@ const ConfigSectionKey = "aws"
 
 var (
 	defaultConfig = &Config{
-		Region:               "us-east-1",
-		MaxErrorStringLength: 150,
-		Retries:              3,
-		CatalogCacheTimeout:  config.Duration{Duration: time.Second * 5},
+		Region:  "us-east-2",
+		Retries: 3,
 	}
 
 	configSection = pluginsConfig.MustRegisterSubSection(ConfigSectionKey, defaultConfig)
 )
 
 // Config section for AWS Package
 type Config struct {
-	Region               string          `json:"region" pflag:",AWS Region to connect to."`
-	AccountID            string          `json:"accountId" pflag:",AWS Account Identifier."`
-	Retries              int             `json:"retries" pflag:",Number of retries."`
-	MaxErrorStringLength int             `json:"maxErrorLength" pflag:",Maximum size of error messages."`
-	CatalogCacheTimeout  config.Duration `json:"catalog-timeout" pflag:"\"5s\",Timeout duration for checking catalog for all batch tasks"`
+	Region    string            `json:"region" pflag:",AWS Region to connect to."`
+	AccountID string            `json:"accountId" pflag:",AWS Account Identifier."`
+	Retries   int               `json:"retries" pflag:",Number of retries."`
+	LogLevel  aws.ClientLogMode `json:"logLevel" pflag:"-,Defines the Sdk Log Level."`
 }
 
 type RateLimiterConfig struct {
@@ -46,6 +46,22 @@ func GetConfig() *Config {
 	return configSection.GetConfig().(*Config)
 }
 
+func (cfg Config) GetSdkConfig() (aws.Config, error) {
+	sdkConfig, err := awsConfig.LoadDefaultConfig(context.TODO(),
+		awsConfig.WithRegion(cfg.Region),
+		awsConfig.WithRetryer(func() aws.Retryer {
+			return retry.NewStandard(func(options *retry.StandardOptions) {
+				options.MaxAttempts = cfg.Retries
+			})
+		}),
+		awsConfig.WithClientLogMode(cfg.LogLevel))
+	if err != nil {
+		return aws.Config{}, err
+	}
+
+	return sdkConfig, nil
+}
+
 func MustRegisterSubSection(key config.SectionKey, cfg config.Config) config.Section {
 	return configSection.MustRegisterSection(key, cfg)
 }
diff --git a/go/tasks/aws/config_flags.go b/go/tasks/aws/config_flags.go
diff --git a/go/tasks/aws/config_flags_test.go b/go/tasks/aws/config_flags_test.go
diff --git a/go/tasks/pluginmachinery/core/allocationstatus_enumer.go b/go/tasks/pluginmachinery/core/allocationstatus_enumer.go
diff --git a/go/tasks/pluginmachinery/core/resource_manager.go b/go/tasks/pluginmachinery/core/resource_manager.go
@@ -4,21 +4,23 @@ import (
 	"context"
 )
 
-type AllocationStatus string
+//go:generate enumer -type=AllocationStatus -trimprefix=AllocationStatus
+
+type AllocationStatus int
 
 const (
 	// This is the enum returned when there's an error
-	AllocationUndefined AllocationStatus = "ResourceGranted"
+	AllocationUndefined AllocationStatus = iota
 
 	// Go for it
-	AllocationStatusGranted AllocationStatus = "ResourceGranted"
+	AllocationStatusGranted
 
 	// This means that no resources are available globally.  This is the only rejection message we use right now.
-	AllocationStatusExhausted AllocationStatus = "ResourceExhausted"
+	AllocationStatusExhausted
 
 	// We're not currently using this - but this would indicate that things globally are okay, but that your
 	// own namespace is too busy
-	AllocationStatusNamespaceQuotaExceeded AllocationStatus = "NamespaceQuotaExceeded"
+	AllocationStatusNamespaceQuotaExceeded
 )
 
 const namespaceSeparator = ":"

diff --git a/go/tasks/pluginmachinery/internal/webapi/allocation_token.go b/go/tasks/pluginmachinery/internal/webapi/allocation_token.go
@@ -0,0 +1,93 @@
+package webapi
+
+import (
+	"context"
+	"fmt"
+
+	"k8s.io/utils/clock"
+
+	"github.com/lyft/flytestdlib/logger"
+
+	"github.com/lyft/flyteplugins/go/tasks/pluginmachinery/webapi"
+
+	"github.com/lyft/flyteplugins/go/tasks/pluginmachinery/core"
+)
+
+type tokenAllocator struct {
+	clock clock.Clock
+}
+
+func newTokenAllocator(c clock.Clock) tokenAllocator {
+	return tokenAllocator{
+		clock: c,
+	}
+}
+
+func (a tokenAllocator) allocateToken(ctx context.Context, p webapi.AsyncPlugin, tCtx core.TaskExecutionContext, state *State, metrics Metrics) (
+	newState *State, phaseInfo core.PhaseInfo, err error) {
+	if len(p.GetConfig().ResourceQuotas) == 0 {
+		// No quota, return success
+		return &State{
+			AllocationTokenRequestStartTime: a.clock.Now(),
+			Phase:                           PhaseAllocationTokenAcquired,
+		}, core.PhaseInfoQueued(a.clock.Now(), 0, "No allocation token required"), nil
+	}
+
+	ns, constraints, err := p.ResourceRequirements(ctx, tCtx)
+	if err != nil {
+		logger.Errorf(ctx, "Failed to calculate resource requirements for task. Error: %v", err)
+		return nil, core.PhaseInfo{}, err
+	}
+
+	token := tCtx.TaskExecutionMetadata().GetTaskExecutionID().GetGeneratedName()
+	allocationStatus, err := tCtx.ResourceManager().AllocateResource(ctx, ns, token, constraints)
+	if err != nil {
+		logger.Errorf(ctx, "Failed to allocate resources for task. Error: %v", err)
+		return nil, core.PhaseInfo{}, err
+	}
+
+	switch allocationStatus {
+	case core.AllocationStatusGranted:
+		metrics.AllocationGranted.Inc(ctx)
+		metrics.ResourceWaitTime.Observe(float64(a.clock.Since(state.AllocationTokenRequestStartTime).Milliseconds()))
+		return &State{
+			AllocationTokenRequestStartTime: a.clock.Now(),
+			Phase:                           PhaseAllocationTokenAcquired,
+		}, core.PhaseInfoQueued(a.clock.Now(), 0, "Allocation token required"), nil
+	case core.AllocationStatusNamespaceQuotaExceeded:
+	case core.AllocationStatusExhausted:
+		metrics.AllocationNotGranted.Inc(ctx)
+		logger.Infof(ctx, "Couldn't allocate token because allocation status is [%v].", allocationStatus.String())
+		startTime := state.AllocationTokenRequestStartTime
+		if startTime.IsZero() {
+			startTime = a.clock.Now()
+		}
+
+		return &State{
+				AllocationTokenRequestStartTime: startTime,
+				Phase:                           PhaseNotStarted,
+			}, core.PhaseInfoQueued(a.clock.Now(), 0,
+				fmt.Sprintf("Quota for task has exceeded. The request is enqueued.")), nil
+	}
+
+	return nil, core.PhaseInfo{}, fmt.Errorf("allocation status undefined [%v]", allocationStatus)
+}
+
+func (a tokenAllocator) releaseToken(ctx context.Context, p webapi.AsyncPlugin, tCtx core.TaskExecutionContext, metrics Metrics) error {
+	ns, _, err := p.ResourceRequirements(ctx, tCtx)
+	if err != nil {
+		logger.Errorf(ctx, "Failed to calculate resource requirements for task. Error: %v", err)
+		return err
+	}
+
+	token := tCtx.TaskExecutionMetadata().GetTaskExecutionID().GetGeneratedName()
+	err = tCtx.ResourceManager().ReleaseResource(ctx, ns, token)
+	if err != nil {
+		metrics.ResourceReleaseFailed.Inc(ctx)
+		logger.Errorf(ctx, "Failed to release resources for task. Error: %v", err)
+		return err
+	}
+
+	metrics.ResourceReleased.Inc(ctx)
+	return nil
+}