From 168e6e17178cc8fe8d18470a313133a1b0fa3d3d Mon Sep 17 00:00:00 2001 From: Igor Zibarev Date: Tue, 9 Jan 2018 10:18:07 +0300 Subject: [PATCH] Implement reindex command Ref: #22 --- README.md | 9 +- cmd/helms3/delete.go | 2 +- cmd/helms3/main.go | 11 +++ cmd/helms3/proxy.go | 5 ++ cmd/helms3/push.go | 21 +++-- cmd/helms3/reindex.go | 70 +++++++++++++++ docs/best-practice.md | 41 +++++++++ docs/usage-cost.md | 19 ++++ pkg/awss3/storage.go | 203 ++++++++++++++++++++++++++++++++++++++++++ 9 files changed, 371 insertions(+), 10 deletions(-) create mode 100644 cmd/helms3/reindex.go create mode 100644 docs/best-practice.md create mode 100644 docs/usage-cost.md diff --git a/README.md b/README.md index 89b45140..ec3ea762 100644 --- a/README.md +++ b/README.md @@ -110,7 +110,14 @@ The chart is deleted from the repo: ## Uninstall $ helm plugin remove s3 - + +## Documentation + +Additional documentation is available in the [docs](docs) directory. This currently includes: +- estimated [usage cost calculation](docs/usage-cost.md) +- [best practices](docs/best-practice.md) +for organizing your repositories. + ## Contributing Contributions are welcome. 
Please see [these instructions](.github/CONTRIBUTING.md) diff --git a/cmd/helms3/delete.go b/cmd/helms3/delete.go index 08c64f11..016ec986 100644 --- a/cmd/helms3/delete.go +++ b/cmd/helms3/delete.go @@ -59,7 +59,7 @@ func runDelete(name, version, repoName string) error { if err := storage.Delete(ctx, uri); err != nil { return errors.WithMessage(err, "delete chart file from s3") } - if _, err := storage.Upload(ctx, repoEntry.URL+"/index.yaml", idxReader); err != nil { + if err := storage.PutIndex(ctx, repoEntry.URL, idxReader); err != nil { return errors.WithMessage(err, "upload new index to s3") } diff --git a/cmd/helms3/main.go b/cmd/helms3/main.go index 774d0fca..45407bab 100644 --- a/cmd/helms3/main.go +++ b/cmd/helms3/main.go @@ -17,6 +17,7 @@ const ( actionVersion = "version" actionInit = "init" actionPush = "push" + actionReindex = "reindex" actionDelete = "delete" defaultTimeout = time.Second * 5 @@ -46,6 +47,11 @@ func main() { Required(). String() + reindexCmd := cli.Command(actionReindex, "Reindex the repository.") + reindexTargetRepository := reindexCmd.Arg("repo", "Target repository to reindex"). + Required(). + String() + deleteCmd := cli.Command(actionDelete, "Delete chart from the repository.").Alias("del") deleteChartName := deleteCmd.Arg("chartName", "Name of chart to delete"). Required(). 
@@ -81,6 +87,11 @@ func main() { } return + case actionReindex: + if err := runReindex(*reindexTargetRepository); err != nil { + log.Fatal(err) + } + case actionDelete: if err := runDelete(*deleteChartName, *deleteChartVersion, *deleteTargetRepository); err != nil { log.Fatal(err) diff --git a/cmd/helms3/proxy.go b/cmd/helms3/proxy.go index db4a896a..1b46bf4f 100644 --- a/cmd/helms3/proxy.go +++ b/cmd/helms3/proxy.go @@ -3,6 +3,8 @@ package main import ( "context" "fmt" + "path" + "strings" "github.com/pkg/errors" @@ -18,6 +20,9 @@ func runProxy(uri string) error { b, err := storage.FetchRaw(ctx, uri) if err != nil { + if strings.HasSuffix(uri, "index.yaml") && err == awss3.ErrObjectNotFound { + return fmt.Errorf("The index file does not exist by the path %s. If you haven't initialized the repository yet, try running \"helm s3 init %s\"", uri, path.Dir(uri)) + } return errors.WithMessage(err, "fetch from s3") } diff --git a/cmd/helms3/push.go b/cmd/helms3/push.go index ae4d1419..70ce1b88 100644 --- a/cmd/helms3/push.go +++ b/cmd/helms3/push.go @@ -2,6 +2,7 @@ package main import ( "context" + "encoding/json" "fmt" "os" "path/filepath" @@ -52,25 +53,29 @@ func runPush(chartPath string, repoName string) error { return err } + hash, err := provenance.DigestFile(fname) + if err != nil { + return errors.WithMessage(err, "get chart digest") + } + fchart, err := os.Open(fname) if err != nil { return errors.Wrap(err, "open chart file") } - if _, err := storage.Upload(ctx, repoEntry.URL+"/"+fname, fchart); err != nil { + serializedChartMeta, err := json.Marshal(chart.Metadata) + if err != nil { + return errors.Wrap(err, "encode chart metadata to json") + } + + if _, err := storage.PutChart(ctx, repoEntry.URL+"/"+fname, fchart, string(serializedChartMeta), hash); err != nil { return errors.WithMessage(err, "upload chart to s3") } - // Next, update the repository index. 
// The gap between index fetching and uploading should be as small as // possible to make the best effort to avoid race conditions. // See https://github.com/hypnoglow/helm-s3/issues/18 for more info. - hash, err := provenance.DigestFile(fname) - if err != nil { - return errors.WithMessage(err, "get chart digest") - } - // Fetch current index, update it and upload it back. b, err := storage.FetchRaw(ctx, repoEntry.URL+"/index.yaml") @@ -91,7 +96,7 @@ func runPush(chartPath string, repoName string) error { return errors.WithMessage(err, "get index reader") } - if _, err := storage.Upload(ctx, repoEntry.URL+"/index.yaml", idxReader); err != nil { + if err := storage.PutIndex(ctx, repoEntry.URL, idxReader); err != nil { return errors.WithMessage(err, "upload index to s3") } diff --git a/cmd/helms3/reindex.go b/cmd/helms3/reindex.go new file mode 100644 index 00000000..2d344d63 --- /dev/null +++ b/cmd/helms3/reindex.go @@ -0,0 +1,70 @@ +package main + +import ( + "context" + "time" + + "github.com/pkg/errors" + + "github.com/hypnoglow/helm-s3/pkg/awss3" + "github.com/hypnoglow/helm-s3/pkg/awsutil" + "github.com/hypnoglow/helm-s3/pkg/helmutil" + "github.com/hypnoglow/helm-s3/pkg/index" +) + +const ( + reindexCommandDefaultTimeout = time.Second * 15 +) + +func runReindex(repoName string) error { + // Just one big timeout for the whole operation. 
+ ctx, cancel := context.WithTimeout(context.Background(), reindexCommandDefaultTimeout) + defer cancel() + + ctx = ctx + + repoEntry, err := helmutil.LookupRepoEntry(repoName) + if err != nil { + return err + } + + awsConfig, err := awsutil.Config() + if err != nil { + return errors.Wrap(err, "get aws config") + } + + storage := awss3.NewStorage(awsConfig) + + items := make(chan awss3.ChartInfo, 1) + errs := make(chan error, 1) + + go storage.Traverse(ctx, repoEntry.URL, items, errs) + + builtIndex := make(chan *index.Index, 1) + go func() { + idx := index.New() + for item := range items { + idx.Add(item.Meta, item.Filename, repoEntry.URL, item.Hash) + } + idx.SortEntries() + + builtIndex <- idx + }() + + for err := range errs { + return errors.Wrap(err, "traverse the chart repository") + } + + idx := <-builtIndex + + r, err := idx.Reader() + if err != nil { + return errors.Wrap(err, "get index reader") + } + + if err := storage.PutIndex(ctx, repoEntry.URL, r); err != nil { + return errors.Wrap(err, "upload index to the repository") + } + + return nil +} diff --git a/docs/best-practice.md b/docs/best-practice.md new file mode 100644 index 00000000..0dfd0201 --- /dev/null +++ b/docs/best-practice.md @@ -0,0 +1,41 @@ +# Best Practice + +## Reindexing your repository + +In short, due to limitations of AWS your chart repository index can be broken +by accident. This means that it may not reflect the "real" state of your chart +files in S3 bucket. Nothing serious, but can be annoying. + +To work around this, the `helm s3 reindex ` command is available. *Note: this +operation is [much more expensive](usage-cost.md#reindex) than others in this plugin*. + +## Organizing your repositories + +A chart repository file structure is always flat. +It cannot contain nested directories. + +The number of AWS S3 requests for reindex operation depends on your repository structure. 
+Due to limitations of AWS S3 API you cannot list objects of the folder under the key + excluding subfolders. `ListObjects` can only list objects under the key recursively. + +The plugin code does its best to ignore subfolders, because chart repository is always flat. +But still, not all cases are covered. + +Imagine the worst case scenario: you have 100 chart files in your repository, which is the +bucket root. And 1 million files in the "foo-bar" subfolder, which are not related to +the chart repository. In this case the plugin **has to** call `ListObjects` +about 1000 times (1000 objects per call) to make sure it did not miss any chart file. + +By that, the golden rule is to **never have subfolders in your chart repository folder**. + +So, there are two good options for your chart repository file structure inside S3 bucket: + +1. One bucket - one repository. Create a bucket "yourcompany-charts-stable", or +"yourcompany-productname-charts" and use the bucket root as your chart repository. +In this case, never put any other files in that bucket. + +2. One bucket - many repositories, each in a separate subfolder. Create a bucket +"yourcompany-charts". Create a subfolder in it for each repository you need, for +example "stable" and "testing". Another option is to separate the repositories +by the product or by group of services, for example "backoffice", "order-processing", etc. +And again, never put any other files in the repository folder. \ No newline at end of file diff --git a/docs/usage-cost.md b/docs/usage-cost.md new file mode 100644 index 00000000..ca1b2fc0 --- /dev/null +++ b/docs/usage-cost.md @@ -0,0 +1,19 @@ +# Usage pricing + +I hope this document helps you to calculate the AWS S3 usage cost for your use case. + +Disclaimer: the plugin author is not responsible for your unexpected expenses. 
+ +**Make sure to consult the pricing for your region [here](https://aws.amazon.com/s3/pricing)!** + +## Reindex + +The `helm s3 reindex ` command is a much more expensive operation than others in +this plugin. For example, reindexing a repository with 1000 chart files in it +results in 1 GET (`ListObjects`) request and 1000 HEAD (`HeadObject`) requests. +Plus it can make additional GET (`GetObject`) requests if it did not find +required metadata in the HEAD request response. + +At the time of writing this document, the price for HEAD/GET requests in `eu-central-1` is `$0.0043 for 10 000 requests`. +So the whole reindex operation for this case may cost approximately **$0.00043** or even **$0.00086**. +This seems small, but multiple reindex operations per day may hurt your budget. \ No newline at end of file diff --git a/pkg/awss3/storage.go b/pkg/awss3/storage.go index 6c79a6d5..840e2c01 100644 --- a/pkg/awss3/storage.go +++ b/pkg/awss3/storage.go @@ -1,21 +1,32 @@ package awss3 import ( + "bytes" "context" + "encoding/json" "fmt" "io" "net/url" "strings" "github.com/aws/aws-sdk-go/aws" + "github.com/aws/aws-sdk-go/aws/awserr" "github.com/aws/aws-sdk-go/aws/session" "github.com/aws/aws-sdk-go/service/s3" "github.com/aws/aws-sdk-go/service/s3/s3manager" "github.com/pkg/errors" + "k8s.io/helm/pkg/chartutil" + "k8s.io/helm/pkg/proto/hapi/chart" + "k8s.io/helm/pkg/provenance" "github.com/hypnoglow/helm-s3/pkg/awsutil" ) +var ( + ErrBucketNotFound = errors.New("bucket not found") + ErrObjectNotFound = errors.New("object not found") +) + // New returns a new Storage. func New() *Storage { return &Storage{} @@ -26,6 +37,126 @@ type Storage struct { session *session.Session } +// Traverse traverses all charts in the repository. +// It writes an info item about every chart to items, and errors to errs. +// It always closes both channels when it returns. 
+func (s *Storage) Traverse(ctx context.Context, repoURI string, items chan<- ChartInfo, errs chan<- error) { + defer close(items) + defer close(errs) + + if err := s.initSession(); err != nil { + errs <- err + return + } + + bucket, key, err := parseURI(repoURI) + if err != nil { + errs <- err + return + } + + client := s3.New(s.session) + + var continuationToken *string + for { + listOut, err := client.ListObjectsV2(&s3.ListObjectsV2Input{ + Bucket: aws.String(bucket), + Prefix: aws.String(key), + ContinuationToken: continuationToken, + }) + if err != nil { + errs <- errors.Wrap(err, "list s3 bucket objects") + return + } + + for _, obj := range listOut.Contents { + if strings.Contains(*obj.Key, "/") { + // This is a subfolder. Ignore it, because chart repository + // is flat and cannot contain nested directories. + continue + } + if *obj.Key == "index.yaml" { + // Ignore the index itself. + continue + } + + metaOut, err := client.HeadObject(&s3.HeadObjectInput{ + Bucket: aws.String(bucket), + Key: obj.Key, + }) + if err != nil { + errs <- errors.Wrap(err, "head s3 object") + return + } + + reindexItem := ChartInfo{Filename: *obj.Key} + + // PROCESS THE OBJECT + serializedChartMeta, hasMeta := metaOut.Metadata[strings.Title(metaChartMetadata)] + chartDigest, hasDigest := metaOut.Metadata[strings.Title(metaChartDigest)] + if !hasMeta || !hasDigest { + // TODO: This is deprecated. Remove in next major release? Or not? + // All charts pushed to the repository + // since "reindex" command implementation should have these + // meta fields. + // But should we support the case when user manually uploads + // the ch to the bucket? In this case, there will be no + // such meta fields. + + // Anyway, in this case we have to download the ch file itself. 
+ objectOut, err := client.GetObject(&s3.GetObjectInput{ + Bucket: aws.String(bucket), + Key: obj.Key, + }) + if err != nil { + errs <- errors.Wrap(err, "get s3 object") + return + } + + buf := &bytes.Buffer{} + tr := io.TeeReader(objectOut.Body, buf) + + ch, err := chartutil.LoadArchive(tr) + objectOut.Body.Close() + if err != nil { + errs <- errors.Wrap(err, "load archive from s3 object") + return + } + + reindexItem.Meta = ch.Metadata + reindexItem.Hash, err = provenance.Digest(buf) + if err != nil { + errs <- errors.WithMessage(err, "get chart hash") + return + } + } else { + reindexItem.Meta = &chart.Metadata{} + if err := json.Unmarshal([]byte(*serializedChartMeta), reindexItem.Meta); err != nil { + errs <- errors.Wrap(err, "unserialize chart meta") + return + } + + reindexItem.Hash = *chartDigest + } + + // process meta and hash + items <- reindexItem + } + + // Decide if need to load more objects. + if listOut.NextContinuationToken == nil { + break + } + continuationToken = listOut.NextContinuationToken + } +} + +type ChartInfo struct { + Meta *chart.Metadata + Filename string + Hash string +} + // FetchRaw downloads the object from URI and returns it in the form of byte slice. // uri must be in the form of s3 protocol: s3://bucket-name/key[...]. func (s *Storage) FetchRaw(ctx context.Context, uri string) ([]byte, error) { @@ -47,6 +178,14 @@ func (s *Storage) FetchRaw(ctx context.Context, uri string) ([]byte, error) { Key: aws.String(key), }) if err != nil { + if ae, ok := err.(awserr.Error); ok { + if ae.Code() == s3.ErrCodeNoSuchBucket { + return nil, ErrBucketNotFound + } + if ae.Code() == s3.ErrCodeNoSuchKey { + return nil, ErrObjectNotFound + } + } return nil, errors.Wrap(err, "fetch object from s3") } @@ -55,6 +194,8 @@ func (s *Storage) FetchRaw(ctx context.Context, uri string) ([]byte, error) { // Upload uploads the object read from r to S3 by path uri. // uri must be in the form of s3 protocol: s3://bucket-name/key[...]. 
+// +// Deprecated: use PutChart or PutIndex instead. func (s *Storage) Upload(ctx context.Context, uri string, r io.Reader) (string, error) { if err := s.initSession(); err != nil { return "", err @@ -79,6 +220,63 @@ func (s *Storage) Upload(ctx context.Context, uri string, r io.Reader) (string, return result.Location, nil } +func (s *Storage) PutChart(ctx context.Context, uri string, r io.Reader, chartMeta, chartDigest string) (string, error) { + if err := s.initSession(); err != nil { + return "", err + } + + bucket, key, err := parseURI(uri) + if err != nil { + return "", err + } + + result, err := s3manager.NewUploader(s.session).UploadWithContext( + ctx, + &s3manager.UploadInput{ + Bucket: aws.String(bucket), + Key: aws.String(key), + Body: r, + Metadata: map[string]*string{ + metaChartMetadata: aws.String(chartMeta), + metaChartDigest: aws.String(chartDigest), + }, + }) + if err != nil { + return "", errors.Wrap(err, "upload object to s3") + } + + return result.Location, nil +} + +func (s *Storage) PutIndex(ctx context.Context, uri string, r io.Reader) error { + if strings.HasSuffix(uri, "index.yaml") { + return errors.New("uri must not contain \"index.yaml\" suffix, it appends automatically") + } + uri += "/index.yaml" + + if err := s.initSession(); err != nil { + return err + } + + bucket, key, err := parseURI(uri) + if err != nil { + return err + } + + _, err = s3manager.NewUploader(s.session).UploadWithContext( + ctx, + &s3manager.UploadInput{ + Bucket: aws.String(bucket), + Key: aws.String(key), + Body: r, + }) + if err != nil { + return errors.Wrap(err, "upload index to S3 bucket") + } + + return nil +} + // Delete deletes the object by uri. // uri must be in the form of s3 protocol: s3://bucket-name/key[...]. 
func (s *Storage) Delete(ctx context.Context, uri string) error { @@ -130,3 +328,8 @@ func parseURI(uri string) (bucket, key string, err error) { bucket, key = u.Host, strings.TrimPrefix(u.Path, "/") return bucket, key, nil } + +const ( + metaChartMetadata = "chart-metadata" + metaChartDigest = "chart-digest" +)