From eb2257868f9d917038476d8c05ddf6c1baecdca6 Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Mon, 27 May 2019 13:33:00 +0200 Subject: [PATCH] crfs: make it a plugin for fuse-overlayfs it requires this change in fuse-overlays: https://github.com/containers/fuse-overlayfs/pull/79 the crfs plugin can be loaded using: -oplugins=crfs.so,lowerdir=//crfs/DATA/lower where DATA is the base64 representation of a resource as: file:///path/to/layer.stargz or: https://www.example.com/layer.stargz Signed-off-by: Giuseppe Scrivano --- crfs.go | 1139 +++++++---------------------------------------- plugin/Makefile | 2 + plugin/crfs.c | 114 +++++ plugin/crfs.so | Bin 0 -> 17272 bytes 4 files changed, 284 insertions(+), 971 deletions(-) create mode 100644 plugin/Makefile create mode 100644 plugin/crfs.c create mode 100755 plugin/crfs.so diff --git a/crfs.go b/crfs.go index 586bfd5..56fd918 100644 --- a/crfs.go +++ b/crfs.go @@ -2,787 +2,169 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// The crfs command runs the Container Registry Filesystem, providing a read-only -// FUSE filesystem for container images. -// -// For purposes of documentation, we'll assume you've mounted this at /crfs. -// -// Currently (as of 2019-03-21) it only mounts a single layer at the top level. -// In the future it'll have paths like: -// -// /crfs/image/gcr.io/foo-proj/image/latest -// /crfs/layer/gcr.io/foo-proj/image/latest/xxxxxxxxxxxxxx -// -// For mounting a squashed image and a layer, respectively, with the -// host, owner, image name, and version encoded in the path -// components. package main import ( "context" - "encoding/json" - "errors" - "flag" + "encoding/base64" "fmt" "io" - "io/ioutil" - "log" "net/http" "os" - "regexp" - "sort" + "path/filepath" "strconv" "strings" - "sync" - "sync/atomic" "syscall" "time" - "unsafe" - "bazil.org/fuse" - fspkg "bazil.org/fuse/fs" - "cloud.google.com/go/compute/metadata" "github.com/google/crfs/stargz" - namepkg "github.com/google/go-containerregistry/pkg/name" - "github.com/google/go-containerregistry/pkg/v1/google" "golang.org/x/sys/unix" ) -const debug = false - -var ( - fuseDebug = flag.Bool("fuse_debug", false, "enable verbose FUSE debugging") +const ( + LayerModeMetadata = "1" + LayerModeDirectory = "2" + LayerModeFile = "4" ) -func usage() { - fmt.Fprintf(os.Stderr, "Usage of %s:\n", os.Args[0]) - fmt.Fprintf(os.Stderr, " %s (defaults to /crfs)\n", os.Args[0]) - flag.PrintDefaults() -} +func createEntry(reader *stargz.Reader, root *stargz.TOCEntry, fullpath, workdir, target string, mode string) error { + destpath := filepath.Join(target, fullpath) + skipChmod := false -func main() { - flag.Parse() - mntPoint := "/crfs" - if flag.NArg() > 1 { - usage() - os.Exit(2) - } - if flag.NArg() == 1 { - mntPoint = flag.Arg(0) - } - if *fuseDebug { - fuse.Debug = func(msg interface{}) { - log.Printf("fuse debug: %v", msg) + if mode != LayerModeDirectory { + _, err := os.Stat(destpath) + if err == nil { + return nil + } + if !os.IsNotExist(err) { + return err } } + switch root.Type { + case "reg": + fr, err := reader.OpenFile(fullpath) + if err != nil { + return err + } - log.Printf("crfs: mounting") - c, err := fuse.Mount(mntPoint, fuse.FSName("crfs"), fuse.Subtype("crfs"), fuse.ReadOnly(), fuse.AllowOther()) - if err != nil { - log.Fatal(err) - } - defer c.Close() - defer fuse.Unmount(mntPoint) - - log.Printf("crfs: serving") - fs := new(FS) - err = fspkg.Serve(c, fs) - if err != nil { - log.Fatal(err) - } - - // check if the mount process has an error to report - <-c.Ready - if err := c.MountError; err != nil { - log.Fatal(err) - } -} - -// FS is the CRFS filesystem. -// It implements https://godoc.org/bazil.org/fuse/fs#FS -type FS struct { - // TODO: options, probably. logger, etc. -} - -// Root returns the root filesystem node for the CRFS filesystem. -// See https://godoc.org/bazil.org/fuse/fs#FS -func (fs *FS) Root() (fspkg.Node, error) { - return &rootNode{ - fs: fs, - dirEnts: dirEnts{initChildren: func(de *dirEnts) { - de.m["layers"] = &dirEnt{ - dtype: fuse.DT_Dir, - lookupNode: func(inode uint64) (fspkg.Node, error) { - return newLayersRoot(fs, inode), nil - }, - } - de.m["images"] = &dirEnt{ - dtype: fuse.DT_Dir, - lookupNode: func(inode uint64) (fspkg.Node, error) { - return &imagesRoot{fs: fs, inode: inode}, nil - }, - } - de.m["README-crfs.txt"] = &dirEnt{ - dtype: fuse.DT_File, - lookupNode: func(inode uint64) (fspkg.Node, error) { - return &staticFile{ - inode: inode, - contents: "This is CRFS. See https://github.com/google/crfs.\n", - }, nil - }, - } - }}, - }, nil -} - -// imagesOfHost returns the images for the given registry host and -// owner (e.g. GCP project name). -// -// Note that this is gcr.io specific as there's no way in the Registry -// protocol to do this. So this won't work for index.docker.io. We'll -// need to do something else there. -// TODO: something else for docker hub. -func (fs *FS) imagesOfHost(ctx context.Context, host, owner string) (imageNames []string, err error) { - req, err := http.NewRequest("GET", "https://"+host+"/v2/"+owner+"/tags/list", nil) - if err != nil { - return nil, err - } - // TODO: auth. This works for public stuff so far, though. - req = req.WithContext(ctx) - res, err := http.DefaultClient.Do(req) - if err != nil { - return nil, err - } - defer res.Body.Close() - if res.StatusCode != 200 { - return nil, errors.New(res.Status) - } - var resj struct { - Images []string `json:"child"` - } - if err := json.NewDecoder(res.Body).Decode(&resj); err != nil { - return nil, err - } - sort.Strings(resj.Images) - return resj.Images, nil -} - -type manifest struct { - SchemaVersion int `json:"schemaVersion"` - MediaType string `json:"mediaType"` - Config *blobRef `json:"config"` - Layers []*blobRef `json:"layers"` -} - -type blobRef struct { - Size int64 `json:"size"` - MediaType string `json:"mediaType"` - Digest string `json:"digest"` -} - -func (fs *FS) getManifest(ctx context.Context, host, owner, image, ref string) (*manifest, error) { - urlStr := "https://" + host + "/v2/" + owner + "/" + image + "/manifests/" + ref - req, err := http.NewRequest("GET", urlStr, nil) - if err != nil { - return nil, err - } - // TODO: auth. This works for public stuff so far, though. - req = req.WithContext(ctx) - req.Header.Set("Accept", "*") // application/vnd.docker.distribution.manifest.v2+json - res, err := http.DefaultClient.Do(req) - if err != nil { - return nil, err - } - defer res.Body.Close() - if res.StatusCode != 200 { - slurp, _ := ioutil.ReadAll(res.Body) - return nil, fmt.Errorf("non-200 for %q: %v, %q", urlStr, res.Status, slurp) - } - resj := new(manifest) - if err := json.NewDecoder(res.Body).Decode(resj); err != nil { - return nil, err - } - return resj, nil -} - -func (fs *FS) getConfig(ctx context.Context, host, owner, image, ref string) (string, error) { - urlStr := "https://" + host + "/v2/" + owner + "/" + image + "/blobs/" + ref - req, err := http.NewRequest("GET", urlStr, nil) - if err != nil { - return "", err - } - req = req.WithContext(ctx) - res, err := http.DefaultClient.Do(req) - if err != nil { - return "", err - } - defer res.Body.Close() - if res.StatusCode != 200 { - slurp, _ := ioutil.ReadAll(res.Body) - return "", fmt.Errorf("non-200 for %q: %v, %q", urlStr, res.Status, slurp) - } - slurp, err := ioutil.ReadAll(res.Body) - return string(slurp), err -} - -type dirEnt struct { - lazyInode - dtype fuse.DirentType - lookupNode func(inode uint64) (fspkg.Node, error) -} - -type dirEnts struct { - initOnce sync.Once - initChildren func(*dirEnts) - mu sync.Mutex - m map[string]*dirEnt -} - -func (de *dirEnts) Lookup(ctx context.Context, name string) (fspkg.Node, error) { - de.condInit() - de.mu.Lock() - defer de.mu.Unlock() - e, ok := de.m[name] - if !ok { - log.Printf("returning ENOENT for name %q", name) - return nil, fuse.ENOENT - } - if e.lookupNode == nil { - log.Printf("node %q has no lookupNode defined", name) - return nil, fuse.ENOENT - } - return e.lookupNode(e.inode()) -} - -func (de *dirEnts) ReadDirAll(ctx context.Context) (ents []fuse.Dirent, err error) { - de.condInit() - de.mu.Lock() - defer de.mu.Unlock() - ents = make([]fuse.Dirent, 0, len(de.m)) - for name, e := range de.m { - ents = append(ents, fuse.Dirent{ - Name: name, - Inode: e.inode(), - Type: e.dtype, - }) - } - sort.Slice(ents, func(i, j int) bool { return ents[i].Name < ents[j].Name }) - return ents, nil -} - -func (de *dirEnts) condInit() { de.initOnce.Do(de.doInit) } -func (de *dirEnts) doInit() { - de.m = map[string]*dirEnt{} - if de.initChildren != nil { - de.initChildren(de) - } -} - -// atomicInodeIncr holds the most previously allocate global inode number. -// It should only be accessed/incremented with sync/atomic. -var atomicInodeIncr uint32 - -// lazyInode is a lazily-allocated inode number. -// -// We only use 32 bits out of 64 to leave room for overlayfs to play -// games with the upper bits. TODO: maybe that's not necessary. -type lazyInode struct{ v uint32 } + fw, err := os.OpenFile(destpath, os.O_RDWR|os.O_CREATE|os.O_TRUNC, os.FileMode(root.Mode)) + if err != nil { + return err + } + defer fw.Close() -func (si *lazyInode) inode() uint64 { - for { - v := atomic.LoadUint32(&si.v) - if v != 0 { - return uint64(v) + _, err = io.Copy(fw, fr) + if err != nil { + return err } - v = atomic.AddUint32(&atomicInodeIncr, 1) - if atomic.CompareAndSwapUint32(&si.v, 0, v) { - return uint64(v) + case "dir": + err := os.Mkdir(destpath, os.FileMode(root.Mode)) + if err != nil && !os.IsExist(err) { + return err } - } -} - -// childInodeNumberCache is a temporary, lazily solution to having -// stable inode numbers in node types where we haven't yet pushed it -// down properly. This map grows forever (which is bad) and maps the -// tuple (parent directory inode, child name string) to the child's -// inode number. Its map key type is inodeAndString -var childInodeNumberCache sync.Map - -type inodeAndString struct { - inode uint64 - childName string -} - -func getOrMakeChildInode(inode uint64, childName string) uint64 { - key := inodeAndString{inode, childName} - if v, ok := childInodeNumberCache.Load(key); ok { - log.Printf("re-using inode %v/%q = %v", inode, childName, v) - return v.(uint64) - } - actual, loaded := childInodeNumberCache.LoadOrStore(key, uint64(atomic.AddUint32(&atomicInodeIncr, 1))) - if loaded { - log.Printf("race lost creating inode %v/%q = %v", inode, childName, actual) - } else { - log.Printf("created inode %v/%q = %v", inode, childName, actual) - } - return actual.(uint64) -} - -// rootNode is the contents of /crfs. -// Children include: -// layers/ -- individual layers; directories by hostname/user/layer -// images/ -- merged layers; directories by hostname/user/layer -// README-crfs.txt -type rootNode struct { - fs *FS - dirEnts - lazyInode -} - -func (n *rootNode) Attr(ctx context.Context, a *fuse.Attr) error { - setDirAttr(a) - a.Inode = n.inode() - a.Valid = 30 * 24 * time.Hour - return nil -} - -func setDirAttr(a *fuse.Attr) { - a.Mode = 0755 | os.ModeDir - // TODO: more? -} - -// layersRoot is the contents of /crfs/layers/ -// -// Its children are hostnames (such as "gcr.io"). -// -// A special directory, "local", permits walking into stargz files on -// disk, local to the directory where crfs is running. This is useful for -// debugging. -type layersRoot struct { - fs *FS - inode uint64 - dirEnts -} - -func newLayersRoot(fs *FS, inode uint64) *layersRoot { - lr := &layersRoot{fs: fs, inode: inode} - lr.dirEnts.initChildren = func(de *dirEnts) { - de.m["local"] = &dirEnt{ - dtype: fuse.DT_Dir, - lookupNode: func(inode uint64) (fspkg.Node, error) { - return &layerDebugRoot{fs: fs, inode: inode}, nil - }, + if mode == LayerModeDirectory { + root.ForeachChild(func(baseName string, ent *stargz.TOCEntry) bool { + return createEntry(reader, ent, filepath.Join(fullpath, baseName), workdir, target, LayerModeFile) == nil + }) } - for _, n := range commonRegistryHostnames { - lr.addHostDirLocked(n) + skipChmod = true + case "symlink": + err := os.Symlink(root.LinkName, destpath) + if err != nil && !os.IsExist(err) { + return err } - } - return lr -} - -func (n *layersRoot) Attr(ctx context.Context, a *fuse.Attr) error { - setDirAttr(a) - a.Valid = 30 * 24 * time.Hour - a.Inode = n.inode - return nil -} - -var commonRegistryHostnames = []string{ - "gcr.io", - "us.gcr.io", - "eu.gcr.io", - "asia.gcr.io", - "index.docker.io", -} - -func isGCR(host string) bool { - return host == "gcr.io" || strings.HasSuffix(host, ".gcr.io") -} - -func (n *layersRoot) addHostDirLocked(name string) { - n.dirEnts.m[name] = &dirEnt{ - dtype: fuse.DT_Dir, - lookupNode: func(inode uint64) (fspkg.Node, error) { - return newLayerHost(n.fs, name, inode), nil - }, - } -} - -func (n *layersRoot) Lookup(ctx context.Context, name string) (fspkg.Node, error) { - child, err := n.dirEnts.Lookup(ctx, name) - if err != fuse.ENOENT { - return child, err - } - // TODO: validate name looks like a hostname? - n.dirEnts.mu.Lock() - if _, ok := n.dirEnts.m[name]; !ok { - n.addHostDirLocked(name) - } - n.dirEnts.mu.Unlock() - return n.dirEnts.Lookup(ctx, name) -} - -// layerDebugRoot is /crfs/layers/local/ -// Its contents are *.star.gz files in the current directory. -type layerDebugRoot struct { - fs *FS - inode uint64 -} - -func (n *layerDebugRoot) Attr(ctx context.Context, a *fuse.Attr) error { - setDirAttr(a) - a.Inode = n.inode - return nil -} - -func (n *layerDebugRoot) ReadDirAll(ctx context.Context) (ents []fuse.Dirent, err error) { - fis, err := ioutil.ReadDir(".") - for _, fi := range fis { - name := fi.Name() - if !strings.HasSuffix(name, ".stargz") { - continue + skipChmod = true + case "hardlink": + err := os.Link(root.LinkName, destpath) + if err != nil && !os.IsExist(err) { + return err } - // TODO: populate inode number - ents = append(ents, fuse.Dirent{Type: fuse.DT_Dir, Name: name}) - } - return ents, err -} - -func (n *layerDebugRoot) Lookup(ctx context.Context, name string) (fspkg.Node, error) { - f, err := os.Open(name) - if err != nil { - return nil, err - } - fi, err := f.Stat() - if err != nil { - f.Close() - return nil, err - } - r, err := stargz.Open(io.NewSectionReader(f, 0, fi.Size())) - if err != nil { - f.Close() - log.Printf("error opening local stargz: %v", err) - return nil, err - } - root, ok := r.Lookup("") - if !ok { - f.Close() - return nil, errors.New("failed to find root in stargz") - } - return &node{ - fs: n.fs, - te: root, - sr: r, - f: f, - }, nil -} - -// layerHost is, say, /crfs/layers/gcr.io/ (with host == "gcr.io") -// -// Its children are the next level (GCP project, docker hub owner), a layerHostOwner. -type layerHost struct { - fs *FS - host string - inode uint64 - dirEnts -} - -func newLayerHost(fs *FS, host string, inode uint64) *layerHost { - n := &layerHost{ - fs: fs, - host: host, - inode: inode, - } - n.dirEnts = dirEnts{ - initChildren: func(de *dirEnts) { - if !isGCR(n.host) || !metadata.OnGCE() { - return - } - if proj, _ := metadata.ProjectID(); proj != "" { - n.addLayerHostOwnerLocked(proj) - } - }, - } - return n -} - -func (n *layerHost) Attr(ctx context.Context, a *fuse.Attr) error { - setDirAttr(a) - a.Valid = 15 * time.Second - a.Inode = n.inode - return nil -} - -var gcpProjRE = regexp.MustCompile(`^[a-z]([-a-z0-9]*[a-z0-9])?$`) - -func (n *layerHost) addLayerHostOwnerLocked(owner string) { // owner == GCP project on gcr.io - n.dirEnts.m[owner] = &dirEnt{ - dtype: fuse.DT_Dir, - lookupNode: func(inode uint64) (fspkg.Node, error) { - return &layerHostOwner{ - fs: n.fs, - host: n.host, - owner: owner, - inode: inode, - }, nil - }, - } -} - -func (n *layerHost) Lookup(ctx context.Context, name string) (fspkg.Node, error) { - child, err := n.dirEnts.Lookup(ctx, name) - if err != fuse.ENOENT { - return child, err - } - - // For gcr.io hosts, the next level lookup is the GCP project name, - // which we can validate. - if isGCR(n.host) { - proj := name - if len(name) < 6 || len(name) > 30 || !gcpProjRE.MatchString(proj) { - return nil, fuse.ENOENT + case "char": + err := syscall.Mknod(root.LinkName, unix.S_IFCHR, int(unix.Mkdev(uint32(root.DevMajor), uint32(root.DevMinor)))) + if err != nil && !os.IsExist(err) { + return err } - } else { - // TODO: validate index.docker.io next level lookups - } - - n.dirEnts.mu.Lock() - if _, ok := n.dirEnts.m[name]; !ok { - n.addLayerHostOwnerLocked(name) - } - n.dirEnts.mu.Unlock() - - return n.dirEnts.Lookup(ctx, name) -} - -// layerHostOwner is, say, /crfs/layers/gcr.io/foo-proj/ -// -// Its children are image names in that project. -type layerHostOwner struct { - fs *FS - inode uint64 - host string // "gcr.io" - owner string // "foo-proj" (GCP project, docker hub owner) -} - -func (n *layerHostOwner) Attr(ctx context.Context, a *fuse.Attr) error { - setDirAttr(a) - a.Inode = n.inode - return nil -} - -func (n *layerHostOwner) ReadDirAll(ctx context.Context) (ents []fuse.Dirent, err error) { - images, err := n.fs.imagesOfHost(ctx, n.host, n.owner) - if err != nil { - return nil, err - } - for _, name := range images { - ents = append(ents, fuse.Dirent{Type: fuse.DT_Dir, Name: name}) - } - return ents, nil -} - -func (n *layerHostOwner) Lookup(ctx context.Context, imageName string) (fspkg.Node, error) { - // TODO: auth, dockerhub, context - repo, err := namepkg.NewRepository(n.host + "/" + n.owner + "/" + imageName) - if err != nil { - log.Printf("bad name: %v", err) - return nil, err - } - tags, err := google.List(repo) - if err != nil { - log.Printf("list: %v", err) - return nil, err - } - m := map[string]string{} - for k, mi := range tags.Manifests { - for _, tag := range mi.Tags { - m[tag] = k + case "block": + err := syscall.Mknod(root.LinkName, unix.S_IFBLK, int(unix.Mkdev(uint32(root.DevMajor), uint32(root.DevMinor)))) + if err != nil && !os.IsExist(err) { + return err } + case "fifo": + err := syscall.Mknod(root.LinkName, unix.S_IFIFO, int(unix.Mkdev(uint32(root.DevMajor), uint32(root.DevMinor)))) + if err != nil && !os.IsExist(err) { + return err + } + case "chunk": + return nil } - return &layerHostOwnerImage{ - fs: n.fs, - inode: getOrMakeChildInode(n.inode, imageName), - host: n.host, - owner: n.owner, - image: imageName, - tags: tags, - tagsMap: m, - }, nil -} - -// layerHostOwnerImage is, say, /crfs/layers/gcr.io/foo-proj/ubuntu -// -// Its children are specific version of that image (in the form -// "sha256-7de52a7970a2d0a7d355c76e4f0e02b0e6ebc2841f64040062a27313761cc978", -// with hyphens instead of colons, for portability). -// -// And then also symlinks of tags to said ugly directories. -type layerHostOwnerImage struct { - fs *FS - inode uint64 - host string // "gcr.io" - owner string // "foo-proj" (GCP project, docker hub owner) - image string // "ubuntu" - tags *google.Tags - tagsMap map[string]string // "latest" -> "sha256:fooo" -} - -func (n *layerHostOwnerImage) Attr(ctx context.Context, a *fuse.Attr) error { - setDirAttr(a) - a.Inode = n.inode - return nil -} - -func uncolon(s string) string { return strings.Replace(s, ":", "-", 1) } -func recolon(s string) string { return strings.Replace(s, "-", ":", 1) } - -func (n *layerHostOwnerImage) ReadDirAll(ctx context.Context) (ents []fuse.Dirent, err error) { - for k := range n.tags.Manifests { - ents = append(ents, fuse.Dirent{Type: fuse.DT_Dir, Name: uncolon(k)}) - } - for k := range n.tagsMap { - ents = append(ents, fuse.Dirent{Type: fuse.DT_Link, Name: k}) - } - return ents, nil -} -func (n *layerHostOwnerImage) Lookup(ctx context.Context, name string) (fspkg.Node, error) { - if targ, ok := n.tagsMap[name]; ok { - return symlinkNode(uncolon(targ)), nil + if !skipChmod { + if err := os.Chmod(destpath, os.FileMode(root.Mode)); err != nil { + return err + } } - - withColon := recolon(name) - if _, ok := n.tags.Manifests[withColon]; ok { - mf, err := n.fs.getManifest(ctx, n.host, n.owner, n.image, withColon) - if err != nil { - log.Printf("getManifest: %v", err) - return nil, err + if os.Geteuid() == 0 { + if err := os.Chown(destpath, root.Uid, root.Gid); err != nil { + return err } - return &layerHostOwnerImageReference{ - fs: n.fs, - inode: getOrMakeChildInode(n.inode, name), - host: n.host, - owner: n.owner, - image: n.image, - ref: withColon, - mf: mf, - }, nil } - return nil, fuse.ENOENT -} -// layerHostOwnerImageReference is a specific version of an image: -// /crfs/layers/gcr.io/foo-proj/ubuntu/sha256-7de52a7970a2d0a7d355c76e4f0e02b0e6ebc2841f64040062a27313761cc978 -type layerHostOwnerImageReference struct { - fs *FS - inode uint64 - host string // "gcr.io" - owner string // "foo-proj" (GCP project, docker hub owner) - image string // "ubuntu" - ref string // "sha256:xxxx" (with colon) - mf *manifest -} - -func (n *layerHostOwnerImageReference) Attr(ctx context.Context, a *fuse.Attr) error { - setDirAttr(a) - a.Valid = 30 * 24 * time.Hour - a.Inode = n.inode return nil } -func (n *layerHostOwnerImageReference) ReadDirAll(ctx context.Context) (ents []fuse.Dirent, err error) { - for i, layer := range n.mf.Layers { - ents = append(ents, fuse.Dirent{Type: fuse.DT_Dir, Name: uncolon(layer.Digest)}) - ents = append(ents, fuse.Dirent{Type: fuse.DT_Link, Name: strconv.Itoa(i)}) - } - ents = append(ents, fuse.Dirent{Type: fuse.DT_Link, Name: "top"}) - ents = append(ents, fuse.Dirent{Type: fuse.DT_Link, Name: "bottom"}) - ents = append(ents, fuse.Dirent{Type: fuse.DT_File, Name: "config"}) - return -} - -func (n *layerHostOwnerImageReference) Lookup(ctx context.Context, name string) (fspkg.Node, error) { - i, err := strconv.Atoi(name) - if err == nil && i >= 0 && i < len(n.mf.Layers) { - return symlinkNode(uncolon(n.mf.Layers[i].Digest)), nil - } - if name == "top" { - return symlinkNode(fmt.Sprint(len(n.mf.Layers) - 1)), nil - } - if name == "bottom" { - return symlinkNode("0"), nil - } - if name == "config" { - conf, err := n.fs.getConfig(ctx, n.host, n.owner, n.image, n.mf.Config.Digest) +func serve(sockfd *os.File, reader *stargz.Reader, workdir, target string) error { + buffer := make([]byte, 4096) + for { + l, err := sockfd.Read(buffer) if err != nil { - log.Printf("getConfig: %v", err) - return nil, err + return err } - return &staticFile{contents: conf}, nil // TODO: add inode for staticFile - } - refColon := recolon(name) - var layerSize int64 - for _, layer := range n.mf.Layers { - if layer.Digest == refColon { - layerSize = layer.Size - break + parts := strings.Split(string(buffer[:l]), ":") + if len(parts) != 3 { + if _, err := sockfd.Write([]byte("1")); err != nil { + return err + } } - } - if layerSize == 0 { - return nil, fuse.ENOENT - } + parentdir, path, mode := parts[0], parts[1], parts[2] - // Probe the tar.gz. URL to see if it serves a redirect. - // - // gcr.io serves a redirect for the layer tar.gz blobs, but only on GET, not HEAD. - // So add a Range header to bound response size. gcr.io ignores the Range request header, - // but if gcr changes its behavior or we're hitting a different registry implementation, - // then we don't want to download the full thing. - urlStr := "https://" + n.host + "/v2/" + n.owner + "/" + n.image + "/blobs/" + refColon - req, err := http.NewRequest("GET", urlStr, nil) - if err != nil { - return nil, err - } - req = req.WithContext(ctx) - req.Header.Set("Range", "bytes=0-1") - // TODO: auth - res, err := http.DefaultTransport.RoundTrip(req) // NOT DefaultClient; don't want redirects - if err != nil { - return nil, err - } - defer res.Body.Close() - if res.StatusCode >= 400 { - log.Printf("hitting %s: %v", urlStr, res.Status) - return nil, syscall.EIO - } - if redir := res.Header.Get("Location"); redir != "" && res.StatusCode/100 == 3 { - urlStr = redir - } + fullpath := filepath.Join(parentdir, path) - sr := io.NewSectionReader(&urlReaderAt{url: urlStr}, 0, layerSize) - r, err := stargz.Open(sr) - if err != nil { - log.Printf("error opening remote stargz in %s: %v", urlStr, err) - return nil, err - } - root, ok := r.Lookup("") - if !ok { - return nil, errors.New("failed to find root in stargz") + if fullpath == "." { + fullpath = "" + } + root, found := reader.Lookup(fullpath) + if !found { + if _, err := sockfd.Write([]byte("0")); err != nil { + return err + } + continue + } + err = createEntry(reader, root, fullpath, workdir, target, mode) + if err == nil { + if _, err := sockfd.Write([]byte("0")); err != nil { + return err + } + } else if _, err := sockfd.Write([]byte("1")); err != nil { + return err + } } - return &node{ - fs: n.fs, - te: root, - sr: r, - }, nil + return nil } type urlReaderAt struct { - url string + url string + cache []byte + cacheRange string } func (r *urlReaderAt) ReadAt(p []byte, off int64) (n int, err error) { + rangeVal := fmt.Sprintf("bytes=%d-%d", off, off+int64(len(p))-1) + + if rangeVal == r.cacheRange { + return copy(p, r.cache), nil + } + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) defer cancel() req, err := http.NewRequest("GET", r.url, nil) @@ -790,282 +172,97 @@ func (r *urlReaderAt) ReadAt(p []byte, off int64) (n int, err error) { return 0, err } req = req.WithContext(ctx) - rangeVal := fmt.Sprintf("bytes=%d-%d", off, off+int64(len(p))-1) req.Header.Set("Range", rangeVal) - log.Printf("Fetching %s (%d at %d) of %s ...\n", rangeVal, len(p), off, r.url) + // TODO: auth res, err := http.DefaultTransport.RoundTrip(req) // NOT DefaultClient; don't want redirects if err != nil { - log.Printf("range read of %s: %v", r.url, err) return 0, err } defer res.Body.Close() if res.StatusCode != http.StatusPartialContent { - log.Printf("range read of %s: %v", r.url, res.Status) return 0, err } - return io.ReadFull(res.Body, p) -} - -type symlinkNode string // underlying is target - -func (s symlinkNode) Attr(ctx context.Context, a *fuse.Attr) error { - a.Mode = os.ModeSymlink | 0644 - // TODO: inode - return nil -} -func (s symlinkNode) Readlink(ctx context.Context, req *fuse.ReadlinkRequest) (string, error) { - return string(s), nil -} - -// imagesRoot is the contents of /crfs/images/ -// Its children are hostnames (such as "gcr.io"). -type imagesRoot struct { - fs *FS - inode uint64 -} - -func (n *imagesRoot) Attr(ctx context.Context, a *fuse.Attr) error { - setDirAttr(a) - a.Valid = 30 * 24 * time.Hour - a.Inode = n.inode - return nil -} - -func (n *imagesRoot) ReadDirAll(ctx context.Context) (ents []fuse.Dirent, err error) { - for _, n := range commonRegistryHostnames { - ents = append(ents, fuse.Dirent{Type: fuse.DT_Dir, Name: n}) + n, err = io.ReadFull(res.Body, p) + if err == nil { + r.cache = p[:] + r.cacheRange = rangeVal } - return -} - -type staticFile struct { - contents string - inode uint64 + return n, err } -func (f *staticFile) Attr(ctx context.Context, a *fuse.Attr) error { - a.Mode = 0644 - a.Inode = f.inode - a.Size = uint64(len(f.contents)) - a.Blocks = blocksOf(a.Size) - return nil -} +func openLayer(data string) (*io.SectionReader, error) { + if strings.HasPrefix(data, "file://") { + path := data[len("file://"):] + f, err := os.Open(path) + if err != nil { + return nil, err + } -func (f *staticFile) Read(ctx context.Context, req *fuse.ReadRequest, resp *fuse.ReadResponse) error { - if req.Offset < 0 { - return syscall.EINVAL - } - if req.Offset > int64(len(f.contents)) { - resp.Data = nil - return nil - } - bufSize := int64(req.Size) - remain := int64(len(f.contents)) - req.Offset - if bufSize > remain { - bufSize = remain + fi, err := f.Stat() + if err != nil { + return nil, err + } + return io.NewSectionReader(f, 0, fi.Size()), nil } - resp.Data = make([]byte, bufSize) - n := copy(resp.Data, f.contents[req.Offset:]) - resp.Data = resp.Data[:n] // redundant, but for clarity - return nil -} - -func inodeOfEnt(ent *stargz.TOCEntry) uint64 { - return uint64(uintptr(unsafe.Pointer(ent))) -} + if strings.HasPrefix(data, "https://") { + res, err := http.Head(data) + if err != nil { + return nil, err + } + if res.ContentLength == 0 { + return nil, fmt.Errorf("invalid Content-Length for %s", data) + } -func direntType(ent *stargz.TOCEntry) fuse.DirentType { - switch ent.Type { - case "dir": - return fuse.DT_Dir - case "reg": - return fuse.DT_File - case "symlink": - return fuse.DT_Link - case "block": - return fuse.DT_Block - case "char": - return fuse.DT_Char - case "fifo": - return fuse.DT_FIFO + return io.NewSectionReader(&urlReaderAt{url: data}, 0, res.ContentLength), nil } - return fuse.DT_Unknown + return nil, fmt.Errorf("source %s is not supported", data) } -// node is a CRFS node in the FUSE filesystem. -// See https://godoc.org/bazil.org/fuse/fs#Node -type node struct { - fs *FS - te *stargz.TOCEntry - sr *stargz.Reader - f *os.File // non-nil if root & in debug mode -} - -var ( - _ fspkg.Node = (*node)(nil) - _ fspkg.NodeStringLookuper = (*node)(nil) - _ fspkg.NodeReadlinker = (*node)(nil) - _ fspkg.NodeOpener = (*node)(nil) - // TODO: implement NodeReleaser and n.f.Close() when n.f is non-nil - - _ fspkg.HandleReadDirAller = (*nodeHandle)(nil) - _ fspkg.HandleReader = (*nodeHandle)(nil) - - _ fspkg.HandleReadDirAller = (*rootNode)(nil) - _ fspkg.NodeStringLookuper = (*rootNode)(nil) -) - -func blocksOf(size uint64) (blocks uint64) { - blocks = size / 512 - if size%512 > 0 { - blocks++ +func main() { + if len(os.Args) < 4 { + fmt.Fprintln(os.Stderr, "wrong number of args") + os.Exit(1) } - return -} -// Attr populates a with the attributes of n. -// See https://godoc.org/bazil.org/fuse/fs#Node -func (n *node) Attr(ctx context.Context, a *fuse.Attr) error { - fi := n.te.Stat() - a.Valid = 30 * 24 * time.Hour - a.Inode = inodeOfEnt(n.te) - a.Size = uint64(fi.Size()) - a.Blocks = blocksOf(a.Size) - a.Mtime = fi.ModTime() - a.Mode = fi.Mode() - a.Uid = uint32(n.te.Uid) - a.Gid = uint32(n.te.Gid) - a.Rdev = uint32(unix.Mkdev(uint32(n.te.DevMajor), uint32(n.te.DevMinor))) - a.Nlink = 1 // TODO: get this from te once hardlinks are more supported - if debug { - log.Printf("attr of %s: %s", n.te.Name, *a) + dataB, err := base64.StdEncoding.DecodeString(os.Args[1]) + if err != nil { + fmt.Fprintln(os.Stderr, "invalid data %s: %v", os.Args[1], err) + os.Exit(1) } - return nil -} - -// ReadDirAll returns all directory entries in the directory node n. -// -// https://godoc.org/bazil.org/fuse/fs#HandleReadDirAller -func (h *nodeHandle) ReadDirAll(ctx context.Context) (ents []fuse.Dirent, err error) { - n := h.n - n.te.ForeachChild(func(baseName string, ent *stargz.TOCEntry) bool { - ents = append(ents, fuse.Dirent{ - Inode: inodeOfEnt(ent), - Type: direntType(ent), - Name: baseName, - }) - return true - }) - sort.Slice(ents, func(i, j int) bool { return ents[i].Name < ents[j].Name }) - return ents, nil -} + workdir := os.Args[2] + target := os.Args[3] -// Lookup looks up a child entry of the directory node n. -// -// See https://godoc.org/bazil.org/fuse/fs#NodeStringLookuper -func (n *node) Lookup(ctx context.Context, name string) (fspkg.Node, error) { - e, ok := n.te.LookupChild(name) - if !ok { - return nil, fuse.ENOENT - } - return &node{n.fs, e, n.sr, nil}, nil -} + data := string(dataB) -// Readlink reads the target of a symlink. -// -// See https://godoc.org/bazil.org/fuse/fs#NodeReadlinker -func (n *node) Readlink(ctx context.Context, req *fuse.ReadlinkRequest) (string, error) { - if n.te.Type != "symlink" { - return "", syscall.EINVAL + if os.Getenv("_SOCKFD") == "" { + fmt.Fprintln(os.Stderr, "_SOCKFD not specified") + os.Exit(1) } - return n.te.LinkName, nil -} -func (n *node) Open(ctx context.Context, req *fuse.OpenRequest, resp *fuse.OpenResponse) (fspkg.Handle, error) { - h := &nodeHandle{ - n: n, - isDir: req.Dir, - } - resp.Handle = h.HandleID() - if !req.Dir { - var err error - h.sr, err = n.sr.OpenFile(n.te.Name) - if err != nil { - return nil, err - } + sockfd, err := strconv.Atoi(os.Getenv("_SOCKFD")) + if err != nil { + fmt.Fprintln(os.Stderr, "invalid _SOCKFD") + os.Exit(1) } - return h, nil -} - -// nodeHandle is a node that's been opened (opendir or for read). -type nodeHandle struct { - n *node - isDir bool - sr *io.SectionReader // of file bytes - - mu sync.Mutex - lastChunkOff int64 - lastChunkSize int - lastChunk []byte -} -func (h *nodeHandle) HandleID() fuse.HandleID { - return fuse.HandleID(uintptr(unsafe.Pointer(h))) -} + sock := os.NewFile(uintptr(sockfd), "SOCK FD") -func (h *nodeHandle) chunkData(offset int64, size int) ([]byte, error) { - h.mu.Lock() - if h.lastChunkOff == offset && h.lastChunkSize == size { - defer h.mu.Unlock() - if debug { - log.Printf("cache HIT, chunk off=%d/size=%d", offset, size) - } - return h.lastChunk, nil + sr, err := openLayer(data) + if err != nil { + fmt.Fprintf(os.Stderr, "cannot open source %s: %v\n", data, err) + os.Exit(1) } - h.mu.Unlock() - log.Printf("reading chunk for offset=%d, size=%d", offset, size) - buf := make([]byte, size) - n, err := h.sr.ReadAt(buf, offset) - log.Printf("... ReadAt = %v, %v", n, err) - if err == nil { - h.mu.Lock() - h.lastChunkOff = offset - h.lastChunkSize = size - h.lastChunk = buf - h.mu.Unlock() + r, err := stargz.Open(sr) + if err != nil { + fmt.Fprintf(os.Stderr, "cannot open stargz file %v: %v\n", data, err) + os.Exit(1) } - return buf, err -} - -// See https://godoc.org/bazil.org/fuse/fs#HandleReader -func (h *nodeHandle) Read(ctx context.Context, req *fuse.ReadRequest, resp *fuse.ReadResponse) error { - n := h.n - resp.Data = make([]byte, req.Size) - nr := 0 - offset := req.Offset - for nr < req.Size { - ce, ok := n.sr.ChunkEntryForOffset(n.te.Name, offset+int64(nr)) - if !ok { - break - } - if debug { - log.Printf("need chunk data for %q at %d (size=%d, for chunk from log %d-%d (%d), phys %d-%d (%d)) ...", - n.te.Name, req.Offset, req.Size, ce.ChunkOffset, ce.ChunkOffset+ce.ChunkSize, ce.ChunkSize, ce.Offset, ce.NextOffset(), ce.NextOffset()-ce.Offset) - } - chunkData, err := h.chunkData(ce.ChunkOffset, int(ce.ChunkSize)) - if err != nil { - return err - } - n := copy(resp.Data[nr:], chunkData) - nr += n + if err := serve(sock, r, workdir, target); err != nil { + fmt.Fprintln(os.Stderr, "error: %v", err) + os.Exit(1) } - resp.Data = resp.Data[:nr] - if debug { - log.Printf("Read response: size=%d @ %d, read %d", req.Size, req.Offset, nr) - } - return nil } diff --git a/plugin/Makefile b/plugin/Makefile new file mode 100644 index 0000000..9c6685e --- /dev/null +++ b/plugin/Makefile @@ -0,0 +1,2 @@ +crfs.so: crfs.c + $(CC) $< -fPIC -shared -o $@ diff --git a/plugin/crfs.c b/plugin/crfs.c new file mode 100644 index 0000000..1f091c4 --- /dev/null +++ b/plugin/crfs.c @@ -0,0 +1,114 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +enum + { + LAYER_MODE_METADATA = 1 << 0, + LAYER_MODE_DIRECTORY = 1 << 1, + LAYER_MODE_FILE = 1 << 2, + }; + +struct context +{ + int dirfd; + int workdirfd; + int pid; + int sockfd; +}; + +int +plugin_version () +{ + return 1; +} + +const char * +plugin_name () +{ + return "crfs"; +} + +void * +plugin_init (const char *data, const char *workdir, int workdirfd, const char *target, int dirfd) +{ + struct context *ctx = NULL; + int sockfd[2]; + pid_t pid; + + if (socketpair (AF_LOCAL, SOCK_DGRAM, 0, sockfd) < 0) + return NULL; + + pid = fork (); + if (pid < 0) + { + close (sockfd[0]); + close (sockfd[1]); + return NULL; + } + if (pid == 0) + { + char f[12]; + + close (sockfd[0]); + sprintf (f, "%d", sockfd[1]); + setenv ("_SOCKFD", f, 1); + prctl (PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0); + execlp ("crfs", "crfs", data, workdir, target, NULL); + fprintf (stderr, "cannot exec crfs: %s\n", strerror (errno)); + fclose (stderr); + _exit (EXIT_FAILURE); + } + + close (sockfd[1]); + + ctx = malloc (sizeof *ctx); + ctx->dirfd = dirfd; + ctx->workdirfd = workdirfd; + ctx->pid = pid; + ctx->sockfd = sockfd[0]; + + return ctx; +} + +int +plugin_fetch (void *opaque, const char *parentdir, const char *path, int mode) +{ + struct context *ctx = opaque; + char buffer[PATH_MAX*2+16]; + int written; + int len = sprintf (buffer, "%s:%s:%d", parentdir, path, mode); + + do + written = write (ctx->sockfd, buffer, len); + while (written < 0 && errno == EINTR); + if (written < 0) + return -1; + + do + len = read (ctx->sockfd, buffer, sizeof buffer); + while (len < 0 && errno == EINTR); + return 0; + if (len > 0 && buffer[0] == '0') + return 0; + + return -1; +} + +int +plugin_release (void *opaque) +{ + struct context *ctx = opaque; + + close (ctx->sockfd); + free (ctx); +} diff --git a/plugin/crfs.so b/plugin/crfs.so new file mode 100755 index 0000000000000000000000000000000000000000..a7585ee16550739650e60b623f5ef12bd79b3a2a GIT binary patch literal 17272 zcmeHP4Qw3M5q{^N#Kb0cLZA+&WE;gap>Wuw4(2Ctj&sh2#^8py3bb@{zB|WP&UfZ+ z4R!?7)C4%Lt}Fy71td&WA%uj4T0ui3ib+s@5KR+Qz@-G-LeO2YS_3HrlLmi*he{5iFeXhsZpjZM>{Ff&=`s)&w0896`(M++Ra0O7f(W?W6SNe#4HdA{}xuxl-ZY_IqaLMzhZF_91o&2Tsgia&`T*C8?NdUKdH&B{ppVH?LPU*id!0A zD~Z%+T3$ydP<{p+XovnS7B$-KzM92|eFY9IgHJu!e&2W3E!_-Mda*b7)U4~I1n!*{8Whwah&j9ou+`d2^WU3y*C~+l4c-b8ip`7 zZfQ3{;Y4_QBx!~dTiRQr@mP3Epeq_yvceK0I1n(pBe6gNJPnAt6oVKW@NN%SRxW>ka+!og@CxEW3) zVsRrH4+czZG-)OPi6@`}?m`P>+#QU@li+K30D9;RM8SX^=nk8~9&vLbVunQ`90)O< z2uH&Ku<1^O!y+2#3VM@q?;5dT)5dkJ#)aMsy-gUb68WmY^Xxx`Iwc->8>oNyUhs%T zaBcW~)PH8=6tt?N>&5S9ey>i`^{=4!Fyl`VUgN--pSVI@P<#jBJBiDr1?PJTWf=?3 zz9Bws!Rb~(Sj<@P(fqi4VTq4gaO-v1Wx?qcA@$vp8v!>0ZUo#2xDoh&j=)FN zXMd&-PgUsY@}n0Bq3;|wOL9l_;TJ3R$*|;_{seHY{#`g%FY`eS`5u&ICv#Bh-#{Lx zHrXSRe+7A*(qzXa{}S>z<;p%K`Mt>F)FwM7`8~+vlqNeW`ClN9Q*K<5VTSaTq zEA|lu{Z&%GX*rTrC-u~{{=#3b&|jD-)jj+5qbJQJ&_FviP?4MLu3ol&)*i3d(Awi5 z7X9Do)Q8uei&Z^!+^mA8FNbAYW+Ql+3BhT9`EwxjTn~Nc`$zli%^>Uqo85TXuHT1M z7{CV*1Y(A#e0u8E2|d++B-Qz{o@yUopYk8n5BPEU3WN1e>8Tg?VExy4rJu)kGB-jv zc8(7&*o}C0D`1(!0HypVhHpI~2G36Wr&0$pbpWOP(@3#G$ z=ygs*Uz7Ss({7TNF6Lj=+ z`?#L&Jff%l6D?A8W9p!uI;yAkLRWu=djnd02Z%EHJ(y&cpgRJ*{sT4uis|xaA=r8v zbej&K343>)?4kT2l7dMsASy&}{euUO?LCioQAX;g{4?OqOuF-6szy~_PIZ*2+KE(0 zMY?kenN&v=l0)i}-9F{hu;b?@-bSCEJ|fBe2Z5wUUPLCnQ;DGErxh@QQaydY)E(J5 z30w=@fg?>ztz&o%{DkY4_Mgy$7^a!D4{om!)YemnVMt9OnzACVu0DWv4 zLY|32ee}>ik$;cn-&_z|x402-Bj85Bjer{gHv(=1+z7Z4a3f%gKrqpr6!jrtTyu5n zx7yZ=U?3KYn;L#q(olAZR-arT>XVn?fAI0lQ&)N^etX6R`t)COxvfACeUQry0sS4& zF`$10Iu7)%k8-(5pxq~OIsW*;e}nhz*epB)HJ-Yv`4ywEj#~kGXTdQFZECo#TUgV! zu=dL8%9|^O#1&^=dhY5K%K?=-upAaG(1-k9)3UJU&XU$s=e0pYBnLOec_7C{yUEFVgz_Uopb*jF2S(i-aIVubx0N33@6uCfE=U+-BOm* z{<}Kjd5_td%#$3~QCL|1x0AUz^4suxA{5@^rcs6Ww(TN0??wAM`NwPl>ng zMWiPGtAzKlJ3C!N*SU@8cZhZqy@}|ZL?0mf7|~~l?kD;h(f5dcOmrS??mL6%IYieG z-O$>4iMA4Un6J_<_BMJOwF?^>FK%dRSgozx91dxEz*PA8jjQCnB%6hxZ)jQJSsiGc z&10bU@H#qmc=PEQ#KaTJ1??> zFNHtSaGruyR0=y(`MI>?rGlSFJ6A() z@v!3ygjNu*%-E4BvBQ;)9$WRlgf0 zu-~4a4VEv23gUhA%rO2n!uj`z@r$71vG|GoVQGT{c974BTQA_)KL5UYSDoXJ4&H;o&=o|_!Z^%2%G#8Djvv4a9-bWiO;>hhos@$`|VAs zfBKy3m?7MG-QR(MmEfZ++=!c3a|-QC`xm`$rO z%w7ZXRAS*IB({X&#`b8uD-bn8W;~HJ0{sIb81Lfl8e)|BI45MvxOS|9jU$q{Sf3QXDEEzX?04r^LuWQ+4T;0}ojem==rDfeFzacYfd_HR; zjWdtOA^oE4Bh2WrtE)sIJ0{b6?2t+cvSXP4V@D`yDW43a@`>y^j$|n^aU?U@O=K36 z15)Wx)_f>Vl~M^yc6CaNvV$^Dt6(mcN|>^1bF!2jHEda1b{QsQ*%2D7N}^IpS9S&A zO%C>&fi9qCLeUTysOTt_SH0veF+1bQPu z@Fxxp!^taq>q;ht7oN{vc;=L^a!lTUX(l3F{U$uq|AB(*OI)MDYeg2Gw=!KpIFmB2 z5XD`yAHy1q?YB{dX*22Z`V8mk)`D_;H?#dNsxZBqcnw=1A}W!w&!^@(>0{b{<9ua%%{XPEAu@6W~!0B(|>|x9wo}yp6BUI$4Qa< z=lJpXV<1QSb)?7hd8WKhM|;etbJ`C8hB2Y#^z1x_<}>{F2T+j(`?40i39a_LKFCu? zzuzQi6(Wce+e@!n&5UWf7jCt#cc&T(VE`97EcTWkNkUYno` zw^H^pdt!U0#~k*;N4gD)juFd*?U=p?0<5$DV?L!hM)s_yP~|Jvk-iTH`cLWz$-?%q z``uP39MT2_C*a_}u_4dv$(N}?jSWx;h2y9=)i>X}j3J$FRNN@N81V0o=nIdB-#_^G hJ082TQ0Jj$Mc;Qc?r=xWWeeHgaH+Ci;4pBo;-AxwOgI1l literal 0 HcmV?d00001