From 67a5a699ef5b54dc9d6370e25395a0be9754ab97 Mon Sep 17 00:00:00 2001 From: Will Scott Date: Wed, 5 Jun 2024 11:56:28 +0200 Subject: [PATCH 1/2] Re-factor cmd functions to library --- cmd/car/extract.go | 266 +-------------------------------------- cmd/car/filter.go | 92 +------------- cmd/car/get.go | 2 +- cmd/car/inspect.go | 116 +---------------- cmd/car/lib/extract.go | 276 +++++++++++++++++++++++++++++++++++++++++ cmd/car/lib/filter.go | 100 +++++++++++++++ cmd/car/lib/inspect.go | 167 +++++++++++++++++++++++++ cmd/car/lib/root.go | 25 ++++ cmd/car/lib/verify.go | 107 ++++++++++++++++ cmd/car/root.go | 15 +-- cmd/car/verify.go | 101 +-------------- 11 files changed, 689 insertions(+), 578 deletions(-) create mode 100644 cmd/car/lib/extract.go create mode 100644 cmd/car/lib/filter.go create mode 100644 cmd/car/lib/inspect.go create mode 100644 cmd/car/lib/root.go create mode 100644 cmd/car/lib/verify.go diff --git a/cmd/car/extract.go b/cmd/car/extract.go index a7bdff48..91c0c033 100644 --- a/cmd/car/extract.go +++ b/cmd/car/extract.go @@ -2,26 +2,18 @@ package main import ( "context" - "errors" "fmt" "io" "os" - "path" - "path/filepath" "runtime" "strings" "sync" "github.com/ipfs/go-cid" - "github.com/ipfs/go-unixfsnode" - "github.com/ipfs/go-unixfsnode/data" - "github.com/ipfs/go-unixfsnode/file" + "github.com/ipld/go-car/cmd/car/lib" "github.com/ipld/go-car/v2" carstorage "github.com/ipld/go-car/v2/storage" - dagpb "github.com/ipld/go-codec-dagpb" - "github.com/ipld/go-ipld-prime" cidlink "github.com/ipld/go-ipld-prime/linking/cid" - basicnode "github.com/ipld/go-ipld-prime/node/basic" "github.com/ipld/go-ipld-prime/storage" "github.com/urfave/cli/v2" ) @@ -86,7 +78,7 @@ func ExtractCar(c *cli.Context) error { var extractedFiles int for _, root := range roots { - count, err := extractRoot(c, &ls, root, outputDir, path) + count, err := lib.ExtractToDir(c.Context, &ls, root, outputDir, path, c.IsSet("verbose"), c.App.ErrWriter) if err != nil { return err } @@ 
-101,260 +93,6 @@ func ExtractCar(c *cli.Context) error { return nil } -func extractRoot(c *cli.Context, ls *ipld.LinkSystem, root cid.Cid, outputDir string, path []string) (int, error) { - if root.Prefix().Codec == cid.Raw { - if c.IsSet("verbose") { - fmt.Fprintf(c.App.ErrWriter, "skipping raw root %s\n", root) - } - return 0, nil - } - - pbn, err := ls.Load(ipld.LinkContext{}, cidlink.Link{Cid: root}, dagpb.Type.PBNode) - if err != nil { - return 0, err - } - pbnode := pbn.(dagpb.PBNode) - - ufn, err := unixfsnode.Reify(ipld.LinkContext{}, pbnode, ls) - if err != nil { - return 0, err - } - - var outputResolvedDir string - if outputDir != "-" { - outputResolvedDir, err = filepath.EvalSymlinks(outputDir) - if err != nil { - return 0, err - } - if _, err := os.Stat(outputResolvedDir); os.IsNotExist(err) { - if err := os.Mkdir(outputResolvedDir, 0755); err != nil { - return 0, err - } - } - } - - count, err := extractDir(c, ls, ufn, outputResolvedDir, "/", path) - if err != nil { - if !errors.Is(err, ErrNotDir) { - return 0, fmt.Errorf("%s: %w", root, err) - } - - // if it's not a directory, it's a file. 
- ufsData, err := pbnode.LookupByString("Data") - if err != nil { - return 0, err - } - ufsBytes, err := ufsData.AsBytes() - if err != nil { - return 0, err - } - ufsNode, err := data.DecodeUnixFSData(ufsBytes) - if err != nil { - return 0, err - } - var outputName string - if outputDir != "-" { - outputName = filepath.Join(outputResolvedDir, "unknown") - } - if ufsNode.DataType.Int() == data.Data_File || ufsNode.DataType.Int() == data.Data_Raw { - if err := extractFile(c, ls, pbnode, outputName); err != nil { - return 0, err - } - } - return 1, nil - } - - return count, nil -} - -func resolvePath(root, pth string) (string, error) { - rp, err := filepath.Rel("/", pth) - if err != nil { - return "", fmt.Errorf("couldn't check relative-ness of %s: %w", pth, err) - } - joined := path.Join(root, rp) - - basename := path.Dir(joined) - final, err := filepath.EvalSymlinks(basename) - if err != nil { - return "", fmt.Errorf("couldn't eval symlinks in %s: %w", basename, err) - } - if final != path.Clean(basename) { - return "", fmt.Errorf("path attempts to redirect through symlinks") - } - return joined, nil -} - -func extractDir(c *cli.Context, ls *ipld.LinkSystem, n ipld.Node, outputRoot, outputPath string, matchPath []string) (int, error) { - if outputRoot != "" { - dirPath, err := resolvePath(outputRoot, outputPath) - if err != nil { - return 0, err - } - // make the directory. 
- if err := os.MkdirAll(dirPath, 0755); err != nil { - return 0, err - } - } - - if n.Kind() != ipld.Kind_Map { - return 0, ErrNotDir - } - - subPath := matchPath - if len(matchPath) > 0 { - subPath = matchPath[1:] - } - - extractElement := func(name string, n ipld.Node) (int, error) { - var nextRes string - if outputRoot != "" { - var err error - nextRes, err = resolvePath(outputRoot, path.Join(outputPath, name)) - if err != nil { - return 0, err - } - if c.IsSet("verbose") { - fmt.Fprintf(c.App.ErrWriter, "%s\n", nextRes) - } - } - - if n.Kind() != ipld.Kind_Link { - return 0, fmt.Errorf("unexpected map value for %s at %s", name, outputPath) - } - // a directory may be represented as a map of name: if unixADL is applied - vl, err := n.AsLink() - if err != nil { - return 0, err - } - dest, err := ls.Load(ipld.LinkContext{}, vl, basicnode.Prototype.Any) - if err != nil { - if nf, ok := err.(interface{ NotFound() bool }); ok && nf.NotFound() { - fmt.Fprintf(c.App.ErrWriter, "data for entry not found: %s (skipping...)\n", path.Join(outputPath, name)) - return 0, nil - } - return 0, err - } - // degenerate files are handled here. - if dest.Kind() == ipld.Kind_Bytes { - if err := extractFile(c, ls, dest, nextRes); err != nil { - return 0, err - } - return 1, nil - } - - // dir / pbnode - pbb := dagpb.Type.PBNode.NewBuilder() - if err := pbb.AssignNode(dest); err != nil { - return 0, err - } - pbnode := pbb.Build().(dagpb.PBNode) - - // interpret dagpb 'data' as unixfs data and look at type. 
- ufsData, err := pbnode.LookupByString("Data") - if err != nil { - return 0, err - } - ufsBytes, err := ufsData.AsBytes() - if err != nil { - return 0, err - } - ufsNode, err := data.DecodeUnixFSData(ufsBytes) - if err != nil { - return 0, err - } - - switch ufsNode.DataType.Int() { - case data.Data_Directory, data.Data_HAMTShard: - ufn, err := unixfsnode.Reify(ipld.LinkContext{}, pbnode, ls) - if err != nil { - return 0, err - } - return extractDir(c, ls, ufn, outputRoot, path.Join(outputPath, name), subPath) - case data.Data_File, data.Data_Raw: - if err := extractFile(c, ls, pbnode, nextRes); err != nil { - return 0, err - } - return 1, nil - case data.Data_Symlink: - if nextRes == "" { - return 0, fmt.Errorf("cannot extract a symlink to stdout") - } - data := ufsNode.Data.Must().Bytes() - if err := os.Symlink(string(data), nextRes); err != nil { - return 0, err - } - return 1, nil - default: - return 0, fmt.Errorf("unknown unixfs type: %d", ufsNode.DataType.Int()) - } - } - - // specific path segment - if len(matchPath) > 0 { - val, err := n.LookupByString(matchPath[0]) - if err != nil { - return 0, err - } - return extractElement(matchPath[0], val) - } - - if outputPath == "-" && len(matchPath) == 0 { - return 0, fmt.Errorf("cannot extract a directory to stdout, use a path to extract a specific file") - } - - // everything - var count int - var shardSkip int - mi := n.MapIterator() - for !mi.Done() { - key, val, err := mi.Next() - if err != nil { - if nf, ok := err.(interface{ NotFound() bool }); ok && nf.NotFound() { - shardSkip++ - continue - } - return 0, err - } - ks, err := key.AsString() - if err != nil { - return 0, err - } - ecount, err := extractElement(ks, val) - if err != nil { - return 0, err - } - count += ecount - } - if shardSkip > 0 { - fmt.Fprintf(c.App.ErrWriter, "data for entry not found for %d unknown sharded entries (skipped...)\n", shardSkip) - } - return count, nil -} - -func extractFile(c *cli.Context, ls *ipld.LinkSystem, n ipld.Node, 
outputName string) error { - node, err := file.NewUnixFSFile(c.Context, n, ls) - if err != nil { - return err - } - nlr, err := node.AsLargeBytes() - if err != nil { - return err - } - var f *os.File - if outputName == "" { - f = os.Stdout - } else { - f, err = os.Create(outputName) - if err != nil { - return err - } - defer f.Close() - } - _, err = io.Copy(f, nlr) - return err -} - // TODO: dedupe this with lassie, probably into go-unixfsnode func pathSegments(path string) ([]string, error) { segments := strings.Split(path, "/") diff --git a/cmd/car/filter.go b/cmd/car/filter.go index 3c163049..d4c6c1d5 100644 --- a/cmd/car/filter.go +++ b/cmd/car/filter.go @@ -8,8 +8,7 @@ import ( "strings" "github.com/ipfs/go-cid" - carv2 "github.com/ipld/go-car/v2" - "github.com/ipld/go-car/v2/blockstore" + "github.com/ipld/go-car/cmd/car/lib" "github.com/urfave/cli/v2" ) @@ -19,16 +18,7 @@ func FilterCar(c *cli.Context) error { return fmt.Errorf("an output filename must be provided") } - fd, err := os.Open(c.Args().First()) - if err != nil { - return err - } - defer fd.Close() - rd, err := carv2.NewBlockReader(fd) - if err != nil { - return err - } - + var err error // Get the set of CIDs from stdin. inStream := os.Stdin if c.IsSet("cidFile") { @@ -48,83 +38,7 @@ func FilterCar(c *cli.Context) error { fmt.Printf("filtering to %d cids\n", len(cidMap)) } - outRoots := make([]cid.Cid, 0) - for _, r := range rd.Roots { - if matchFilter(c, r, cidMap) { - outRoots = append(outRoots, r) - } - } - - version := c.Int("version") - options := []carv2.Option{} - switch version { - case 1: - options = []carv2.Option{blockstore.WriteAsCarV1(true)} - case 2: - // already the default - default: - return fmt.Errorf("invalid CAR version %d", c.Int("version")) - } - - outPath := c.Args().Get(1) - if !c.Bool("append") { - if _, err := os.Stat(outPath); err == nil || !os.IsNotExist(err) { - // output to an existing file. 
- if err := os.Truncate(outPath, 0); err != nil { - return err - } - } - } else { - if version != 2 { - return fmt.Errorf("can only append to version 2 car files") - } - - // roots will need to be whatever is in the output already. - cv2r, err := carv2.OpenReader(outPath) - if err != nil { - return err - } - if cv2r.Version != 2 { - return fmt.Errorf("can only append to version 2 car files") - } - outRoots, err = cv2r.Roots() - if err != nil { - return err - } - _ = cv2r.Close() - } - - if len(outRoots) == 0 { - fmt.Fprintf(os.Stderr, "warning: no roots defined after filtering\n") - } - - bs, err := blockstore.OpenReadWrite(outPath, outRoots, options...) - if err != nil { - return err - } - - for { - blk, err := rd.Next() - if err != nil { - if err == io.EOF { - break - } - return err - } - if matchFilter(c, blk.Cid(), cidMap) { - if err := bs.Put(c.Context, blk); err != nil { - return err - } - } - } - return bs.Finalize() -} - -func matchFilter(ctx *cli.Context, c cid.Cid, cidMap map[cid.Cid]struct{}) bool { - if _, ok := cidMap[c]; ok { - return !ctx.Bool("inverse") - } - return ctx.Bool("inverse") + return lib.FilterCar(c.Context, c.Args().First(), c.Args().Get(1), cidMap, c.Bool("inverse"), c.Int("version"), c.Bool("append")) } func parseCIDS(r io.Reader) (map[cid.Cid]struct{}, error) { diff --git a/cmd/car/get.go b/cmd/car/get.go index f5d5b1cd..df71f4d6 100644 --- a/cmd/car/get.go +++ b/cmd/car/get.go @@ -199,7 +199,7 @@ func writeCarV2(ctx context.Context, rootCid cid.Cid, output string, bs *blockst return outStore.Finalize() } -func writeCarV1(rootCid cid.Cid, output string, bs *blockstore.ReadOnly, strict bool, sel datamodel.Node, linkVisitOnlyOnce bool) error { +func writeCarV1(rootCid cid.Cid, output string, bs *blockstore.ReadOnly, _ bool, sel datamodel.Node, linkVisitOnlyOnce bool) error { opts := make([]car.Option, 0) if linkVisitOnlyOnce { opts = append(opts, car.TraverseLinksOnlyOnce()) diff --git a/cmd/car/inspect.go b/cmd/car/inspect.go index
f320500c..512cd8ab 100644 --- a/cmd/car/inspect.go +++ b/cmd/car/inspect.go @@ -1,15 +1,10 @@ package main import ( - "bytes" "fmt" - "io" "os" - "sort" - "strings" - carv2 "github.com/ipld/go-car/v2" - "github.com/multiformats/go-multicodec" + "github.com/ipld/go-car/cmd/car/lib" "github.com/urfave/cli/v2" ) @@ -23,115 +18,10 @@ func InspectCar(c *cli.Context) (err error) { } } - rd, err := carv2.NewReader(inStream, carv2.ZeroLengthSectionAsEOF(true)) + rep, err := lib.InspectCar(inStream, c.Bool("full")) if err != nil { return err } - stats, err := rd.Inspect(c.IsSet("full")) - if err != nil { - return err - } - - if stats.Version == 1 && c.IsSet("full") { // check that we've read all the data - got, err := inStream.Read(make([]byte, 1)) // force EOF - if err != nil && err != io.EOF { - return err - } else if got > 0 { - return fmt.Errorf("unexpected data after EOF: %d", got) - } - } - - var v2s string - if stats.Version == 2 { - idx := "(none)" - if stats.IndexCodec != 0 { - idx = stats.IndexCodec.String() - } - var buf bytes.Buffer - stats.Header.Characteristics.WriteTo(&buf) - v2s = fmt.Sprintf(`Characteristics: %x -Data offset: %d -Data (payload) length: %d -Index offset: %d -Index type: %s -`, buf.Bytes(), stats.Header.DataOffset, stats.Header.DataSize, stats.Header.IndexOffset, idx) - } - - var roots strings.Builder - switch len(stats.Roots) { - case 0: - roots.WriteString(" (none)") - case 1: - roots.WriteString(" ") - roots.WriteString(stats.Roots[0].String()) - default: - for _, r := range stats.Roots { - roots.WriteString("\n\t") - roots.WriteString(r.String()) - } - } - - var codecs strings.Builder - { - keys := make([]int, len(stats.CodecCounts)) - i := 0 - for codec := range stats.CodecCounts { - keys[i] = int(codec) - i++ - } - sort.Ints(keys) - for _, code := range keys { - codec := multicodec.Code(code) - codecs.WriteString(fmt.Sprintf("\n\t%s: %d", codec, stats.CodecCounts[codec])) - } - } - - var hashers strings.Builder - { - keys := make([]int, 
len(stats.MhTypeCounts)) - i := 0 - for codec := range stats.MhTypeCounts { - keys[i] = int(codec) - i++ - } - sort.Ints(keys) - for _, code := range keys { - codec := multicodec.Code(code) - hashers.WriteString(fmt.Sprintf("\n\t%s: %d", codec, stats.MhTypeCounts[codec])) - } - } - - rp := "No" - if stats.RootsPresent { - rp = "Yes" - } - - pfmt := `Version: %d -%sRoots:%s -Root blocks present in data: %s -Block count: %d -Min / average / max block length (bytes): %d / %d / %d -Min / average / max CID length (bytes): %d / %d / %d -Block count per codec:%s -CID count per multihash:%s -` - - fmt.Printf( - pfmt, - stats.Version, - v2s, - roots.String(), - rp, - stats.BlockCount, - stats.MinBlockLength, - stats.AvgBlockLength, - stats.MaxBlockLength, - stats.MinCidLength, - stats.AvgCidLength, - stats.MaxCidLength, - codecs.String(), - hashers.String(), - ) - + fmt.Print(rep.String()) return nil } diff --git a/cmd/car/lib/extract.go b/cmd/car/lib/extract.go new file mode 100644 index 00000000..6ab6bebc --- /dev/null +++ b/cmd/car/lib/extract.go @@ -0,0 +1,276 @@ +package lib + +import ( + "context" + "errors" + "fmt" + "io" + "os" + "path" + "path/filepath" + + "github.com/ipfs/go-cid" + "github.com/ipfs/go-unixfsnode" + "github.com/ipfs/go-unixfsnode/data" + "github.com/ipfs/go-unixfsnode/file" + dagpb "github.com/ipld/go-codec-dagpb" + "github.com/ipld/go-ipld-prime" + cidlink "github.com/ipld/go-ipld-prime/linking/cid" + "github.com/ipld/go-ipld-prime/node/basicnode" +) + +var ErrNotDir = fmt.Errorf("not a directory") + +func ExtractToDir(c context.Context, ls *ipld.LinkSystem, root cid.Cid, outputDir string, path []string, verbose bool, logger io.Writer) (int, error) { + if root.Prefix().Codec == cid.Raw { + if verbose { + fmt.Fprintf(logger, "skipping raw root %s\n", root) + } + return 0, nil + } + + pbn, err := ls.Load(ipld.LinkContext{}, cidlink.Link{Cid: root}, dagpb.Type.PBNode) + if err != nil { + return 0, err + } + pbnode := pbn.(dagpb.PBNode) + + ufn, err 
:= unixfsnode.Reify(ipld.LinkContext{}, pbnode, ls) + if err != nil { + return 0, err + } + + var outputResolvedDir string + if outputDir != "-" { + outputResolvedDir, err = filepath.EvalSymlinks(outputDir) + if err != nil { + return 0, err + } + if _, err := os.Stat(outputResolvedDir); os.IsNotExist(err) { + if err := os.Mkdir(outputResolvedDir, 0755); err != nil { + return 0, err + } + } + } + + count, err := extractDir(c, ls, ufn, outputResolvedDir, "/", path, verbose, logger) + if err != nil { + if !errors.Is(err, ErrNotDir) { + return 0, fmt.Errorf("%s: %w", root, err) + } + + // if it's not a directory, it's a file. + ufsData, err := pbnode.LookupByString("Data") + if err != nil { + return 0, err + } + ufsBytes, err := ufsData.AsBytes() + if err != nil { + return 0, err + } + ufsNode, err := data.DecodeUnixFSData(ufsBytes) + if err != nil { + return 0, err + } + var outputName string + if outputDir != "-" { + outputName = filepath.Join(outputResolvedDir, "unknown") + } + if ufsNode.DataType.Int() == data.Data_File || ufsNode.DataType.Int() == data.Data_Raw { + if err := extractFile(c, ls, pbnode, outputName); err != nil { + return 0, err + } + } + return 1, nil + } + + return count, nil +} + +func resolvePath(root, pth string) (string, error) { + rp, err := filepath.Rel("/", pth) + if err != nil { + return "", fmt.Errorf("couldn't check relative-ness of %s: %w", pth, err) + } + joined := path.Join(root, rp) + + basename := path.Dir(joined) + final, err := filepath.EvalSymlinks(basename) + if err != nil { + return "", fmt.Errorf("couldn't eval symlinks in %s: %w", basename, err) + } + if final != path.Clean(basename) { + return "", fmt.Errorf("path attempts to redirect through symlinks") + } + return joined, nil +} + +func extractDir(c context.Context, ls *ipld.LinkSystem, n ipld.Node, outputRoot, outputPath string, matchPath []string, verbose bool, logger io.Writer) (int, error) { + if outputRoot != "" { + dirPath, err := resolvePath(outputRoot, outputPath) + 
if err != nil { + return 0, err + } + // make the directory. + if err := os.MkdirAll(dirPath, 0755); err != nil { + return 0, err + } + } + + if n.Kind() != ipld.Kind_Map { + return 0, ErrNotDir + } + + subPath := matchPath + if len(matchPath) > 0 { + subPath = matchPath[1:] + } + + extractElement := func(name string, n ipld.Node) (int, error) { + var nextRes string + if outputRoot != "" { + var err error + nextRes, err = resolvePath(outputRoot, path.Join(outputPath, name)) + if err != nil { + return 0, err + } + if verbose { + fmt.Fprintf(logger, "%s\n", nextRes) + } + } + + if n.Kind() != ipld.Kind_Link { + return 0, fmt.Errorf("unexpected map value for %s at %s", name, outputPath) + } + // a directory may be represented as a map of name: if unixADL is applied + vl, err := n.AsLink() + if err != nil { + return 0, err + } + dest, err := ls.Load(ipld.LinkContext{}, vl, basicnode.Prototype.Any) + if err != nil { + if nf, ok := err.(interface{ NotFound() bool }); ok && nf.NotFound() { + fmt.Fprintf(logger, "data for entry not found: %s (skipping...)\n", path.Join(outputPath, name)) + return 0, nil + } + return 0, err + } + // degenerate files are handled here. + if dest.Kind() == ipld.Kind_Bytes { + if err := extractFile(c, ls, dest, nextRes); err != nil { + return 0, err + } + return 1, nil + } + + // dir / pbnode + pbb := dagpb.Type.PBNode.NewBuilder() + if err := pbb.AssignNode(dest); err != nil { + return 0, err + } + pbnode := pbb.Build().(dagpb.PBNode) + + // interpret dagpb 'data' as unixfs data and look at type. 
+ ufsData, err := pbnode.LookupByString("Data") + if err != nil { + return 0, err + } + ufsBytes, err := ufsData.AsBytes() + if err != nil { + return 0, err + } + ufsNode, err := data.DecodeUnixFSData(ufsBytes) + if err != nil { + return 0, err + } + + switch ufsNode.DataType.Int() { + case data.Data_Directory, data.Data_HAMTShard: + ufn, err := unixfsnode.Reify(ipld.LinkContext{}, pbnode, ls) + if err != nil { + return 0, err + } + return extractDir(c, ls, ufn, outputRoot, path.Join(outputPath, name), subPath, verbose, logger) + case data.Data_File, data.Data_Raw: + if err := extractFile(c, ls, pbnode, nextRes); err != nil { + return 0, err + } + return 1, nil + case data.Data_Symlink: + if nextRes == "" { + return 0, fmt.Errorf("cannot extract a symlink to stdout") + } + data := ufsNode.Data.Must().Bytes() + if err := os.Symlink(string(data), nextRes); err != nil { + return 0, err + } + return 1, nil + default: + return 0, fmt.Errorf("unknown unixfs type: %d", ufsNode.DataType.Int()) + } + } + + // specific path segment + if len(matchPath) > 0 { + val, err := n.LookupByString(matchPath[0]) + if err != nil { + return 0, err + } + return extractElement(matchPath[0], val) + } + + if outputPath == "-" && len(matchPath) == 0 { + return 0, fmt.Errorf("cannot extract a directory to stdout, use a path to extract a specific file") + } + + // everything + var count int + var shardSkip int + mi := n.MapIterator() + for !mi.Done() { + key, val, err := mi.Next() + if err != nil { + if nf, ok := err.(interface{ NotFound() bool }); ok && nf.NotFound() { + shardSkip++ + continue + } + return 0, err + } + ks, err := key.AsString() + if err != nil { + return 0, err + } + ecount, err := extractElement(ks, val) + if err != nil { + return 0, err + } + count += ecount + } + if shardSkip > 0 { + fmt.Fprintf(logger, "data for entry not found for %d unknown sharded entries (skipped...)\n", shardSkip) + } + return count, nil +} + +func extractFile(c context.Context, ls *ipld.LinkSystem, n 
ipld.Node, outputName string) error { + node, err := file.NewUnixFSFile(c, n, ls) + if err != nil { + return err + } + nlr, err := node.AsLargeBytes() + if err != nil { + return err + } + var f *os.File + if outputName == "" { + f = os.Stdout + } else { + f, err = os.Create(outputName) + if err != nil { + return err + } + defer f.Close() + } + _, err = io.Copy(f, nlr) + return err +} diff --git a/cmd/car/lib/filter.go b/cmd/car/lib/filter.go new file mode 100644 index 00000000..e55f1549 --- /dev/null +++ b/cmd/car/lib/filter.go @@ -0,0 +1,100 @@ +package lib + +import ( + "context" + "fmt" + "io" + "os" + + "github.com/ipfs/go-cid" + carv2 "github.com/ipld/go-car/v2" + "github.com/ipld/go-car/v2/blockstore" +) + +func FilterCar(ctx context.Context, infile, outfile string, cidMap map[cid.Cid]struct{}, invert bool, outVersion int, appendOutFile bool) error { + fd, err := os.Open(infile) + if err != nil { + return err + } + defer fd.Close() + rd, err := carv2.NewBlockReader(fd) + if err != nil { + return err + } + + outRoots := make([]cid.Cid, 0) + for _, r := range rd.Roots { + if matchFilter(r, cidMap, invert) { + outRoots = append(outRoots, r) + } + } + + options := []carv2.Option{} + switch outVersion { + case 1: + options = []carv2.Option{blockstore.WriteAsCarV1(true)} + case 2: + // already the default + default: + return fmt.Errorf("invalid CAR version %d", outVersion) + } + + if !appendOutFile { + if _, err := os.Stat(outfile); err == nil || !os.IsNotExist(err) { + // output to an existing file. + if err := os.Truncate(outfile, 0); err != nil { + return err + } + } + } else { + if outVersion != 2 { + return fmt.Errorf("can only append to version 2 car files") + } + + // roots will need to be whatever is in the output already. 
+ cv2r, err := carv2.OpenReader(outfile) + if err != nil { + return err + } + if cv2r.Version != 2 { + return fmt.Errorf("can only append to version 2 car files") + } + outRoots, err = cv2r.Roots() + if err != nil { + return err + } + _ = cv2r.Close() + } + + if len(outRoots) == 0 { + fmt.Fprintf(os.Stderr, "warning: no roots defined after filtering\n") + } + + bs, err := blockstore.OpenReadWrite(outfile, outRoots, options...) + if err != nil { + return err + } + + for { + blk, err := rd.Next() + if err != nil { + if err == io.EOF { + break + } + return err + } + if matchFilter(blk.Cid(), cidMap, invert) { + if err := bs.Put(ctx, blk); err != nil { + return err + } + } + } + return bs.Finalize() +} + +func matchFilter(c cid.Cid, cidMap map[cid.Cid]struct{}, invert bool) bool { + if _, ok := cidMap[c]; ok { + return !invert + } + return invert +} diff --git a/cmd/car/lib/inspect.go b/cmd/car/lib/inspect.go new file mode 100644 index 00000000..e8db011d --- /dev/null +++ b/cmd/car/lib/inspect.go @@ -0,0 +1,167 @@ +package lib + +import ( + "bytes" + "fmt" + "io" + "os" + "sort" + "strings" + + carv2 "github.com/ipld/go-car/v2" + "github.com/multiformats/go-multicodec" +) + +type Stat struct { + Min, Mean, Max uint64 +} + +func (s Stat) String() string { + return fmt.Sprintf("%d / %d / %d", s.Min, s.Mean, s.Max) +} + +type Roots []string + +func (r Roots) String() string { + var roots strings.Builder + switch len(r) { + case 0: + roots.WriteString(" (none)") + case 1: + roots.WriteString(" ") + roots.WriteString(r[0]) + default: + for _, root := range r { + roots.WriteString("\n\t") + roots.WriteString(root) + } + } + return roots.String() +} + +type Counts map[multicodec.Code]uint64 + +func (cs Counts) String() string { + var codecs strings.Builder + { + keys := make([]int, len(cs)) + i := 0 + for codec := range cs { + keys[i] = int(codec) + i++ + } + sort.Ints(keys) + for _, code := range keys { + codec := multicodec.Code(code) + 
codecs.WriteString(fmt.Sprintf("\n\t%s: %d", codec, cs[codec])) + } + } + return codecs.String() +} + +type Report struct { + Characteristics []byte + DataOffset uint64 + DataLength uint64 + IndexOffset uint64 + IndexType string + Version int + Roots Roots + RootsPresent bool + BlockCount uint64 + BlkLength Stat + CidLength Stat + Codecs Counts + Hashes Counts +} + +func (r *Report) String() string { + var v2s string + if r.Version == 2 { + v2s = fmt.Sprintf(`Characteristics: %x +Data offset: %d +Data (payload) length: %d +Index offset: %d +Index type: %s +`, r.Characteristics, r.DataOffset, r.DataLength, r.IndexOffset, r.IndexType) + } + + rp := "No" + if r.RootsPresent { + rp = "Yes" + } + + pfmt := `Version: %d +%sRoots:%s +Root blocks present in data: %s +Block count: %d +Min / average / max block length (bytes): %s +Min / average / max CID length (bytes): %s +Block count per codec:%s +CID count per multihash:%s +` + + return fmt.Sprintf( + pfmt, + r.Version, + v2s, + r.Roots.String(), + rp, + r.BlockCount, + r.BlkLength.String(), + r.CidLength.String(), + r.Codecs.String(), + r.Hashes.String(), + ) +} + +func InspectCar(inStream *os.File, verifyHashes bool) (*Report, error) { + rd, err := carv2.NewReader(inStream, carv2.ZeroLengthSectionAsEOF(true)) + if err != nil { + return nil, err + } + stats, err := rd.Inspect(verifyHashes) + if err != nil { + return nil, err + } + + if stats.Version == 1 && verifyHashes { // check that we've read all the data + got, err := inStream.Read(make([]byte, 1)) // force EOF + if err != nil && err != io.EOF { + return nil, err + } else if got > 0 { + return nil, fmt.Errorf("unexpected data after EOF: %d", got) + } + } + + rep := Report{ + Version: int(stats.Version), + Roots: []string{}, + RootsPresent: stats.RootsPresent, + BlockCount: stats.BlockCount, + BlkLength: Stat{Min: stats.MinBlockLength, Mean: stats.AvgBlockLength, Max: stats.MaxBlockLength}, + CidLength: Stat{Min: stats.MinCidLength, Mean: stats.AvgCidLength, Max: 
stats.MaxCidLength}, + Codecs: stats.CodecCounts, + Hashes: stats.MhTypeCounts, + } + + for _, c := range stats.Roots { + rep.Roots = append(rep.Roots, c.String()) + } + + if stats.Version == 2 { + idx := "(none)" + if stats.IndexCodec != 0 { + idx = stats.IndexCodec.String() + } + var buf bytes.Buffer + stats.Header.Characteristics.WriteTo(&buf) + rep.Characteristics = buf.Bytes() + rep.DataOffset = stats.Header.DataOffset + rep.DataLength = stats.Header.DataSize + rep.IndexOffset = stats.Header.IndexOffset + rep.IndexType = idx + } + + return &rep, nil +} diff --git a/cmd/car/lib/root.go b/cmd/car/lib/root.go new file mode 100644 index 00000000..d51347b0 --- /dev/null +++ b/cmd/car/lib/root.go @@ -0,0 +1,25 @@ +package lib + +import ( + "os" + + "github.com/ipfs/go-cid" + carv2 "github.com/ipld/go-car/v2" +) + +// CarRoot returns the root CIDs of the CAR at file, reading the CAR from stdin when file is empty. +func CarRoot(file string) (roots []cid.Cid, err error) { + inStream := os.Stdin + if len(file) >= 1 { + inStream, err = os.Open(file) + if err != nil { + return nil, err + } + } + + rd, err := carv2.NewBlockReader(inStream) + if err != nil { + return nil, err + } + return rd.Roots, nil +} diff --git a/cmd/car/lib/verify.go b/cmd/car/lib/verify.go new file mode 100644 index 00000000..cf30a9b0 --- /dev/null +++ b/cmd/car/lib/verify.go @@ -0,0 +1,107 @@ +package lib + +import ( + "fmt" + "io" + "os" + + "github.com/ipfs/go-cid" + carv2 "github.com/ipld/go-car/v2" + "github.com/ipld/go-car/v2/index" + "github.com/multiformats/go-multihash" +) + +func VerifyCar(file string) error { + // header + rx, err := carv2.OpenReader(file) + if err != nil { + return err + } + defer rx.Close() + roots, err := rx.Roots() + if err != nil { + return err + } + if len(roots) == 0 { + return fmt.Errorf("no roots listed in car header") + } + rootMap := make(map[cid.Cid]struct{}) + for _, r := range roots { + rootMap[r] = struct{}{} + } + + if rx.Version == 2 { + if rx.Header.DataSize == 0 { + return fmt.Errorf("size of wrapped v1 car
listed as '0'") + } + + flen, err := os.Stat(file) + if err != nil { + return err + } + lengthToIndex := carv2.PragmaSize + carv2.HeaderSize + rx.Header.DataSize + if uint64(flen.Size()) > lengthToIndex && rx.Header.IndexOffset == 0 { + return fmt.Errorf("header claims no index, but extra bytes in file beyond data size") + } + if rx.Header.DataOffset < carv2.PragmaSize+carv2.HeaderSize { + return fmt.Errorf("data offset places data within carv2 header") + } + if rx.Header.IndexOffset < lengthToIndex { + return fmt.Errorf("index offset overlaps with data. data ends at %d. index offset of %d", lengthToIndex, rx.Header.IndexOffset) + } + } + + // blocks + fd, err := os.Open(file) + if err != nil { + return err + } + rd, err := carv2.NewBlockReader(fd) + if err != nil { + return err + } + + cidList := make([]cid.Cid, 0) + for { + blk, err := rd.Next() + if err == io.EOF { + break + } + if err != nil { + return err + } + delete(rootMap, blk.Cid()) + cidList = append(cidList, blk.Cid()) + } + + if len(rootMap) > 0 { + return fmt.Errorf("header lists root(s) not present as a block: %v", rootMap) + } + + // index + if rx.Version == 2 && rx.Header.HasIndex() { + ir, err := rx.IndexReader() + if err != nil { + return err + } + idx, err := index.ReadFrom(ir) + if err != nil { + return err + } + for _, c := range cidList { + cidHash, err := multihash.Decode(c.Hash()) + if err != nil { + return err + } + if cidHash.Code == multihash.IDENTITY { + continue + } + if err := idx.GetAll(c, func(_ uint64) bool { + return true + }); err != nil { + return fmt.Errorf("could not look up known cid %s in index: %w", c, err) + } + } + } + return nil +} diff --git a/cmd/car/root.go b/cmd/car/root.go index 7e8d5b2c..136fb94c 100644 --- a/cmd/car/root.go +++ b/cmd/car/root.go @@ -2,27 +2,18 @@ package main import ( "fmt" - "os" - carv2 "github.com/ipld/go-car/v2" + "github.com/ipld/go-car/cmd/car/lib" "github.com/urfave/cli/v2" ) // CarRoot prints the root CID in a car func CarRoot(c 
*cli.Context) (err error) { - inStream := os.Stdin - if c.Args().Len() >= 1 { - inStream, err = os.Open(c.Args().First()) - if err != nil { - return err - } - } - - rd, err := carv2.NewBlockReader(inStream) + roots, err := lib.CarRoot(c.Args().First()) if err != nil { return err } - for _, r := range rd.Roots { + for _, r := range roots { fmt.Printf("%s\n", r.String()) } diff --git a/cmd/car/verify.go b/cmd/car/verify.go index faee3aa3..52dc1fbf 100644 --- a/cmd/car/verify.go +++ b/cmd/car/verify.go @@ -2,13 +2,8 @@ package main import ( "fmt" - "io" - "os" - "github.com/ipfs/go-cid" - carv2 "github.com/ipld/go-car/v2" - "github.com/ipld/go-car/v2/index" - "github.com/multiformats/go-multihash" + "github.com/ipld/go-car/cmd/car/lib" "github.com/urfave/cli/v2" ) @@ -18,97 +13,5 @@ func VerifyCar(c *cli.Context) error { return fmt.Errorf("usage: car verify ") } - // header - rx, err := carv2.OpenReader(c.Args().First()) - if err != nil { - return err - } - defer rx.Close() - roots, err := rx.Roots() - if err != nil { - return err - } - if len(roots) == 0 { - return fmt.Errorf("no roots listed in car header") - } - rootMap := make(map[cid.Cid]struct{}) - for _, r := range roots { - rootMap[r] = struct{}{} - } - - if rx.Version == 2 { - if rx.Header.DataSize == 0 { - return fmt.Errorf("size of wrapped v1 car listed as '0'") - } - - flen, err := os.Stat(c.Args().First()) - if err != nil { - return err - } - lengthToIndex := carv2.PragmaSize + carv2.HeaderSize + rx.Header.DataSize - if uint64(flen.Size()) > lengthToIndex && rx.Header.IndexOffset == 0 { - return fmt.Errorf("header claims no index, but extra bytes in file beyond data size") - } - if rx.Header.DataOffset < carv2.PragmaSize+carv2.HeaderSize { - return fmt.Errorf("data offset places data within carv2 header") - } - if rx.Header.IndexOffset < lengthToIndex { - return fmt.Errorf("index offset overlaps with data. data ends at %d. 
index offset of %d", lengthToIndex, rx.Header.IndexOffset) - } - } - - // blocks - fd, err := os.Open(c.Args().First()) - if err != nil { - return err - } - rd, err := carv2.NewBlockReader(fd) - if err != nil { - return err - } - - cidList := make([]cid.Cid, 0) - for { - blk, err := rd.Next() - if err == io.EOF { - break - } - if err != nil { - return err - } - delete(rootMap, blk.Cid()) - cidList = append(cidList, blk.Cid()) - } - - if len(rootMap) > 0 { - return fmt.Errorf("header lists root(s) not present as a block: %v", rootMap) - } - - // index - if rx.Version == 2 && rx.Header.HasIndex() { - ir, err := rx.IndexReader() - if err != nil { - return err - } - idx, err := index.ReadFrom(ir) - if err != nil { - return err - } - for _, c := range cidList { - cidHash, err := multihash.Decode(c.Hash()) - if err != nil { - return err - } - if cidHash.Code == multihash.IDENTITY { - continue - } - if err := idx.GetAll(c, func(_ uint64) bool { - return true - }); err != nil { - return fmt.Errorf("could not look up known cid %s in index: %w", c, err) - } - } - } - - return nil + return lib.VerifyCar(c.Args().First()) } From 3bdf616c21774bedf3e37b475ee0a659d413f814 Mon Sep 17 00:00:00 2001 From: Will Scott Date: Wed, 5 Jun 2024 13:34:04 +0200 Subject: [PATCH 2/2] Add extract helper --- cmd/car/lib/extract.go | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/cmd/car/lib/extract.go b/cmd/car/lib/extract.go index 6ab6bebc..51ac9594 100644 --- a/cmd/car/lib/extract.go +++ b/cmd/car/lib/extract.go @@ -13,6 +13,7 @@ import ( "github.com/ipfs/go-unixfsnode" "github.com/ipfs/go-unixfsnode/data" "github.com/ipfs/go-unixfsnode/file" + carstorage "github.com/ipld/go-car/v2/storage" dagpb "github.com/ipld/go-codec-dagpb" "github.com/ipld/go-ipld-prime" cidlink "github.com/ipld/go-ipld-prime/linking/cid" @@ -21,6 +22,30 @@ import ( var ErrNotDir = fmt.Errorf("not a directory") +func ExtractFromFile(c context.Context, carPath string, outputDir string, logger 
io.Writer) error { + carFile, err := os.Open(carPath) + if err != nil { + return err + } + store, err := carstorage.OpenReadable(carFile) + if err != nil { + return err + } + roots := store.Roots() + + ls := cidlink.DefaultLinkSystem() + ls.TrustedStorage = true + ls.SetReadStorage(store) + + for _, root := range roots { + _, err = ExtractToDir(c, &ls, root, outputDir, []string{}, false, logger) + if err != nil { + return err + } + } + return nil +} + func ExtractToDir(c context.Context, ls *ipld.LinkSystem, root cid.Cid, outputDir string, path []string, verbose bool, logger io.Writer) (int, error) { if root.Prefix().Codec == cid.Raw { if verbose {