Skip to content

Commit

Permalink
Merge pull request #221 from kevpar/sparse
Browse files Browse the repository at this point in the history
backuptar: Fix sparse file handling
  • Loading branch information
kevpar authored Oct 4, 2021
2 parents a0fd408 + 33a4801 commit 7ec9238
Show file tree
Hide file tree
Showing 2 changed files with 234 additions and 72 deletions.
74 changes: 51 additions & 23 deletions backuptar/tar.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ package backuptar
import (
"archive/tar"
"encoding/base64"
"errors"
"fmt"
"io"
"io/ioutil"
Expand Down Expand Up @@ -42,19 +41,14 @@ const (
hdrCreationTime = "LIBARCHIVE.creationtime"
)

func writeZeroes(w io.Writer, count int64) error {
buf := make([]byte, 8192)
c := len(buf)
for i := int64(0); i < count; i += int64(c) {
if int64(c) > count-i {
c = int(count - i)
}
_, err := w.Write(buf[:c])
if err != nil {
return err
}
// zeroReader is an io.Reader that always returns 0s.
//
// Read never reports io.EOF or any other error, so a caller has to bound the
// amount it consumes itself (for example with io.CopyN).
type zeroReader struct{}

// Read overwrites every byte of b with 0 and reports len(b) bytes read with a
// nil error.
func (zr zeroReader) Read(b []byte) (int, error) {
	for i := range b {
		b[i] = 0
	}
	return len(b), nil
}

func copySparse(t *tar.Writer, br *winio.BackupStreamReader) error {
Expand All @@ -71,16 +65,26 @@ func copySparse(t *tar.Writer, br *winio.BackupStreamReader) error {
return fmt.Errorf("unexpected stream %d", bhdr.Id)
}

// We can't seek backwards, since we have already written that data to the tar.Writer.
if bhdr.Offset < curOffset {
return fmt.Errorf("cannot seek back from %d to %d", curOffset, bhdr.Offset)
}
// archive/tar does not support writing sparse files
// so just write zeroes to catch up to the current offset.
err = writeZeroes(t, bhdr.Offset-curOffset)
if _, err := io.CopyN(t, zeroReader{}, bhdr.Offset-curOffset); err != nil {
return fmt.Errorf("seek to offset %d: %s", bhdr.Offset, err)
}
if bhdr.Size == 0 {
// A sparse block with size = 0 is used to mark the end of the sparse blocks.
break
}
n, err := io.Copy(t, br)
if err != nil {
return err
}
if n != bhdr.Size {
return fmt.Errorf("copied %d bytes instead of %d at offset %d", n, bhdr.Size, bhdr.Offset)
}
curOffset = bhdr.Offset + n
}
return nil
Expand Down Expand Up @@ -221,20 +225,44 @@ func WriteTarFileFromBackupStream(t *tar.Writer, r io.Reader, name string, size
}
}

// The logic for copying file contents is fairly complicated due to the need for handling sparse files,
// and the weird ways they are represented by BackupRead. A normal file will always either have a data stream
// with size and content, or no data stream at all (if empty). However, for a sparse file, the content can also
// be represented using a series of sparse block streams following the data stream. Additionally, the way sparse
// files are handled by BackupRead has changed in the OS recently. The specifics of the representation are described
// in the list at the bottom of this block comment.
//
// Sparse files can be represented in four different ways, based on the specifics of the file.
// - Size = 0:
// Previously: BackupRead yields no data stream and no sparse block streams.
// Recently: BackupRead yields a data stream with size = 0. There are no following sparse block streams.
// - Size > 0, no allocated ranges:
// BackupRead yields a data stream with size = 0. Following is a single sparse block stream with
// size = 0 and offset = <file size>.
// - Size > 0, one allocated range:
// BackupRead yields a data stream with size = <file size> containing the file contents. There are no
// sparse block streams. This is the case if you take a normal file with contents and simply set the
// sparse flag on it.
// - Size > 0, multiple allocated ranges:
// BackupRead yields a data stream with size = 0. Following are sparse block streams for each allocated
// range of the file containing the range contents. Finally there is a sparse block stream with
// size = 0 and offset = <file size>.

if dataHdr != nil {
// A data stream was found. Copy the data.
if (dataHdr.Attributes & winio.StreamSparseAttributes) == 0 {
// We assume that we will either have a data stream size > 0 XOR have sparse block streams.
if dataHdr.Size > 0 || (dataHdr.Attributes&winio.StreamSparseAttributes) == 0 {
if size != dataHdr.Size {
return fmt.Errorf("%s: mismatch between file size %d and header size %d", name, size, dataHdr.Size)
}
_, err = io.Copy(t, br)
if err != nil {
return err
if _, err = io.Copy(t, br); err != nil {
return fmt.Errorf("%s: copying contents from data stream: %s", name, err)
}
} else {
err = copySparse(t, br)
if err != nil {
return err
} else if size > 0 {
// As of a recent OS change, BackupRead now returns a data stream for empty sparse files.
// These files have no sparse block streams, so skip the copySparse call if file size = 0.
if err = copySparse(t, br); err != nil {
return fmt.Errorf("%s: copying contents from sparse block stream: %s", name, err)
}
}
}
Expand Down Expand Up @@ -279,7 +307,7 @@ func WriteTarFileFromBackupStream(t *tar.Writer, r io.Reader, name string, size
} else {
// Unsupported for now, since the size of the alternate stream is not present
// in the backup stream until after the data has been read.
return errors.New("tar of sparse alternate data streams is unsupported")
return fmt.Errorf("%s: tar of sparse alternate data streams is unsupported", name)
}
case winio.BackupEaData, winio.BackupLink, winio.BackupPropertyData, winio.BackupObjectId, winio.BackupTxfsData:
// ignore these streams
Expand Down
232 changes: 183 additions & 49 deletions backuptar/tar_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,15 @@ package backuptar
import (
"archive/tar"
"bytes"
"io"
"io/ioutil"
"os"
"path/filepath"
"reflect"
"testing"

"github.com/Microsoft/go-winio"
"golang.org/x/sys/windows"
)

func ensurePresent(t *testing.T, m map[string]string, keys ...string) {
Expand All @@ -22,65 +24,197 @@ func ensurePresent(t *testing.T, m map[string]string, keys ...string) {
}
}

func TestRoundTrip(t *testing.T) {
f, err := ioutil.TempFile("", "tst")
if err != nil {
t.Fatal(err)
}
defer f.Close()
defer os.Remove(f.Name())

if _, err = f.Write([]byte("testing 1 2 3\n")); err != nil {
t.Fatal(err)
}

if _, err = f.Seek(0, 0); err != nil {
// setSparse issues FSCTL_SET_SPARSE against the handle of f so that the file is
// flagged as sparse. Any failure aborts the calling test.
func setSparse(t *testing.T, f *os.File) {
	t.Helper()
	// Control code value as defined by the Windows SDK; the SDK spelling of the
	// name is kept so it can be searched for.
	const FSCTL_SET_SPARSE uint32 = 0x900c4
	// The DeviceIoControl contract requires a non-NULL lpBytesReturned whenever
	// lpOverlapped is NULL, even for control codes that produce no output, so a
	// throwaway counter is passed instead of nil.
	var bytesReturned uint32
	if err := windows.DeviceIoControl(windows.Handle(f.Fd()), FSCTL_SET_SPARSE, nil, 0, nil, 0, &bytesReturned, nil); err != nil {
		t.Fatalf("FSCTL_SET_SPARSE on %s: %s", f.Name(), err)
	}
}

fi, err := f.Stat()
if err != nil {
t.Fatal(err)
// compareReaders validates that two readers contain the exact same data.
//
// rExpected is consumed in chunks of up to 8 KiB. For every chunk the same number
// of bytes is read from rActual with io.ReadFull and the two chunks are compared
// byte by byte. Once rExpected is exhausted, rActual must yield no more data and
// report io.EOF. Every failure is fatal to t and is attributed to the caller.
func compareReaders(t *testing.T, rActual io.Reader, rExpected io.Reader) {
	t.Helper()
	const size = 8 * 1024
	var bufExpected, bufActual [size]byte
	// offset counts the bytes that have already been compared successfully, so
	// failure messages can report absolute positions in the streams.
	var offset int64
	for {
		// Do a read from rExpected and see how many bytes we get.
		nExpected, err := rExpected.Read(bufExpected[:])
		if err != nil && err != io.EOF {
			t.Fatalf("Failed reading from rExpected at %d: %s", offset, err)
		}
		if nExpected == 0 && err == io.EOF {
			break
		}
		// Do a ReadFull from rActual for the same number of bytes we got from rExpected.
		if nActual, err := io.ReadFull(rActual, bufActual[:nExpected]); err != nil {
			t.Fatalf("Only read %d bytes out of %d from rActual at %d: %s", nActual, nExpected, offset, err)
		}
		for i, bExpected := range bufExpected[:nExpected] {
			if bExpected != bufActual[i] {
				// i is relative to the current chunk; add offset to report the stream position.
				t.Fatalf("Mismatched bytes at %d. got 0x%x, expected 0x%x", offset+int64(i), bufActual[i], bExpected)
			}
		}
		offset += int64(nExpected)
	}
	// Now we just need to make sure there isn't any further data in rActual.
	var b [1]byte
	if n, err := rActual.Read(b[:]); n != 0 || err != io.EOF {
		// %v rather than %s: err may be nil here when rActual still had data.
		t.Fatalf("rActual didn't return EOF at expected end. Read %d bytes with error %v", n, err)
	}
}

tr := tar.NewReader(&buf)
hdr, err := tr.Next()
if err != nil {
t.Fatal(err)
func TestRoundTrip(t *testing.T) {
// Each test case is a name mapped to a function which must create a file and return its path.
// The test then round-trips that file through backuptar, and validates the output matches the input.
for name, setup := range map[string]func(*testing.T) string{
"normalFile": func(t *testing.T) string {
path := filepath.Join(t.TempDir(), "foo.txt")
if err := ioutil.WriteFile(path, []byte("testing 1 2 3\n"), 0644); err != nil {
t.Fatal(err)
}
return path
},
"normalFileEmpty": func(t *testing.T) string {
path := filepath.Join(t.TempDir(), "foo.txt")
f, err := os.OpenFile(path, os.O_CREATE|os.O_WRONLY, 0644)
if err != nil {
t.Fatal(err)
}
defer f.Close()
return path
},
"sparseFileEmpty": func(t *testing.T) string {
path := filepath.Join(t.TempDir(), "foo.txt")
f, err := os.OpenFile(path, os.O_CREATE|os.O_WRONLY, 0644)
if err != nil {
t.Fatal(err)
}
defer f.Close()
setSparse(t, f)
return path
},
"sparseFileWithNoAllocatedRanges": func(t *testing.T) string {
path := filepath.Join(t.TempDir(), "foo.txt")
f, err := os.OpenFile(path, os.O_CREATE|os.O_WRONLY, 0644)
if err != nil {
t.Fatal(err)
}
defer f.Close()
setSparse(t, f)
// Set file size without writing data to produce a file with size > 0
// but no allocated ranges.
if err := f.Truncate(1000000); err != nil {
t.Fatal(err)
}
return path
},
"sparseFileWithOneAllocatedRange": func(t *testing.T) string {
path := filepath.Join(t.TempDir(), "foo.txt")
f, err := os.OpenFile(path, os.O_CREATE|os.O_WRONLY, 0644)
if err != nil {
t.Fatal(err)
}
defer f.Close()
setSparse(t, f)
if _, err := f.WriteString("test sparse data"); err != nil {
t.Fatal(err)
}
return path
},
"sparseFileWithMultipleAllocatedRanges": func(t *testing.T) string {
path := filepath.Join(t.TempDir(), "foo.txt")
f, err := os.OpenFile(path, os.O_CREATE|os.O_WRONLY, 0644)
if err != nil {
t.Fatal(err)
}
defer f.Close()
setSparse(t, f)
if _, err = f.Write([]byte("testing 1 2 3\n")); err != nil {
t.Fatal(err)
}
// The documentation talks about FSCTL_SET_ZERO_DATA, but seeking also
// seems to create a hole.
if _, err = f.Seek(1000000, 0); err != nil {
t.Fatal(err)
}
if _, err = f.Write([]byte("more data later\n")); err != nil {
t.Fatal(err)
}
return path
},
} {
t.Run(name, func(t *testing.T) {
path := setup(t)
f, err := os.Open(path)
if err != nil {
t.Fatal(err)
}
defer f.Close()

fi, err := f.Stat()
if err != nil {
t.Fatal(err)
}
bi, err := winio.GetFileBasicInfo(f)
if err != nil {
t.Fatal(err)
}

br := winio.NewBackupFileReader(f, true)
defer br.Close()
var buf bytes.Buffer
tw := tar.NewWriter(&buf)
err = WriteTarFileFromBackupStream(tw, br, f.Name(), fi.Size(), bi)
if err != nil {
t.Fatal(err)
}
tr := tar.NewReader(&buf)
hdr, err := tr.Next()
if err != nil {
t.Fatal(err)
}

name, size, bi2, err := FileInfoFromHeader(hdr)
if err != nil {
t.Fatal(err)
}
if name != filepath.ToSlash(f.Name()) {
t.Errorf("got name %s, expected %s", name, filepath.ToSlash(f.Name()))
}
if size != fi.Size() {
t.Errorf("got size %d, expected %d", size, fi.Size())
}
if !reflect.DeepEqual(*bi2, *bi) {
t.Errorf("got %#v, expected %#v", *bi2, *bi)
}
ensurePresent(t, hdr.PAXRecords, "MSWINDOWS.fileattr", "MSWINDOWS.rawsd")
// Reset file position so we can compare file contents.
// The file contents of the actual file should match what we get from the tar.
if _, err := f.Seek(0, 0); err != nil {
t.Fatal(err)
}
compareReaders(t, tr, f)
})
}
}

name, size, bi2, err := FileInfoFromHeader(hdr)
func TestZeroReader(t *testing.T) {
const size = 512
var b [size]byte
var bExpected [size]byte
var r zeroReader
n, err := r.Read(b[:])
if err != nil {
t.Fatal(err)
}

if name != filepath.ToSlash(f.Name()) {
t.Errorf("got name %s, expected %s", name, filepath.ToSlash(f.Name()))
t.Fatalf("Unexpected read error: %s", err)
}

if size != fi.Size() {
t.Errorf("got size %d, expected %d", size, fi.Size())
if n != size {
t.Errorf("Wrong read size. got %d, expected %d", n, size)
}

if !reflect.DeepEqual(*bi2, *bi) {
t.Errorf("got %#v, expected %#v", *bi2, *bi)
for i := range b {
if b[i] != bExpected[i] {
t.Errorf("Wrong content at index %d. got %d, expected %d", i, b[i], bExpected[i])
}
}

ensurePresent(t, hdr.PAXRecords, "MSWINDOWS.fileattr", "MSWINDOWS.rawsd")
}

0 comments on commit 7ec9238

Please sign in to comment.