From 544fc501fab905aa6b675101ad69e2a2d41de941 Mon Sep 17 00:00:00 2001 From: Matt Dainty Date: Tue, 30 Jan 2024 09:32:10 +0000 Subject: [PATCH] feat: Export the folder/stream identifier Useful as a parallelisation hint. --- README.md | 12 +++- go.mod | 1 + go.sum | 2 + reader.go | 3 + reader_test.go | 166 ++++++++++++++++++++++++++++++++++++++----------- struct.go | 10 ++- 6 files changed, 152 insertions(+), 42 deletions(-) diff --git a/README.md b/README.md index 6072a54..2132517 100644 --- a/README.md +++ b/README.md @@ -98,11 +98,17 @@ goarch: amd64 pkg: github.com/bodgit/sevenzip cpu: Intel(R) Core(TM) i9-8950HK CPU @ 2.90GHz BenchmarkNaiveReader -BenchmarkNaiveReader-12 2 33883477004 ns/op +BenchmarkNaiveReader-12 2 31077542628 ns/op BenchmarkOptimisedReader -BenchmarkOptimisedReader-12 402 180606463 ns/op +BenchmarkOptimisedReader-12 434 164854747 ns/op +BenchmarkNaiveParallelReader +BenchmarkNaiveParallelReader-12 240 361869339 ns/op +BenchmarkNaiveSingleParallelReader +BenchmarkNaiveSingleParallelReader-12 412 171027895 ns/op +BenchmarkParallelReader +BenchmarkParallelReader-12 636 112551812 ns/op PASS -ok github.com/bodgit/sevenzip 191.827s +ok github.com/bodgit/sevenzip 472.251s ``` The archive used here is just the reference LZMA SDK archive, which is only 1 MiB in size but does contain 630+ files. The only difference between the two benchmarks is the above change to call `rc.Close()` between files so the stream reuse optimisation takes effect. diff --git a/go.mod b/go.mod index 6db48fb..4057776 100644 --- a/go.mod +++ b/go.mod @@ -12,6 +12,7 @@ require ( github.com/stretchr/testify v1.8.4 github.com/ulikunitz/xz v0.5.11 go4.org v0.0.0-20200411211856-f5505b9728dd + golang.org/x/sync v0.6.0 golang.org/x/text v0.14.0 ) diff --git a/go.sum b/go.sum index f4f165a..553f2e5 100644 --- a/go.sum +++ b/go.sum @@ -155,6 +155,8 @@ golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20190227155943-e225da77a7e6/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.6.0 h1:5BMeUDZ7vkXGfEr1x9B4bRcTH4lpkTkpdh0T/J+qjbQ= +golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= diff --git a/reader.go b/reader.go index 87e6473..8f586dd 100644 --- a/reader.go +++ b/reader.go @@ -447,6 +447,9 @@ func (z *Reader) init(r io.ReaderAt, size int64) error { if !fh.isEmptyStream && !fh.isEmptyFile { f.folder, _ = header.streamsInfo.FileFolderAndSize(j) + // Make an exported copy of the folder index + f.Stream = f.folder + if f.folder != folder { offset = 0 } diff --git a/reader_test.go b/reader_test.go index 984bcf3..8ba9436 100644 --- a/reader_test.go +++ b/reader_test.go @@ -3,9 +3,11 @@ package sevenzip_test import ( "errors" "fmt" + "hash" "hash/crc32" "io" "path/filepath" + "runtime" "testing" "testing/fstest" "testing/iotest" @@ -13,38 +15,60 @@ import ( "github.com/bodgit/sevenzip" "github.com/bodgit/sevenzip/internal/util" "github.com/stretchr/testify/assert" + "golang.org/x/sync/errgroup" ) -func readArchive(t *testing.T, r *sevenzip.ReadCloser) { - t.Helper() +func reader(r io.Reader) io.Reader { + return r +} - h := crc32.NewIEEE() +func extractFile(tb testing.TB, r io.Reader, h hash.Hash, f *sevenzip.File) error { + tb.Helper() - for _, f := range r.File { - rc, err := f.Open() - if err != nil { - t.Fatal(err) - } - defer rc.Close() + h.Reset() - h.Reset() + if _, err := io.Copy(h, r); err != nil { + return err + } - if _, err := io.Copy(h, iotest.OneByteReader(rc)); err != nil { - t.Fatal(err) - } + if f.UncompressedSize > 0 && f.CRC32 == 0 { + tb.Log("archive member", f.Name, "has no CRC") + + return nil + } - rc.Close() + if !util.CRC32Equal(h.Sum(nil), f.CRC32) { + return errors.New("CRC doesn't match") + } + + return nil +} - if f.UncompressedSize > 0 && f.CRC32 == 0 { - t.Log("archive member", f.Name, "has no CRC") +//nolint:lll +func extractArchive(tb testing.TB, r *sevenzip.ReadCloser, stream int, h hash.Hash, fn func(io.Reader) io.Reader, optimised bool) error { + tb.Helper() + for _, f := range r.File { + if stream >= 0 && f.Stream != stream { continue } - if !util.CRC32Equal(h.Sum(nil), f.CRC32) { - t.Fatal(errors.New("CRC doesn't match")) + rc, err := f.Open() + if err != nil { + return err + } + defer rc.Close() + + if err = extractFile(tb, fn(rc), h, f); err != nil { + return err + } + + if optimised { + rc.Close() } } + + return nil } //nolint:funlen @@ -184,7 +208,9 @@ func TestOpenReader(t *testing.T) { assert.Equal(t, volumes, r.Volumes()) - readArchive(t, r) + if err := extractArchive(t, r, -1, crc32.NewIEEE(), iotest.OneByteReader, true); err != nil { + t.Fatal(err) + } }) } } @@ -223,7 +249,9 @@ func TestOpenReaderWithPassword(t *testing.T) { } defer r.Close() - readArchive(t, r) + if err := extractArchive(t, r, -1, crc32.NewIEEE(), iotest.OneByteReader, true); err != nil { + t.Fatal(err) + } }) } } @@ -264,10 +292,43 @@ func ExampleOpenReader() { // 10 } -func benchmarkArchive(b *testing.B, file string, optimised bool) { +func benchmarkArchiveParallel(b *testing.B, file string) { b.Helper() - h := crc32.NewIEEE() + for n := 0; n < b.N; n++ { + r, err := sevenzip.OpenReader(filepath.Join("testdata", file)) + if err != nil { + b.Fatal(err) + } + defer r.Close() + + streams := make(map[int]struct{}, len(r.File)) + + for _, f := range r.File { + streams[f.Stream] = struct{}{} + } + + eg := new(errgroup.Group) + eg.SetLimit(runtime.NumCPU()) + + for stream := range streams { + stream := stream + + eg.Go(func() error { + return extractArchive(b, r, stream, crc32.NewIEEE(), reader, true) + }) + } + + if err := eg.Wait(); err != nil { + b.Fatal(err) + } + + r.Close() + } +} + +func benchmarkArchiveNaiveParallel(b *testing.B, file string, workers int) { + b.Helper() for n := 0; n < b.N; n++ { r, err := sevenzip.OpenReader(filepath.Join("testdata", file)) @@ -276,26 +337,45 @@ func benchmarkArchive(b *testing.B, file string, optimised bool) { } defer r.Close() + eg := new(errgroup.Group) + eg.SetLimit(workers) + for _, f := range r.File { - rc, err := f.Open() - if err != nil { - b.Fatal(err) - } - defer rc.Close() + f := f - h.Reset() + eg.Go(func() error { + rc, err := f.Open() + if err != nil { + return err + } + defer rc.Close() - if _, err := io.Copy(h, rc); err != nil { - b.Fatal(err) - } + return extractFile(b, rc, crc32.NewIEEE(), f) + }) + } - if optimised { - rc.Close() - } + if err := eg.Wait(); err != nil { + b.Fatal(err) + } - if !util.CRC32Equal(h.Sum(nil), f.CRC32) { - b.Fatal(errors.New("CRC doesn't match")) - } + r.Close() + } +} + +func benchmarkArchive(b *testing.B, file string, optimised bool) { + b.Helper() + + h := crc32.NewIEEE() + + for n := 0; n < b.N; n++ { + r, err := sevenzip.OpenReader(filepath.Join("testdata", file)) + if err != nil { + b.Fatal(err) + } + defer r.Close() + + if err := extractArchive(b, r, -1, h, reader, optimised); err != nil { + b.Fatal(err) } r.Close() @@ -354,6 +434,18 @@ func BenchmarkOptimisedReader(b *testing.B) { benchmarkArchive(b, "lzma1900.7z", true) } +func BenchmarkNaiveParallelReader(b *testing.B) { + benchmarkArchiveNaiveParallel(b, "lzma1900.7z", runtime.NumCPU()) +} + +func BenchmarkNaiveSingleParallelReader(b *testing.B) { + benchmarkArchiveNaiveParallel(b, "lzma1900.7z", 1) +} + +func BenchmarkParallelReader(b *testing.B) { + benchmarkArchiveParallel(b, "lzma1900.7z") +} + func BenchmarkBCJ(b *testing.B) { benchmarkArchive(b, "bcj.7z", true) } diff --git a/struct.go b/struct.go index f95a6ea..f87927c 100644 --- a/struct.go +++ b/struct.go @@ -324,8 +324,14 @@ type FileHeader struct { Attributes uint32 CRC32 uint32 UncompressedSize uint64 - isEmptyStream bool - isEmptyFile bool + + // Stream is an opaque identifier representing the compressed stream + // that contains the file. Any File with the same value can be assumed + // to be stored within the same stream. + Stream int + + isEmptyStream bool + isEmptyFile bool } // FileInfo returns an fs.FileInfo for the FileHeader.