Skip to content

Commit

Permalink
feat: Export the folder/stream identifier
Browse files Browse the repository at this point in the history
Useful as a parallelisation hint.
  • Loading branch information
bodgit committed Feb 7, 2024
1 parent 9e0ded8 commit 544fc50
Show file tree
Hide file tree
Showing 6 changed files with 152 additions and 42 deletions.
12 changes: 9 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -98,11 +98,17 @@ goarch: amd64
pkg: github.com/bodgit/sevenzip
cpu: Intel(R) Core(TM) i9-8950HK CPU @ 2.90GHz
BenchmarkNaiveReader
BenchmarkNaiveReader-12 2 33883477004 ns/op
BenchmarkNaiveReader-12 2 31077542628 ns/op
BenchmarkOptimisedReader
BenchmarkOptimisedReader-12 402 180606463 ns/op
BenchmarkOptimisedReader-12 434 164854747 ns/op
BenchmarkNaiveParallelReader
BenchmarkNaiveParallelReader-12 240 361869339 ns/op
BenchmarkNaiveSingleParallelReader
BenchmarkNaiveSingleParallelReader-12 412 171027895 ns/op
BenchmarkParallelReader
BenchmarkParallelReader-12 636 112551812 ns/op
PASS
ok github.com/bodgit/sevenzip 191.827s
ok github.com/bodgit/sevenzip 472.251s
```
The archive used here is just the reference LZMA SDK archive, which is only 1 MiB in size but does contain 630+ files.
The only difference between the two benchmarks is the above change to call `rc.Close()` between files so the stream reuse optimisation takes effect.
Expand Down
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ require (
github.com/stretchr/testify v1.8.4
github.com/ulikunitz/xz v0.5.11
go4.org v0.0.0-20200411211856-f5505b9728dd
golang.org/x/sync v0.6.0
golang.org/x/text v0.14.0
)

Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,8 @@ golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJ
golang.org/x/sync v0.0.0-20190227155943-e225da77a7e6/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.6.0 h1:5BMeUDZ7vkXGfEr1x9B4bRcTH4lpkTkpdh0T/J+qjbQ=
golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
Expand Down
3 changes: 3 additions & 0 deletions reader.go
Original file line number Diff line number Diff line change
Expand Up @@ -447,6 +447,9 @@ func (z *Reader) init(r io.ReaderAt, size int64) error {
if !fh.isEmptyStream && !fh.isEmptyFile {
f.folder, _ = header.streamsInfo.FileFolderAndSize(j)

// Make an exported copy of the folder index
f.Stream = f.folder

if f.folder != folder {
offset = 0
}
Expand Down
166 changes: 129 additions & 37 deletions reader_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,48 +3,72 @@ package sevenzip_test
import (
"errors"
"fmt"
"hash"
"hash/crc32"
"io"
"path/filepath"
"runtime"
"testing"
"testing/fstest"
"testing/iotest"

"github.com/bodgit/sevenzip"
"github.com/bodgit/sevenzip/internal/util"
"github.com/stretchr/testify/assert"
"golang.org/x/sync/errgroup"
)

func readArchive(t *testing.T, r *sevenzip.ReadCloser) {
t.Helper()
func reader(r io.Reader) io.Reader {
return r
}

h := crc32.NewIEEE()
func extractFile(tb testing.TB, r io.Reader, h hash.Hash, f *sevenzip.File) error {
tb.Helper()

for _, f := range r.File {
rc, err := f.Open()
if err != nil {
t.Fatal(err)
}
defer rc.Close()
h.Reset()

h.Reset()
if _, err := io.Copy(h, r); err != nil {
return err
}

if _, err := io.Copy(h, iotest.OneByteReader(rc)); err != nil {
t.Fatal(err)
}
if f.UncompressedSize > 0 && f.CRC32 == 0 {
tb.Log("archive member", f.Name, "has no CRC")

return nil
}

rc.Close()
if !util.CRC32Equal(h.Sum(nil), f.CRC32) {
return errors.New("CRC doesn't match")
}

return nil
}

if f.UncompressedSize > 0 && f.CRC32 == 0 {
t.Log("archive member", f.Name, "has no CRC")
//nolint:lll
func extractArchive(tb testing.TB, r *sevenzip.ReadCloser, stream int, h hash.Hash, fn func(io.Reader) io.Reader, optimised bool) error {
tb.Helper()

for _, f := range r.File {
if stream >= 0 && f.Stream != stream {
continue
}

if !util.CRC32Equal(h.Sum(nil), f.CRC32) {
t.Fatal(errors.New("CRC doesn't match"))
rc, err := f.Open()
if err != nil {
return err
}
defer rc.Close()

if err = extractFile(tb, fn(rc), h, f); err != nil {
return err
}

if optimised {
rc.Close()
}
}

return nil
}

//nolint:funlen
Expand Down Expand Up @@ -184,7 +208,9 @@ func TestOpenReader(t *testing.T) {

assert.Equal(t, volumes, r.Volumes())

readArchive(t, r)
if err := extractArchive(t, r, -1, crc32.NewIEEE(), iotest.OneByteReader, true); err != nil {
t.Fatal(err)
}
})
}
}
Expand Down Expand Up @@ -223,7 +249,9 @@ func TestOpenReaderWithPassword(t *testing.T) {
}
defer r.Close()

readArchive(t, r)
if err := extractArchive(t, r, -1, crc32.NewIEEE(), iotest.OneByteReader, true); err != nil {
t.Fatal(err)
}
})
}
}
Expand Down Expand Up @@ -264,10 +292,43 @@ func ExampleOpenReader() {
// 10
}

func benchmarkArchive(b *testing.B, file string, optimised bool) {
func benchmarkArchiveParallel(b *testing.B, file string) {
b.Helper()

h := crc32.NewIEEE()
for n := 0; n < b.N; n++ {
r, err := sevenzip.OpenReader(filepath.Join("testdata", file))
if err != nil {
b.Fatal(err)
}
defer r.Close()

streams := make(map[int]struct{}, len(r.File))

for _, f := range r.File {
streams[f.Stream] = struct{}{}
}

eg := new(errgroup.Group)
eg.SetLimit(runtime.NumCPU())

for stream := range streams {
stream := stream

eg.Go(func() error {
return extractArchive(b, r, stream, crc32.NewIEEE(), reader, true)
})
}

if err := eg.Wait(); err != nil {
b.Fatal(err)
}

r.Close()
}
}

func benchmarkArchiveNaiveParallel(b *testing.B, file string, workers int) {
b.Helper()

for n := 0; n < b.N; n++ {
r, err := sevenzip.OpenReader(filepath.Join("testdata", file))
Expand All @@ -276,26 +337,45 @@ func benchmarkArchive(b *testing.B, file string, optimised bool) {
}
defer r.Close()

eg := new(errgroup.Group)
eg.SetLimit(workers)

for _, f := range r.File {
rc, err := f.Open()
if err != nil {
b.Fatal(err)
}
defer rc.Close()
f := f

h.Reset()
eg.Go(func() error {
rc, err := f.Open()
if err != nil {
return err
}
defer rc.Close()

if _, err := io.Copy(h, rc); err != nil {
b.Fatal(err)
}
return extractFile(b, rc, crc32.NewIEEE(), f)
})
}

if optimised {
rc.Close()
}
if err := eg.Wait(); err != nil {
b.Fatal(err)
}

if !util.CRC32Equal(h.Sum(nil), f.CRC32) {
b.Fatal(errors.New("CRC doesn't match"))
}
r.Close()
}
}

func benchmarkArchive(b *testing.B, file string, optimised bool) {
b.Helper()

h := crc32.NewIEEE()

for n := 0; n < b.N; n++ {
r, err := sevenzip.OpenReader(filepath.Join("testdata", file))
if err != nil {
b.Fatal(err)
}
defer r.Close()

if err := extractArchive(b, r, -1, h, reader, optimised); err != nil {
b.Fatal(err)
}

r.Close()
Expand Down Expand Up @@ -354,6 +434,18 @@ func BenchmarkOptimisedReader(b *testing.B) {
benchmarkArchive(b, "lzma1900.7z", true)
}

func BenchmarkNaiveParallelReader(b *testing.B) {
benchmarkArchiveNaiveParallel(b, "lzma1900.7z", runtime.NumCPU())
}

func BenchmarkNaiveSingleParallelReader(b *testing.B) {
benchmarkArchiveNaiveParallel(b, "lzma1900.7z", 1)
}

func BenchmarkParallelReader(b *testing.B) {
benchmarkArchiveParallel(b, "lzma1900.7z")
}

func BenchmarkBCJ(b *testing.B) {
benchmarkArchive(b, "bcj.7z", true)
}
Expand Down
10 changes: 8 additions & 2 deletions struct.go
Original file line number Diff line number Diff line change
Expand Up @@ -324,8 +324,14 @@ type FileHeader struct {
Attributes uint32
CRC32 uint32
UncompressedSize uint64
isEmptyStream bool
isEmptyFile bool

// Stream is an opaque identifier representing the compressed stream
// that contains the file. Any File with the same value can be assumed
// to be stored within the same stream.
Stream int

isEmptyStream bool
isEmptyFile bool
}

// FileInfo returns an fs.FileInfo for the FileHeader.
Expand Down

0 comments on commit 544fc50

Please sign in to comment.