From 8f65e8f75478232b69fa2dd34f550af96dd2e8e2 Mon Sep 17 00:00:00 2001 From: Michael McCracken Date: Mon, 16 Oct 2023 17:00:47 -0700 Subject: [PATCH] mutate: add uncompressed blob size annotation Sometimes it is useful to have a reliable estimate of the space that would be taken up by a layer when it is unpacked. Since gzip's headers are only accurate up to 4GiB, larger layers can not be easily measured without uncompressing them. This commit adds a field to the gzip and zstd compressors to store the bytes read from the stream during compression and uses that to set an annotation on newly generated layers. If the noop compressor is used, the annotation is not added, as the existing "compressed" layer size would be the same. Signed-off-by: Michael McCracken --- mutate/compress.go | 40 ++++++++++++++++++++++++++++++++-------- mutate/mutate.go | 10 ++++++++++ mutate/mutate_test.go | 15 ++++++++++++--- test/create.bats | 4 ++-- 4 files changed, 56 insertions(+), 13 deletions(-) diff --git a/mutate/compress.go b/mutate/compress.go index 740968c17..175de5dbe 100644 --- a/mutate/compress.go +++ b/mutate/compress.go @@ -22,6 +22,10 @@ type Compressor interface { // indicate what compression type is used, e.g. "gzip", or "" for no // compression. MediaTypeSuffix() string + + // BytesRead returns the number of bytes read from the uncompressed input + // stream at the current time, no guarantee of completion. + BytesRead() int64 } type noopCompressor struct{} @@ -34,15 +38,21 @@ func (nc noopCompressor) MediaTypeSuffix() string { return "" } +func (nc noopCompressor) BytesRead() int64 { + return -1 +} + // NoopCompressor provides no compression. var NoopCompressor Compressor = noopCompressor{} // GzipCompressor provides gzip compression. -var GzipCompressor Compressor = gzipCompressor{} +var GzipCompressor Compressor = &gzipCompressor{} -type gzipCompressor struct{} +type gzipCompressor struct { + bytesRead int64 +} -func (gz gzipCompressor) Compress(reader io.Reader) (io.ReadCloser, error) { +func (gz *gzipCompressor) Compress(reader io.Reader) (io.ReadCloser, error) { pipeReader, pipeWriter := io.Pipe() gzw := gzip.NewWriter(pipeWriter) @@ -50,12 +60,14 @@ func (gz gzipCompressor) Compress(reader io.Reader) (io.ReadCloser, error) { return nil, errors.Wrapf(err, "set concurrency level to %v blocks", 2*runtime.NumCPU()) } go func() { - if _, err := system.Copy(gzw, reader); err != nil { + bytesRead, err := system.Copy(gzw, reader) + if err != nil { log.Warnf("gzip compress: could not compress layer: %v", err) // #nosec G104 _ = pipeWriter.CloseWithError(errors.Wrap(err, "compressing layer")) return } + gz.bytesRead = bytesRead if err := gzw.Close(); err != nil { log.Warnf("gzip compress: could not close gzip writer: %v", err) // #nosec G104 @@ -76,12 +88,18 @@ func (gz gzipCompressor) MediaTypeSuffix() string { return "gzip" } +func (gz gzipCompressor) BytesRead() int64 { + return gz.bytesRead +} + // ZstdCompressor provides zstd compression. -var ZstdCompressor Compressor = zstdCompressor{} +var ZstdCompressor Compressor = &zstdCompressor{} -type zstdCompressor struct{} +type zstdCompressor struct { + bytesRead int64 +} -func (zs zstdCompressor) Compress(reader io.Reader) (io.ReadCloser, error) { +func (zs *zstdCompressor) Compress(reader io.Reader) (io.ReadCloser, error) { pipeReader, pipeWriter := io.Pipe() zw, err := zstd.NewWriter(pipeWriter) @@ -89,12 +107,14 @@ func (zs zstdCompressor) Compress(reader io.Reader) (io.ReadCloser, error) { return nil, err } go func() { - if _, err := system.Copy(zw, reader); err != nil { + bytesRead, err := system.Copy(zw, reader) + if err != nil { log.Warnf("zstd compress: could not compress layer: %v", err) // #nosec G104 _ = pipeWriter.CloseWithError(errors.Wrap(err, "compressing layer")) return } + zs.bytesRead = bytesRead if err := zw.Close(); err != nil { log.Warnf("zstd compress: could not close gzip writer: %v", err) // #nosec G104 @@ -114,3 +134,7 @@ func (zs zstdCompressor) Compress(reader io.Reader) (io.ReadCloser, error) { func (zs zstdCompressor) MediaTypeSuffix() string { return "zstd" } + +func (zs zstdCompressor) BytesRead() int64 { + return zs.bytesRead +} diff --git a/mutate/mutate.go b/mutate/mutate.go index 7f93b2922..f0c8f5f05 100644 --- a/mutate/mutate.go +++ b/mutate/mutate.go @@ -24,6 +24,7 @@ package mutate import ( "context" + "fmt" "io" "reflect" "time" @@ -36,6 +37,8 @@ import ( "github.com/pkg/errors" ) +const UmociUncompressedBlobSizeAnnotation = "ci.umo.uncompressed_blob_size" + func configPtr(c ispec.Image) *ispec.Image { return &c } func manifestPtr(m ispec.Manifest) *ispec.Manifest { return &m } func timePtr(t time.Time) *time.Time { return &t } @@ -295,6 +298,13 @@ func (m *Mutator) Add(ctx context.Context, mediaType string, r io.Reader, histor compressedMediaType = compressedMediaType + "+" + compressor.MediaTypeSuffix() } + if annotations == nil { + annotations = make(map[string]string) + } + if compressor.BytesRead() >= 0 { + annotations[UmociUncompressedBlobSizeAnnotation] = fmt.Sprintf("%d", compressor.BytesRead()) + } + // Append to layers. desc = ispec.Descriptor{ MediaType: compressedMediaType, diff --git a/mutate/mutate_test.go b/mutate/mutate_test.go index 4dd859d5b..3fc488435 100644 --- a/mutate/mutate_test.go +++ b/mutate/mutate_test.go @@ -214,6 +214,7 @@ func TestMutateAdd(t *testing.T) { // This isn't a valid image, but whatever. buffer := bytes.NewBufferString("contents") + bufferSize := buffer.Len() // Add a new layer. annotations := map[string]string{"hello": "world"} @@ -253,10 +254,18 @@ func TestMutateAdd(t *testing.T) { if mutator.manifest.Layers[1].Digest == expectedLayerDigest { t.Errorf("manifest.Layers[1].Digest is not the same!") } - if len(mutator.manifest.Layers[1].Annotations) != 1 || mutator.manifest.Layers[1].Annotations["hello"] != "world" { - t.Errorf("manifest.Layers[1].Annotations was not set correctly!") + if len(mutator.manifest.Layers[1].Annotations) != 2 { + t.Errorf("manifest.Layers[1].Annotations was not set correctly!: %+v", mutator.manifest.Layers[1].Annotations) + } + if mutator.manifest.Layers[1].Annotations["hello"] != "world" { + t.Errorf("manifest.Layers[1].Annotations['hello'] was not set correctly!: %+v", mutator.manifest.Layers[1].Annotations) + } + if mutator.manifest.Layers[1].Annotations[UmociUncompressedBlobSizeAnnotation] != fmt.Sprintf("%d", bufferSize) { + t.Errorf("manifest.Layers[1].Annotations['%s'] was not set correctly!: %q, expected %d", + UmociUncompressedBlobSizeAnnotation, + mutator.manifest.Layers[1].Annotations[UmociUncompressedBlobSizeAnnotation], + bufferSize) } - if mutator.manifest.Layers[1].Digest != newLayerDesc.Digest { t.Fatalf("unexpected digest for new layer: %v %v", mutator.manifest.Layers[1].Digest, newLayerDesc.Digest) } diff --git a/test/create.bats b/test/create.bats index ae7f783db..42ac0de5e 100644 --- a/test/create.bats +++ b/test/create.bats @@ -277,8 +277,8 @@ function teardown() { # Verify that the hashes of the blobs and index match (blobs are # content-addressable so using hashes is a bit silly, but whatever). known_hashes=( - "6f3716b8ae2bb4b3fd0a851f5a553946ea351ed5fdda0bd708d325363c0487f7 $IMAGE/index.json" - "73ecb862d0ebee78acdf8553d34d1ae5c13df509030ac262e561d096bb480dfc $IMAGE/blobs/sha256/73ecb862d0ebee78acdf8553d34d1ae5c13df509030ac262e561d096bb480dfc" + "b780d08bfed4850ab807b7c308682fae6868ed9c09c0c842063c418ebe2a19fb $IMAGE/index.json" + "27bcd3b4f31740cc087346382aaba3fe1c01872d75ead1bd2b9f7053d2bb3231 $IMAGE/blobs/sha256/27bcd3b4f31740cc087346382aaba3fe1c01872d75ead1bd2b9f7053d2bb3231" "e7013826daf8b5d68f82c5b790ca5e9de222a834f2cb3fe3532030161bd72083 $IMAGE/blobs/sha256/e7013826daf8b5d68f82c5b790ca5e9de222a834f2cb3fe3532030161bd72083" "f4a39a97d97aa834da7ad2d92940f9636a57e3d9b3cc7c53242451b02a6cea89 $IMAGE/blobs/sha256/f4a39a97d97aa834da7ad2d92940f9636a57e3d9b3cc7c53242451b02a6cea89" )