From 8e3f096bc5805c58c9049c18d20f0b89cbf72db6 Mon Sep 17 00:00:00 2001 From: Scott O'Neil Date: Wed, 24 Jan 2024 14:26:04 -0800 Subject: [PATCH 1/2] yar --- src/Sarif/ZipArchiveArtifact.cs | 25 +++++++++++++------ .../ArtifactProviderTests.cs | 16 +++++++----- 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/src/Sarif/ZipArchiveArtifact.cs b/src/Sarif/ZipArchiveArtifact.cs index 26b9da28e..fb39a4d54 100644 --- a/src/Sarif/ZipArchiveArtifact.cs +++ b/src/Sarif/ZipArchiveArtifact.cs @@ -34,8 +34,8 @@ public bool IsBinary { get { - string extension = Path.GetExtension(Uri.ToString()); - return this.binaryExtensions.Contains(extension); + GetArtifactData(); + return this.bytes != null; } } @@ -91,8 +91,20 @@ public byte[] Bytes { if (this.contents == null && this.bytes == null) { - string extension = Path.GetExtension(Uri.ToString()); - if (this.binaryExtensions.Contains(extension)) + const int PeekWindowBytes = 1024; + var peekable = new PeekableStream(this.Stream, PeekWindowBytes); + + byte[] header = new byte[PeekWindowBytes]; + int length = this.Stream.Read(header, 0, header.Length); + bool isText = FileEncoding.IsTextualData(header, 0, length); + + peekable.Rewind(); + + if (isText) + { + this.contents = new StreamReader(Stream).ReadToEnd(); + } + else { // The underlying System.IO.Compression.DeflateStream throws on reads to get_Length. 
using var ms = new MemoryStream((int)SizeInBytes.Value); @@ -113,12 +125,9 @@ public byte[] Bytes ms.Read(this.bytes, 0, this.bytes.Length); } } - else - { - this.contents = new StreamReader(Stream).ReadToEnd(); - } } } + this.entry = null; } diff --git a/src/Test.UnitTests.Sarif/ArtifactProviderTests.cs b/src/Test.UnitTests.Sarif/ArtifactProviderTests.cs index 0916c47fc..b8faa0bd5 100644 --- a/src/Test.UnitTests.Sarif/ArtifactProviderTests.cs +++ b/src/Test.UnitTests.Sarif/ArtifactProviderTests.cs @@ -6,7 +6,6 @@ using System.IO; using System.IO.Compression; using System.Linq; -using System.Text; using FluentAssertions; @@ -32,14 +31,19 @@ public void MultithreadedZipArchiveArtifactProvider_RetrieveSizeInBytesBeforeRet [Fact] public void MultithreadedZipArchiveArtifactProvider_RetrieveSizeInBytesBeforeRetrievingBytes() { - string entryContents = $"{Guid.NewGuid()}"; + string filePath = this.GetType().Assembly.Location; + using FileStream reader = File.OpenRead(filePath); + + int headerSize = 1024; + byte[] data = new byte[headerSize]; + reader.Read(data, 0, data.Length); - // Note that even thought we populate an archive with text contents, the extension - // of the archive entry indicates a binary file. So we expect binary data on expansion. - ZipArchive zip = CreateZipArchiveWithTextContents("test.exe", entryContents); + // Note that even though we populate an archive with binary contents, the extension + // of the archive entry indicates a text file. We still expect binary data on expansion. 
+ ZipArchive zip = CreateZipArchiveWithBinaryContents("test.txt", data); var artifactProvider = new MultithreadedZipArchiveArtifactProvider(zip, FileSystem.Instance); - ValidateBinaryContents(artifactProvider.Artifacts, Encoding.UTF8.GetBytes(entryContents)); + ValidateBinaryContents(artifactProvider.Artifacts, data); } [Fact] From c4de01e54d81d139f9233b9be2da221b9aa53d8f Mon Sep 17 00:00:00 2001 From: Scott O'Neil Date: Wed, 31 Jan 2024 16:23:06 -0800 Subject: [PATCH 2/2] release notes --- ReleaseHistory.md | 1 + 1 file changed, 1 insertion(+) diff --git a/ReleaseHistory.md b/ReleaseHistory.md index 49815dd3f..be6b1c9e1 100644 --- a/ReleaseHistory.md +++ b/ReleaseHistory.md @@ -8,6 +8,7 @@ * BUG: Update `Stack.Create` method to populate missing `PhysicalLocation` instances when stack frames reference relative file paths. * BUG: Fix `UnsupportedOperationException` in `ZipArchiveArtifact`. * NEW: Add `IsBinary` property to `IEnumeratedArtifact` and implement the property in `ZipArchiveArtifact`. +* NEW: Switch to content-based `IsBinary` categorization for `ZipArchiveArtifact`s. * PRF: Change default `max-file-size-in-kb` parameter to 10 megabytes. * PRF: Add support for efficiently peeking into non-seekable streams for binary/text categorization. * NEW: Add a new `--timeout-in-seconds` parameter to `AnalyzeOptionsBase`, which will override the `TimeoutInMilliseconds` property in `AnalyzeContextBase`.