diff --git a/ReleaseHistory.md b/ReleaseHistory.md index 2a0210306..e34c4c66d 100644 --- a/ReleaseHistory.md +++ b/ReleaseHistory.md @@ -10,6 +10,7 @@ * BUG: Fix `UnsupportedOperationException` in `ZipArchiveArtifact`. * BUG: Fix `MultithreadedAnalyzeCommandBase` to return rich return code with the `--rich-return-code` option. * NEW: Add `IsBinary` property to `IEnumeratedArtifact` and implement the property in `ZipArchiveArtifact`. +* NEW: Switch to content-based `IsBinary` categorization for `ZipArchiveArtifact`s. * PRF: Change default `max-file-size-in-kb` parameter to 10 megabytes. * PRF: Add support for efficiently peeking into non-seekable streams for binary/text categorization. * NEW: Add a new `--timeout-in-seconds` parameter to `AnalyzeOptionsBase`, which will override the `TimeoutInMilliseconds` property in `AnalyzeContextBase`. diff --git a/src/Sarif/ZipArchiveArtifact.cs b/src/Sarif/ZipArchiveArtifact.cs index 26b9da28e..fb39a4d54 100644 --- a/src/Sarif/ZipArchiveArtifact.cs +++ b/src/Sarif/ZipArchiveArtifact.cs @@ -34,8 +34,8 @@ public bool IsBinary { get { - string extension = Path.GetExtension(Uri.ToString()); - return this.binaryExtensions.Contains(extension); + GetArtifactData(); + return this.bytes != null; } } @@ -91,8 +91,20 @@ public byte[] Bytes { if (this.contents == null && this.bytes == null) { - string extension = Path.GetExtension(Uri.ToString()); - if (this.binaryExtensions.Contains(extension)) + const int PeekWindowBytes = 1024; + var peekable = new PeekableStream(this.Stream, PeekWindowBytes); + + byte[] header = new byte[PeekWindowBytes]; + int length = this.Stream.Read(header, 0, header.Length); + bool isText = FileEncoding.IsTextualData(header, 0, length); + + peekable.Rewind(); + + if (isText) + { + this.contents = new StreamReader(Stream).ReadToEnd(); + } + else { // The underlying System.IO.Compression.DeflateStream throws on reads to get_Length. using var ms = new MemoryStream((int)SizeInBytes.Value); @@ -113,12 +125,9 @@ public byte[] Bytes ms.Read(this.bytes, 0, this.bytes.Length); } } - else - { - this.contents = new StreamReader(Stream).ReadToEnd(); - } } } + this.entry = null; } diff --git a/src/Test.UnitTests.Sarif/ArtifactProviderTests.cs b/src/Test.UnitTests.Sarif/ArtifactProviderTests.cs index 0916c47fc..b8faa0bd5 100644 --- a/src/Test.UnitTests.Sarif/ArtifactProviderTests.cs +++ b/src/Test.UnitTests.Sarif/ArtifactProviderTests.cs @@ -6,7 +6,6 @@ using System.IO; using System.IO.Compression; using System.Linq; -using System.Text; using FluentAssertions; @@ -32,14 +31,19 @@ public void MultithreadedZipArchiveArtifactProvider_RetrieveSizeInBytesBeforeRet [Fact] public void MultithreadedZipArchiveArtifactProvider_RetrieveSizeInBytesBeforeRetrievingBytes() { - string entryContents = $"{Guid.NewGuid()}"; + string filePath = this.GetType().Assembly.Location; + using FileStream reader = File.OpenRead(filePath); + + int headerSize = 1024; + byte[] data = new byte[headerSize]; + reader.Read(data, 0, data.Length); - // Note that even thought we populate an archive with text contents, the extension - // of the archive entry indicates a binary file. So we expect binary data on expansion. - ZipArchive zip = CreateZipArchiveWithTextContents("test.exe", entryContents); + // Note that even thought we populate an archive with binary contents, the extension + // of the archive entry indicates a text file. We still expect binary data on expansion. + ZipArchive zip = CreateZipArchiveWithBinaryContents("test.txt", data); var artifactProvider = new MultithreadedZipArchiveArtifactProvider(zip, FileSystem.Instance); - ValidateBinaryContents(artifactProvider.Artifacts, Encoding.UTF8.GetBytes(entryContents)); + ValidateBinaryContents(artifactProvider.Artifacts, data); } [Fact]