diff --git a/ReleaseHistory.md b/ReleaseHistory.md index 991ae3374..2a0210306 100644 --- a/ReleaseHistory.md +++ b/ReleaseHistory.md @@ -5,6 +5,7 @@ * DEP: Remove spurious references to `System.Collections.Immutable`. * DEP: Update `Microsoft.Data.SqlClient` reference from 2.1.2 to 2.1.7 in `WorkItems` and `Sarif.Multitool.Library` to resolve [CVE-2024-0056](https://github.com/advisories/GHSA-98g6-xh36-x2p7). * DEP: Update `System.Data.SqlClient` reference from 4.8.5 to 4.8.6 in `WorkItems` to resolve [CVE-2024-0056](https://github.com/advisories/GHSA-98g6-xh36-x2p7). +* BUG: Improve `FileEncoding.IsTextualData` method to detect binary files. * BUG: Update `Stack.Create` method to populate missing `PhysicalLocation` instances when stack frames reference relative file paths. * BUG: Fix `UnsupportedOperationException` in `ZipArchiveArtifact`. * BUG: Fix `MultithreadedAnalyzeCommandBase` to return rich return code with the `--rich-return-code` option. diff --git a/src/Sarif/FileEncoding.cs b/src/Sarif/FileEncoding.cs index 959168ac6..82f5c76df 100644 --- a/src/Sarif/FileEncoding.cs +++ b/src/Sarif/FileEncoding.cs @@ -2,12 +2,8 @@ // Licensed under the MIT license. See LICENSE file in the project root for full license information. using System; -using System.IO; -using System.Linq; -using System.Runtime.InteropServices; using System.Text; -using FastSerialization; namespace Microsoft.CodeAnalysis.Sarif { @@ -28,16 +24,16 @@ public static bool IsTextualData(byte[] bytes) /// /// Detects if the provided byte array contains textual data. The heuristic applied - /// is to allow .NET to attempt to decode the byte array as a sequence of characters. - /// If that decoding generates does not generate a Unicode replacement character, - /// the data is classified as binary rather than text. + /// is to first examine the data for any control characters (<0x20). If none whatsoever + /// are found, the data is classified as textual. Otherwise, if the inspected bytes + /// decode successfully as either Unicode or UTF32, the data is classified as textual. + /// This helper does not currently properly handled detected UnicodeBigEndian data. /// /// The raw data expressed as bytes. /// The starting position to being classification. /// The maximal count of characters to decode. public static bool IsTextualData(byte[] bytes, int start, int count) { - Windows1252 = Windows1252 ?? Encoding.GetEncoding(1252); bytes = bytes ?? throw new ArgumentNullException(nameof(bytes)); if (start >= bytes.Length) @@ -45,62 +41,38 @@ public static bool IsTextualData(byte[] bytes, int start, int count) throw new ArgumentOutOfRangeException(nameof(start), $"Buffer size ({bytes.Length}) not valid for start ({start}) argument."); } - foreach (Encoding encoding in new[] { Encoding.UTF8, Windows1252, Encoding.UTF32 }) - { - if (count < bytes.Length) - { - var span = new ReadOnlySpan(bytes, start, count); - bytes = span.ToArray(); - } - using var reader = new StreamReader(new MemoryStream(bytes), encoding, detectEncodingFromByteOrderMarks: true); - reader.BaseStream.Seek(start, SeekOrigin.Begin); + Windows1252 = Windows1252 ?? Encoding.GetEncoding(1252); - bool isTextual = true; - bool continueProcessing = true; + bool containsControlCharacters = false; - for (int i = start; i < count; i++) - { - int ch = reader.Read(); + for (int i = 0; i < bytes.Length; i++) + { + containsControlCharacters |= bytes[i] < 0x20; + } - if (ch == -1) - { - break; - } + if (!containsControlCharacters) + { + return true; + } - // Because we enable 'detectEncodingFromByteOrderMarks' we will skip past any NUL - // characters in the file that might result from being BOM-prefixed. So any - // evidence of this character is an indicator of binary data. - if (encoding != Encoding.UTF8 && ch == '\0') - { - // This condition indicates binary data in all cases, when encountered for - // Windows 1252 or UTF32. For UTF8, we determine data is binary by observing - // that a character has been dropped in favor of the Unicode replacement char. - isTextual = false; - continueProcessing = false; - break; - } + foreach (Encoding encoding in new[] { Encoding.UTF32, Encoding.Unicode }) + { + bool encodingSucceeded = true; - // Unicode REPLACEMENT CHARACTER (U+FFFD) - if (encoding == Encoding.UTF8 && ch == 65533) + foreach (char c in encoding.GetChars(bytes, start, count)) + { + if (c == 0xfffd) { - isTextual = false; + encodingSucceeded = false; break; } } - if (!continueProcessing) - { - return isTextual; - } - - if (isTextual) + if (encodingSucceeded) { return true; } - - // In this code path, a single encoding determined that the data was *not* textual, - // but we think we should continue to examine other text encodings to see the result. } return false; diff --git a/src/Test.UnitTests.Sarif/FileEncodingTests.cs b/src/Test.UnitTests.Sarif/FileEncodingTests.cs index 906f435db..afd655d02 100644 --- a/src/Test.UnitTests.Sarif/FileEncodingTests.cs +++ b/src/Test.UnitTests.Sarif/FileEncodingTests.cs @@ -3,11 +3,13 @@ using System; using System.Collections.Generic; -using System.Collections.Immutable; using System.IO; using System.Text; using FluentAssertions; +using FluentAssertions.Execution; + +using Microsoft.CodeAnalysis.Sarif.Driver; using Xunit; @@ -32,42 +34,74 @@ public void FileEncoding_StartExceedsBufferLength() [Fact] public void FileEncoding_FileEncoding_IsBinary() { - ValidateIsBinary("Sarif.dll"); - ValidateIsBinary("Sarif.pdb"); + var assertionScope = new AssertionScope(); + + var binaryExtensions = new HashSet(StringComparer.OrdinalIgnoreCase) + { + ".cer", + ".der", + ".pfx", + ".dll", + ".exe", + }; + + var observedExtensions = new HashSet(StringComparer.OrdinalIgnoreCase); + + var fileSpecifier = new FileSpecifier("*", recurse: false); + + foreach (string fileName in fileSpecifier.Files) + { + string extension = Path.GetExtension(fileName); + if (observedExtensions.Contains(extension)) { continue; } + observedExtensions.Add(extension); + + using FileStream reader = File.OpenRead(fileName); + int bufferSize = 1024; + byte[] bytes = new byte[bufferSize]; + reader.Read(bytes, 0, bufferSize); + bool isTextual = FileEncoding.IsTextualData(bytes); + + if (!isTextual) + { + binaryExtensions.Should().Contain(extension, because: $"'{fileName}' was classified as a binary file"); + } + } } private void ValidateIsBinary(string fileName) { + if (!File.Exists(fileName)) + { + fileName = Path.Combine(Environment.CurrentDirectory, fileName); + } + fileName = Path.Combine(Environment.CurrentDirectory, fileName); using FileStream reader = File.OpenRead(fileName); int bufferSize = 1024; byte[] bytes = new byte[bufferSize]; reader.Read(bytes, 0, bufferSize); - FileEncoding.IsTextualData(bytes).Should().BeFalse(); + FileEncoding.IsTextualData(bytes).Should().BeFalse(because: $"{fileName} is a binary file"); } [Fact] public void FileEncoding_UnicodeDataIsTextual() { + using var assertionScope = new AssertionScope(); + var sb = new StringBuilder(); string unicodeText = "американец"; foreach (Encoding encoding in new[] { Encoding.Unicode, Encoding.UTF8, Encoding.BigEndianUnicode, Encoding.UTF32 }) { byte[] input = encoding.GetBytes(unicodeText); - if (!FileEncoding.IsTextualData(input)) - { - sb.AppendLine($"\tThe '{encoding.EncodingName}' encoding classified unicode text '{unicodeText}' as binary data."); - } + FileEncoding.IsTextualData(input).Should().BeTrue(because: $"'{unicodeText}' encoded as '{encoding.EncodingName}' should not be classified as binary data"); } - - sb.Length.Should().Be(0, because: $"all unicode strings should be classified as textual:{Environment.NewLine}{sb}"); } [Fact] public void FileEncoding_BinaryDataIsBinary() { - var sb = new StringBuilder(); + using var assertionScope = new AssertionScope(); foreach (string binaryName in new[] { "Certificate.cer", "Certificate.der", "PasswordProtected.pfx" }) { @@ -80,13 +114,8 @@ public void FileEncoding_BinaryDataIsBinary() Stream = resource, }; - if (artifact.Bytes == null) - { - sb.AppendLine($"\tBinary file '{binaryName}' was classified as textual data."); - } + artifact.Bytes.Should().NotBeNull(because: $"'{binaryName}' should be classified as binary data."); } - - sb.Length.Should().Be(0, because: $"all binary files should be classified as binary:{Environment.NewLine}{sb}"); } [Fact] @@ -104,7 +133,8 @@ public void FileEncoding_AllUtf8EncodingsAreTextual() private static void ValidateEncoding(string encodingName, Encoding encoding) { - var sb = new StringBuilder(65536 * 100); + using var assertionScope = new AssertionScope(); + Stream resource = typeof(FileEncodingTests).Assembly.GetManifestResourceStream($"Test.UnitTests.Sarif.TestData.FileEncoding.{encodingName}.txt"); using var reader = new StreamReader(resource, Encoding.UTF8); @@ -113,14 +143,12 @@ private static void ValidateEncoding(string encodingName, Encoding encoding) { char ch = (char)current; byte[] input = encoding.GetBytes(new[] { ch }); - if (!FileEncoding.IsTextualData(input)) - { - string unicodeText = "\\u" + ((int)ch).ToString("d4"); - sb.AppendLine($"\t{encodingName} character '{unicodeText}' ({encoding.GetString(input)}) was classified as binary data."); - } - } - sb.Length.Should().Be(0, because: $"all {encodingName} encodable character should be classified as textual:{Environment.NewLine}{sb}"); + if (ch < 0x20) { continue; } + + string unicodeText = "\\u" + ((int)ch).ToString("d4"); + FileEncoding.IsTextualData(input).Should().BeTrue(because: $"{encodingName} character '{unicodeText}' ({encoding.GetString(input)}) should not classify as binary data."); + } } } }