diff --git a/ReleaseHistory.md b/ReleaseHistory.md
index 991ae3374..2a0210306 100644
--- a/ReleaseHistory.md
+++ b/ReleaseHistory.md
@@ -5,6 +5,7 @@
* DEP: Remove spurious references to `System.Collections.Immutable`.
* DEP: Update `Microsoft.Data.SqlClient` reference from 2.1.2 to 2.1.7 in `WorkItems` and `Sarif.Multitool.Library` to resolve [CVE-2024-0056](https://github.com/advisories/GHSA-98g6-xh36-x2p7).
* DEP: Update `System.Data.SqlClient` reference from 4.8.5 to 4.8.6 in `WorkItems` to resolve [CVE-2024-0056](https://github.com/advisories/GHSA-98g6-xh36-x2p7).
+* BUG: Improve `FileEncoding.IsTextualData` method to detect binary files.
* BUG: Update `Stack.Create` method to populate missing `PhysicalLocation` instances when stack frames reference relative file paths.
* BUG: Fix `UnsupportedOperationException` in `ZipArchiveArtifact`.
* BUG: Fix `MultithreadedAnalyzeCommandBase` to return rich return code with the `--rich-return-code` option.
diff --git a/src/Sarif/FileEncoding.cs b/src/Sarif/FileEncoding.cs
index 959168ac6..82f5c76df 100644
--- a/src/Sarif/FileEncoding.cs
+++ b/src/Sarif/FileEncoding.cs
@@ -2,12 +2,8 @@
// Licensed under the MIT license. See LICENSE file in the project root for full license information.
using System;
-using System.IO;
-using System.Linq;
-using System.Runtime.InteropServices;
using System.Text;
-using FastSerialization;
namespace Microsoft.CodeAnalysis.Sarif
{
@@ -28,16 +24,16 @@ public static bool IsTextualData(byte[] bytes)
///
/// Detects if the provided byte array contains textual data. The heuristic applied
- /// is to allow .NET to attempt to decode the byte array as a sequence of characters.
- /// If that decoding generates does not generate a Unicode replacement character,
- /// the data is classified as binary rather than text.
+ /// is to first examine the data for any control characters (<0x20). If none whatsoever
+ /// are found, the data is classified as textual. Otherwise, if the inspected bytes
+ /// decode successfully as either Unicode or UTF32, the data is classified as textual.
+ /// This helper does not currently properly handled detected UnicodeBigEndian data.
///
/// The raw data expressed as bytes.
/// The starting position to being classification.
/// The maximal count of characters to decode.
public static bool IsTextualData(byte[] bytes, int start, int count)
{
- Windows1252 = Windows1252 ?? Encoding.GetEncoding(1252);
bytes = bytes ?? throw new ArgumentNullException(nameof(bytes));
if (start >= bytes.Length)
@@ -45,62 +41,38 @@ public static bool IsTextualData(byte[] bytes, int start, int count)
throw new ArgumentOutOfRangeException(nameof(start), $"Buffer size ({bytes.Length}) not valid for start ({start}) argument.");
}
- foreach (Encoding encoding in new[] { Encoding.UTF8, Windows1252, Encoding.UTF32 })
- {
- if (count < bytes.Length)
- {
- var span = new ReadOnlySpan(bytes, start, count);
- bytes = span.ToArray();
- }
- using var reader = new StreamReader(new MemoryStream(bytes), encoding, detectEncodingFromByteOrderMarks: true);
- reader.BaseStream.Seek(start, SeekOrigin.Begin);
+ Windows1252 = Windows1252 ?? Encoding.GetEncoding(1252);
- bool isTextual = true;
- bool continueProcessing = true;
+ bool containsControlCharacters = false;
- for (int i = start; i < count; i++)
- {
- int ch = reader.Read();
+ for (int i = 0; i < bytes.Length; i++)
+ {
+ containsControlCharacters |= bytes[i] < 0x20;
+ }
- if (ch == -1)
- {
- break;
- }
+ if (!containsControlCharacters)
+ {
+ return true;
+ }
- // Because we enable 'detectEncodingFromByteOrderMarks' we will skip past any NUL
- // characters in the file that might result from being BOM-prefixed. So any
- // evidence of this character is an indicator of binary data.
- if (encoding != Encoding.UTF8 && ch == '\0')
- {
- // This condition indicates binary data in all cases, when encountered for
- // Windows 1252 or UTF32. For UTF8, we determine data is binary by observing
- // that a character has been dropped in favor of the Unicode replacement char.
- isTextual = false;
- continueProcessing = false;
- break;
- }
+ foreach (Encoding encoding in new[] { Encoding.UTF32, Encoding.Unicode })
+ {
+ bool encodingSucceeded = true;
- // Unicode REPLACEMENT CHARACTER (U+FFFD)
- if (encoding == Encoding.UTF8 && ch == 65533)
+ foreach (char c in encoding.GetChars(bytes, start, count))
+ {
+ if (c == 0xfffd)
{
- isTextual = false;
+ encodingSucceeded = false;
break;
}
}
- if (!continueProcessing)
- {
- return isTextual;
- }
-
- if (isTextual)
+ if (encodingSucceeded)
{
return true;
}
-
- // In this code path, a single encoding determined that the data was *not* textual,
- // but we think we should continue to examine other text encodings to see the result.
}
return false;
diff --git a/src/Test.UnitTests.Sarif/FileEncodingTests.cs b/src/Test.UnitTests.Sarif/FileEncodingTests.cs
index 906f435db..afd655d02 100644
--- a/src/Test.UnitTests.Sarif/FileEncodingTests.cs
+++ b/src/Test.UnitTests.Sarif/FileEncodingTests.cs
@@ -3,11 +3,13 @@
using System;
using System.Collections.Generic;
-using System.Collections.Immutable;
using System.IO;
using System.Text;
using FluentAssertions;
+using FluentAssertions.Execution;
+
+using Microsoft.CodeAnalysis.Sarif.Driver;
using Xunit;
@@ -32,42 +34,74 @@ public void FileEncoding_StartExceedsBufferLength()
[Fact]
public void FileEncoding_FileEncoding_IsBinary()
{
- ValidateIsBinary("Sarif.dll");
- ValidateIsBinary("Sarif.pdb");
+ var assertionScope = new AssertionScope();
+
+ var binaryExtensions = new HashSet(StringComparer.OrdinalIgnoreCase)
+ {
+ ".cer",
+ ".der",
+ ".pfx",
+ ".dll",
+ ".exe",
+ };
+
+ var observedExtensions = new HashSet(StringComparer.OrdinalIgnoreCase);
+
+ var fileSpecifier = new FileSpecifier("*", recurse: false);
+
+ foreach (string fileName in fileSpecifier.Files)
+ {
+ string extension = Path.GetExtension(fileName);
+ if (observedExtensions.Contains(extension)) { continue; }
+ observedExtensions.Add(extension);
+
+ using FileStream reader = File.OpenRead(fileName);
+ int bufferSize = 1024;
+ byte[] bytes = new byte[bufferSize];
+ reader.Read(bytes, 0, bufferSize);
+ bool isTextual = FileEncoding.IsTextualData(bytes);
+
+ if (!isTextual)
+ {
+ binaryExtensions.Should().Contain(extension, because: $"'{fileName}' was classified as a binary file");
+ }
+ }
}
private void ValidateIsBinary(string fileName)
{
+ if (!File.Exists(fileName))
+ {
+ fileName = Path.Combine(Environment.CurrentDirectory, fileName);
+ }
+
fileName = Path.Combine(Environment.CurrentDirectory, fileName);
using FileStream reader = File.OpenRead(fileName);
int bufferSize = 1024;
byte[] bytes = new byte[bufferSize];
reader.Read(bytes, 0, bufferSize);
- FileEncoding.IsTextualData(bytes).Should().BeFalse();
+ FileEncoding.IsTextualData(bytes).Should().BeFalse(because: $"{fileName} is a binary file");
}
[Fact]
public void FileEncoding_UnicodeDataIsTextual()
{
+ using var assertionScope = new AssertionScope();
+
var sb = new StringBuilder();
string unicodeText = "американец";
foreach (Encoding encoding in new[] { Encoding.Unicode, Encoding.UTF8, Encoding.BigEndianUnicode, Encoding.UTF32 })
{
byte[] input = encoding.GetBytes(unicodeText);
- if (!FileEncoding.IsTextualData(input))
- {
- sb.AppendLine($"\tThe '{encoding.EncodingName}' encoding classified unicode text '{unicodeText}' as binary data.");
- }
+ FileEncoding.IsTextualData(input).Should().BeTrue(because: $"'{unicodeText}' encoded as '{encoding.EncodingName}' should not be classified as binary data");
}
-
- sb.Length.Should().Be(0, because: $"all unicode strings should be classified as textual:{Environment.NewLine}{sb}");
}
[Fact]
public void FileEncoding_BinaryDataIsBinary()
{
- var sb = new StringBuilder();
+ using var assertionScope = new AssertionScope();
foreach (string binaryName in new[] { "Certificate.cer", "Certificate.der", "PasswordProtected.pfx" })
{
@@ -80,13 +114,8 @@ public void FileEncoding_BinaryDataIsBinary()
Stream = resource,
};
- if (artifact.Bytes == null)
- {
- sb.AppendLine($"\tBinary file '{binaryName}' was classified as textual data.");
- }
+ artifact.Bytes.Should().NotBeNull(because: $"'{binaryName}' should be classified as binary data.");
}
-
- sb.Length.Should().Be(0, because: $"all binary files should be classified as binary:{Environment.NewLine}{sb}");
}
[Fact]
@@ -104,7 +133,8 @@ public void FileEncoding_AllUtf8EncodingsAreTextual()
private static void ValidateEncoding(string encodingName, Encoding encoding)
{
- var sb = new StringBuilder(65536 * 100);
+ using var assertionScope = new AssertionScope();
+
Stream resource = typeof(FileEncodingTests).Assembly.GetManifestResourceStream($"Test.UnitTests.Sarif.TestData.FileEncoding.{encodingName}.txt");
using var reader = new StreamReader(resource, Encoding.UTF8);
@@ -113,14 +143,12 @@ private static void ValidateEncoding(string encodingName, Encoding encoding)
{
char ch = (char)current;
byte[] input = encoding.GetBytes(new[] { ch });
- if (!FileEncoding.IsTextualData(input))
- {
- string unicodeText = "\\u" + ((int)ch).ToString("d4");
- sb.AppendLine($"\t{encodingName} character '{unicodeText}' ({encoding.GetString(input)}) was classified as binary data.");
- }
- }
- sb.Length.Should().Be(0, because: $"all {encodingName} encodable character should be classified as textual:{Environment.NewLine}{sb}");
+ if (ch < 0x20) { continue; }
+
+ string unicodeText = "\\u" + ((int)ch).ToString("d4");
+ FileEncoding.IsTextualData(input).Should().BeTrue(because: $"{encodingName} character '{unicodeText}' ({encoding.GetString(input)}) should not classify as binary data.");
+ }
}
}
}