Skip to content

Commit

Permalink
Merge branch 'main' into users/cmeyer/multitool-validation-rules
Browse files Browse the repository at this point in the history
  • Loading branch information
shaopeng-gh authored Feb 5, 2024
2 parents c2a630b + f056692 commit 728e6f4
Show file tree
Hide file tree
Showing 4 changed files with 80 additions and 76 deletions.
1 change: 1 addition & 0 deletions ReleaseHistory.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
* DEP: Remove spurious references to `System.Collections.Immutable`.
* DEP: Update `Microsoft.Data.SqlClient` reference from 2.1.2 to 2.1.7 in `WorkItems` and `Sarif.Multitool.Library` to resolve [CVE-2024-0056](https://github.com/advisories/GHSA-98g6-xh36-x2p7).
* DEP: Update `System.Data.SqlClient` reference from 4.8.5 to 4.8.6 in `WorkItems` to resolve [CVE-2024-0056](https://github.com/advisories/GHSA-98g6-xh36-x2p7).
* BUG: Improve `FileEncoding.IsTextualData` method to detect binary files.
* BUG: Update `Stack.Create` method to populate missing `PhysicalLocation` instances when stack frames reference relative file paths.
* BUG: Fix `UnsupportedOperationException` in `ZipArchiveArtifact`.
* BUG: Fix `MultithreadedAnalyzeCommandBase` to return rich return code with the `--rich-return-code` option.
Expand Down
72 changes: 22 additions & 50 deletions src/Sarif/FileEncoding.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,8 @@
// Licensed under the MIT license. See LICENSE file in the project root for full license information.

using System;
using System.IO;
using System.Linq;
using System.Runtime.InteropServices;
using System.Text;

using FastSerialization;

namespace Microsoft.CodeAnalysis.Sarif
{
Expand All @@ -28,79 +24,55 @@ public static bool IsTextualData(byte[] bytes)

/// <summary>
/// Detects if the provided byte array contains textual data. The heuristic applied
/// is to allow .NET to attempt to decode the byte array as a sequence of characters.
/// If that decoding generates does not generate a Unicode replacement character,
/// the data is classified as binary rather than text.
/// is to first examine the data for any control characters (<0x20). If none whatsoever
/// are found, the data is classified as textual. Otherwise, if the inspected bytes
/// decode successfully as either Unicode or UTF32, the data is classified as textual.
/// This helper does not currently properly handled detected UnicodeBigEndian data.
/// </summary>
/// <param name="bytes">The raw data expressed as bytes.</param>
/// <param name="start">The starting position to being classification.</param>
/// <param name="count">The maximal count of characters to decode.</param>
public static bool IsTextualData(byte[] bytes, int start, int count)
{
Windows1252 = Windows1252 ?? Encoding.GetEncoding(1252);
bytes = bytes ?? throw new ArgumentNullException(nameof(bytes));

if (start >= bytes.Length)
{
throw new ArgumentOutOfRangeException(nameof(start), $"Buffer size ({bytes.Length}) not valid for start ({start}) argument.");
}

foreach (Encoding encoding in new[] { Encoding.UTF8, Windows1252, Encoding.UTF32 })
{
if (count < bytes.Length)
{
var span = new ReadOnlySpan<byte>(bytes, start, count);
bytes = span.ToArray();
}

using var reader = new StreamReader(new MemoryStream(bytes), encoding, detectEncodingFromByteOrderMarks: true);
reader.BaseStream.Seek(start, SeekOrigin.Begin);
Windows1252 = Windows1252 ?? Encoding.GetEncoding(1252);

bool isTextual = true;
bool continueProcessing = true;
bool containsControlCharacters = false;

for (int i = start; i < count; i++)
{
int ch = reader.Read();
for (int i = 0; i < bytes.Length; i++)
{
containsControlCharacters |= bytes[i] < 0x20;
}

if (ch == -1)
{
break;
}
if (!containsControlCharacters)
{
return true;
}

// Because we enable 'detectEncodingFromByteOrderMarks' we will skip past any NUL
// characters in the file that might result from being BOM-prefixed. So any
// evidence of this character is an indicator of binary data.
if (encoding != Encoding.UTF8 && ch == '\0')
{
// This condition indicates binary data in all cases, when encountered for
// Windows 1252 or UTF32. For UTF8, we determine data is binary by observing
// that a character has been dropped in favor of the Unicode replacement char.
isTextual = false;
continueProcessing = false;
break;
}
foreach (Encoding encoding in new[] { Encoding.UTF32, Encoding.Unicode })
{
bool encodingSucceeded = true;

// Unicode REPLACEMENT CHARACTER (U+FFFD)
if (encoding == Encoding.UTF8 && ch == 65533)
foreach (char c in encoding.GetChars(bytes, start, count))
{
if (c == 0xfffd)
{
isTextual = false;
encodingSucceeded = false;
break;
}
}

if (!continueProcessing)
{
return isTextual;
}

if (isTextual)
if (encodingSucceeded)
{
return true;
}

// In this code path, a single encoding determined that the data was *not* textual,
// but we think we should continue to examine other text encodings to see the result.
}

return false;
Expand Down
5 changes: 4 additions & 1 deletion src/Sarif/Sarif.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,10 @@
<PackageReference Include="Microsoft.Diagnostics.Tracing.TraceEvent" Version="3.1.3" />
<PackageReference Include="Microsoft.SourceLink.GitHub" Version="1.1.1" PrivateAssets="All" />
<PackageReference Include="System.Collections.Immutable" Version="5.0.0" />
<PackageReference Include="System.Diagnostics.Debug" Version="4.3.0" />
<PackageReference Include="System.IO.FileSystem.Primitives" Version="4.3.0" />
<PackageReference Include="System.Text.Encoding.CodePages" Version="4.3.0" />
<PackageReference Include="System.Text.Encoding.Extensions" Version="4.3.0" />
</ItemGroup>

<ItemGroup Label="Packaging">
Expand All @@ -66,7 +69,7 @@
<PropertyGroup>
<EtwManifestForceAll>true</EtwManifestForceAll>
</PropertyGroup>

<ItemGroup>
<JsonSchemaFile Include="$(MsBuildThisFileDirectory)Schemata\sarif-$(SchemaVersionAsPublishedToSchemaStoreOrg).json">
<Namespace>$(RootNamespace)</Namespace>
Expand Down
78 changes: 53 additions & 25 deletions src/Test.UnitTests.Sarif/FileEncodingTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,13 @@

using System;
using System.Collections.Generic;
using System.Collections.Immutable;
using System.IO;
using System.Text;

using FluentAssertions;
using FluentAssertions.Execution;

using Microsoft.CodeAnalysis.Sarif.Driver;

using Xunit;

Expand All @@ -32,42 +34,74 @@ public void FileEncoding_StartExceedsBufferLength()
[Fact]
public void FileEncoding_FileEncoding_IsBinary()
{
ValidateIsBinary("Sarif.dll");
ValidateIsBinary("Sarif.pdb");
var assertionScope = new AssertionScope();

var binaryExtensions = new HashSet<string>(StringComparer.OrdinalIgnoreCase)
{
".cer",
".der",
".pfx",
".dll",
".exe",
};

var observedExtensions = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

var fileSpecifier = new FileSpecifier("*", recurse: false);

foreach (string fileName in fileSpecifier.Files)
{
string extension = Path.GetExtension(fileName);
if (observedExtensions.Contains(extension)) { continue; }
observedExtensions.Add(extension);

using FileStream reader = File.OpenRead(fileName);
int bufferSize = 1024;
byte[] bytes = new byte[bufferSize];
reader.Read(bytes, 0, bufferSize);
bool isTextual = FileEncoding.IsTextualData(bytes);

if (!isTextual)
{
binaryExtensions.Should().Contain(extension, because: $"'{fileName}' was classified as a binary file");
}
}
}

private void ValidateIsBinary(string fileName)
{
if (!File.Exists(fileName))
{
fileName = Path.Combine(Environment.CurrentDirectory, fileName);
}

fileName = Path.Combine(Environment.CurrentDirectory, fileName);
using FileStream reader = File.OpenRead(fileName);
int bufferSize = 1024;
byte[] bytes = new byte[bufferSize];
reader.Read(bytes, 0, bufferSize);
FileEncoding.IsTextualData(bytes).Should().BeFalse();
FileEncoding.IsTextualData(bytes).Should().BeFalse(because: $"{fileName} is a binary file");
}

[Fact]
public void FileEncoding_UnicodeDataIsTextual()
{
using var assertionScope = new AssertionScope();

var sb = new StringBuilder();
string unicodeText = "американец";

foreach (Encoding encoding in new[] { Encoding.Unicode, Encoding.UTF8, Encoding.BigEndianUnicode, Encoding.UTF32 })
{
byte[] input = encoding.GetBytes(unicodeText);
if (!FileEncoding.IsTextualData(input))
{
sb.AppendLine($"\tThe '{encoding.EncodingName}' encoding classified unicode text '{unicodeText}' as binary data.");
}
FileEncoding.IsTextualData(input).Should().BeTrue(because: $"'{unicodeText}' encoded as '{encoding.EncodingName}' should not be classified as binary data");
}

sb.Length.Should().Be(0, because: $"all unicode strings should be classified as textual:{Environment.NewLine}{sb}");
}

[Fact]
public void FileEncoding_BinaryDataIsBinary()
{
var sb = new StringBuilder();
using var assertionScope = new AssertionScope();

foreach (string binaryName in new[] { "Certificate.cer", "Certificate.der", "PasswordProtected.pfx" })
{
Expand All @@ -80,13 +114,8 @@ public void FileEncoding_BinaryDataIsBinary()
Stream = resource,
};

if (artifact.Bytes == null)
{
sb.AppendLine($"\tBinary file '{binaryName}' was classified as textual data.");
}
artifact.Bytes.Should().NotBeNull(because: $"'{binaryName}' should be classified as binary data.");
}

sb.Length.Should().Be(0, because: $"all binary files should be classified as binary:{Environment.NewLine}{sb}");
}

[Fact]
Expand All @@ -104,7 +133,8 @@ public void FileEncoding_AllUtf8EncodingsAreTextual()

private static void ValidateEncoding(string encodingName, Encoding encoding)
{
var sb = new StringBuilder(65536 * 100);
using var assertionScope = new AssertionScope();

Stream resource = typeof(FileEncodingTests).Assembly.GetManifestResourceStream($"Test.UnitTests.Sarif.TestData.FileEncoding.{encodingName}.txt");
using var reader = new StreamReader(resource, Encoding.UTF8);

Expand All @@ -113,14 +143,12 @@ private static void ValidateEncoding(string encodingName, Encoding encoding)
{
char ch = (char)current;
byte[] input = encoding.GetBytes(new[] { ch });
if (!FileEncoding.IsTextualData(input))
{
string unicodeText = "\\u" + ((int)ch).ToString("d4");
sb.AppendLine($"\t{encodingName} character '{unicodeText}' ({encoding.GetString(input)}) was classified as binary data.");
}
}

sb.Length.Should().Be(0, because: $"all {encodingName} encodable character should be classified as textual:{Environment.NewLine}{sb}");
if (ch < 0x20) { continue; }

string unicodeText = "\\u" + ((int)ch).ToString("d4");
FileEncoding.IsTextualData(input).Should().BeTrue(because: $"{encodingName} character '{unicodeText}' ({encoding.GetString(input)}) should not classify as binary data.");
}
}
}
}

0 comments on commit 728e6f4

Please sign in to comment.