-
Notifications
You must be signed in to change notification settings - Fork 92
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add binary vs. textual file enumeration. #2739
Changes from 3 commits
cf43254
ee6e98f
4dc72e7
e82f591
3162333
bec5f00
4a03e56
677d01d
59329d0
2fd45b3
06c865a
0908d50
5d252fc
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,7 +5,6 @@ | |
using System.Collections.Generic; | ||
using System.IO; | ||
using System.IO.Compression; | ||
using System.Text; | ||
|
||
namespace Microsoft.CodeAnalysis.Sarif | ||
{ | ||
|
@@ -25,108 +24,4 @@ public ArtifactProvider(IEnumerable<IEnumeratedArtifact> artifacts) | |
|
||
public IFileSystem FileSystem { get; set; } | ||
} | ||
|
||
public class SinglethreadedZipArchiveArtifactProvider : ArtifactProvider | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
{ | ||
public SinglethreadedZipArchiveArtifactProvider(ZipArchive zipArchive, IFileSystem fileSystem) : base(fileSystem) | ||
{ | ||
var artifacts = new List<IEnumeratedArtifact>(); | ||
|
||
foreach (ZipArchiveEntry entry in zipArchive.Entries) | ||
{ | ||
artifacts.Add(new EnumeratedArtifact(Sarif.FileSystem.Instance) | ||
{ | ||
Uri = new Uri(entry.FullName, UriKind.RelativeOrAbsolute), | ||
Contents = new StreamReader(entry.Open()).ReadToEnd() | ||
}); | ||
} | ||
|
||
Artifacts = artifacts; | ||
} | ||
} | ||
|
||
public class MultithreadedZipArchiveArtifactProvider : ArtifactProvider | ||
{ | ||
private readonly ZipArchive zipArchive; | ||
|
||
public MultithreadedZipArchiveArtifactProvider(ZipArchive zipArchive, IFileSystem fileSystem) : base(fileSystem) | ||
{ | ||
this.zipArchive = zipArchive; | ||
} | ||
|
||
public override IEnumerable<IEnumeratedArtifact> Artifacts | ||
{ | ||
get | ||
{ | ||
foreach (ZipArchiveEntry entry in this.zipArchive.Entries) | ||
{ | ||
yield return new ZipArchiveArtifact(this.zipArchive, entry); | ||
} | ||
} | ||
} | ||
} | ||
|
||
public class ZipArchiveArtifact : IEnumeratedArtifact | ||
{ | ||
private ZipArchiveEntry entry; | ||
private readonly ZipArchive archive; | ||
private readonly Uri uri; | ||
private string contents; | ||
|
||
public ZipArchiveArtifact(ZipArchive archive, ZipArchiveEntry entry) | ||
{ | ||
this.entry = entry; | ||
this.archive = archive; | ||
this.uri = new Uri(entry.FullName, UriKind.RelativeOrAbsolute); | ||
} | ||
|
||
public Uri Uri => this.uri; | ||
|
||
public Stream Stream | ||
{ | ||
get | ||
{ | ||
lock (this.archive) | ||
{ | ||
return entry != null | ||
? entry.Open() | ||
: null; | ||
} | ||
} | ||
set => throw new NotImplementedException(); | ||
} | ||
|
||
public Encoding Encoding { get => throw new NotImplementedException(); set => throw new NotImplementedException(); } | ||
|
||
public string Contents | ||
{ | ||
get | ||
{ | ||
lock (this.archive) | ||
{ | ||
if (this.contents != null) { return this.contents; } | ||
this.contents = new StreamReader(Stream).ReadToEnd(); | ||
this.entry = null; | ||
return this.contents; | ||
} | ||
} | ||
set => throw new NotImplementedException(); | ||
} | ||
|
||
public long? SizeInBytes | ||
{ | ||
get | ||
{ | ||
lock (this.archive) | ||
{ | ||
if (this.entry != null) | ||
{ | ||
return this.entry.Length; | ||
} | ||
return this.contents?.Length; | ||
} | ||
} | ||
set => throw new NotImplementedException(); | ||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,44 +16,152 @@ public EnumeratedArtifact(IFileSystem fileSystem) | |
FileSystem = fileSystem; | ||
} | ||
|
||
internal byte[] bytes; | ||
internal string contents; | ||
|
||
private bool? isBinary; | ||
private Encoding encoding; | ||
|
||
public Uri Uri { get; set; } | ||
|
||
public bool IsBinary | ||
{ | ||
get | ||
{ | ||
return this.isBinary ?? IsArtifactBinary(); | ||
} | ||
} | ||
|
||
internal bool IsArtifactBinary() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Suggest: make private to narrow API surface to IsBinary and focus internal use/experience on the more public version? #Resolved There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 'internal' is still private, i tend to do this to make things testable but not public. As it happens I can now delete this helper entirely. It now occurs to me: this new property value is untested!! Need to add it to the test validations. So I'm glad you pointed to it. :) |
||
{ | ||
if (isBinary.HasValue) | ||
{ | ||
return isBinary.Value; | ||
} | ||
|
||
GetArtifactData(); | ||
this.isBinary = this.contents == null; | ||
return this.isBinary.Value; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Combining the last two statements, will the return value be null? #Resolved There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes! and just delete the backing field. |
||
} | ||
|
||
public Stream Stream { get; set; } | ||
|
||
public Encoding Encoding { get; set; } | ||
public Encoding Encoding | ||
{ | ||
get | ||
{ | ||
if (encoding == null) | ||
{ | ||
GetArtifactData(); | ||
} | ||
return this.encoding; | ||
} | ||
|
||
set => this.encoding = value; | ||
} | ||
|
||
internal IFileSystem FileSystem { get; set; } | ||
|
||
/// <summary> | ||
/// Gets or sets a property that determines whether the enumerated artifact | ||
/// can obtain data from a non-seekable stream. This operation will be | ||
/// less performant in any case where an artifact is a textual file (as | ||
/// the file will first be converted to a byte array to determine | ||
/// whether it is textual and subsequently turned into a string if it is). | ||
/// </summary> | ||
public bool SupportNonSeekableStreams { get; set; } | ||
|
||
public string Contents | ||
{ | ||
get => GetContents(); | ||
get => GetArtifactData().text; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is the core change. On any attempt to access a property that returns artifact contents, we attempt to fault in the data. This may entail reading the contents from disk or a stream. We now sniff the data to determine whether it is binary or textual and populate |
||
set => this.contents = value; | ||
} | ||
|
||
private string GetContents() | ||
public byte[] Bytes | ||
{ | ||
get => GetArtifactData().bytes; | ||
set => this.bytes = value; | ||
} | ||
|
||
private (string text, byte[] bytes) GetArtifactData() | ||
{ | ||
if (this.contents != null) { return this.contents; } | ||
if (this.contents != null) | ||
{ | ||
return (this.contents, bytes: null); | ||
} | ||
|
||
if (this.bytes != null) | ||
{ | ||
return (text: null, this.bytes); | ||
} | ||
|
||
if (Stream == null && this.contents == null && this.bytes == null) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. made this consistent for this file. We should discuss this style point, whether properties have a |
||
{ | ||
if (Uri == null || (Uri.IsAbsoluteUri && !Uri.IsFile)) | ||
{ | ||
throw new InvalidOperationException(); | ||
} | ||
|
||
// This is our client-side, disk-based file retrieval case. | ||
|
||
if (Stream == null && this.contents == null) | ||
Stream = FileSystem.FileOpenRead(Uri.LocalPath); | ||
} | ||
|
||
bool isText = false; | ||
|
||
if (Stream.CanSeek) | ||
{ | ||
// TBD we actually have no validation URI is non-null yet. | ||
contents = Uri!.IsFile | ||
? FileSystem.FileReadAllText(Uri.LocalPath) | ||
: null; | ||
// Reset to beginning of stream in case caller neglected to do so. | ||
this.Stream.Seek(0, SeekOrigin.Begin); | ||
|
||
byte[] header = new byte[4096]; | ||
int length = this.Stream.Read(header, 0, header.Length); | ||
isText = FileEncoding.CheckForTextualData(header, 0, length, out this.encoding); | ||
|
||
if (isText) | ||
{ | ||
// If we have textual data and the encoding was null, we are UTF8 | ||
// (which will be a perfectly valid encoding for ASCII as well). | ||
this.encoding ??= Encoding.UTF8; | ||
} | ||
|
||
this.sizeInBytes = (long?)this.contents?.Length; | ||
this.Stream.Seek(0, SeekOrigin.Begin); | ||
|
||
if (isText) | ||
{ | ||
using var contentReader = new StreamReader(Stream); | ||
this.contents = contentReader.ReadToEnd(); | ||
} | ||
else | ||
{ | ||
this.bytes = new byte[Stream.Length]; | ||
this.Stream.Read(this.bytes, 0, bytes.Length); | ||
} | ||
} | ||
else | ||
{ | ||
if (Stream.CanSeek) { this.Stream.Seek(0, SeekOrigin.Begin); } | ||
using var contentReader = new StreamReader(Stream); | ||
this.contents = contentReader.ReadToEnd(); | ||
Stream = null; | ||
if (!SupportNonSeekableStreams) | ||
{ | ||
throw new InvalidOperationException("Stream is not seekable. Provide a seekable stream or set the 'SupportNonSeekableStreams' property."); | ||
} | ||
|
||
this.bytes = new byte[Stream.Length]; | ||
int length = this.Stream.Read(this.bytes, 0, this.bytes.Length); | ||
isText = FileEncoding.CheckForTextualData(this.bytes, 0, length, out this.encoding); | ||
|
||
if (isText) | ||
{ | ||
// If we have textual data and the encoding was null, we are UTF8 | ||
// (which will be a perfectly valid encoding for ASCII as well). | ||
this.encoding ??= Encoding.UTF8; | ||
this.contents = encoding.GetString(this.bytes); | ||
this.bytes = null; | ||
} | ||
} | ||
|
||
return this.contents; | ||
Stream = null; | ||
|
||
return (this.contents, this.bytes); | ||
} | ||
|
||
public long? sizeInBytes; | ||
|
@@ -71,11 +179,15 @@ public long? SizeInBytes | |
{ | ||
this.sizeInBytes = (long)this.contents.Length; | ||
} | ||
else if (this.bytes != null) | ||
{ | ||
this.sizeInBytes = (int)this.bytes.Length; | ||
} | ||
else if (this.Stream != null) | ||
{ | ||
this.SizeInBytes = (long)this.Stream.Length; | ||
} | ||
else if (Uri!.IsAbsoluteUri && Uri!.IsFile) | ||
else if (Uri.IsAbsoluteUri && Uri.IsFile) | ||
{ | ||
this.sizeInBytes = (long)FileSystem.FileInfoLength(Uri.LocalPath); | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
...SizeInBytes?.Value ?? 0
SizeInBytes may be null in some weird condition. #WontFix
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, and we should not provoke those conditions. :) There is a unit test for this. What actually happens is that we will raise
InvalidOperationException
for the weird cases, so the nullness operator doesn't help.In our code, however, we always provide a URI and contents and so there shouldn't be a problem. You do not want to code defensively in these cases because if you do so you'll mask an issue that you'd assumed, by design, is protected against.
That can lead to very subtle bugs. Better is to have the code as it is. We think our objects have integrity, if not, we will fail fast with an exception.