Skip to content

Commit

Permalink
Fix bug 447: MsExcelDecoder.DecodeAsync only works on text data types (
Browse files Browse the repository at this point in the history
…#450)

See #447

Fix Excel decoder to better support cell types and export numbers and
other values.
The solution is not perfect for dates, currencies and percentages, due
to limitations of the underlying library; more investigation is required
to work around them.
  • Loading branch information
dluc authored May 1, 2024
1 parent 473e6ee commit 3d7d1fd
Show file tree
Hide file tree
Showing 6 changed files with 106 additions and 11 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ public TikTokenTokenizers(ITestOutputHelper output) : base(output)
}

[Fact]
[Trait("Category", "UnitTest")]
[Trait("Category", "AI")]
public void TheyCountTokens()
{
const string text = "{'bos_token': '<|endoftext|>',\n 'eos_token': '<|endoftext|>',\n 'unk_token': '<|endoftext|>'}";
Expand Down
48 changes: 45 additions & 3 deletions service/Core/DataFormats/Office/MsExcelDecoder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -73,12 +73,54 @@ public Task<FileContent> DecodeAsync(Stream data, CancellationToken cancellation
{
IXLCell? cell = cells[i];

/* Note: some data types are not well supported; for example the values below
* are extracted incorrectly regardless of the cell configuration.
* In these cases, using the Text cell type might be better.
*
* - Date: "Monday, December 25, 2090" => "69757"
* - Time: "12:55:00" => "0.5381944444444444"
* - Time: "12:55" => "12/31/1899"
* - Currency symbols are not extracted
*/
if (this._config.WithQuotes)
{
sb.Append('"');
sb.Append(cell is { Value.IsText: true }
? cell.Value.GetText().Replace("\"", "\"\"", StringComparison.Ordinal)
: this._config.BlankCellValue);
if (cell == null || cell.Value.IsBlank)
{
sb.Append(this._config.BlankCellValue);
}
else if (cell.Value.IsTimeSpan)
{
sb.Append(cell.Value.GetTimeSpan().ToString(this._config.TimeSpanFormat, this._config.TimeSpanProvider));
}
else if (cell.Value.IsDateTime)
{
// TODO: check cell.Style.DateFormat.Format
sb.Append(cell.Value.GetDateTime().ToString(this._config.DateFormat, this._config.DateFormatProvider));
}
else if (cell.Value.IsBoolean)
{
sb.Append(cell.Value.GetBoolean() ? this._config.BooleanTrueValue : this._config.BooleanFalseValue);
}
else if (cell.Value.IsText)
{
var value = cell.Value.GetText().Replace("\"", "\"\"", StringComparison.Ordinal);
sb.Append(string.IsNullOrEmpty(value) ? this._config.BlankCellValue : value);
}
else if (cell.Value.IsNumber)
{
// TODO: check cell.Style.NumberFormat.Format and cell.Style.DateFormat.Format to detect dates, currency symbols, phone numbers
sb.Append(cell.Value.GetNumber());
}
else if (cell.Value.IsUnifiedNumber)
{
sb.Append(cell.Value.GetUnifiedNumber());
}
else if (cell.Value.IsError)
{
sb.Append(cell.Value.GetError().ToString().Replace("\"", "\"\"", StringComparison.Ordinal));
}

sb.Append('"');
}
else
Expand Down
17 changes: 9 additions & 8 deletions service/Core/DataFormats/Office/MsExcelDecoderConfig.cs
Original file line number Diff line number Diff line change
@@ -1,24 +1,25 @@
// Copyright (c) Microsoft. All rights reserved.

using System;
using System.Globalization;

namespace Microsoft.KernelMemory.DataFormats.Office;

/// <summary>
/// Options controlling how the Excel decoder renders worksheets and cell values as text.
/// </summary>
public class MsExcelDecoderConfig
{
    /// <summary>
    /// Enabled by default. Presumably toggles emission of <see cref="WorksheetNumberTemplate"/>
    /// for each worksheet (TODO: confirm against the decoder).
    /// </summary>
    public bool WithWorksheetNumber { get; set; } = true;

    /// <summary>
    /// Disabled by default. Presumably toggles emission of <see cref="EndOfWorksheetMarkerTemplate"/>
    /// after each worksheet (TODO: confirm against the decoder).
    /// </summary>
    public bool WithEndOfWorksheetMarker { get; set; }

    /// <summary>
    /// When true, each cell value is wrapped in double quotes and quotes inside
    /// text values are doubled.
    /// </summary>
    public bool WithQuotes { get; set; } = true;

    /// <summary>
    /// Template for the worksheet header; contains a "{number}" placeholder.
    /// </summary>
    public string WorksheetNumberTemplate { get; set; } = "\n# Worksheet {number}\n";

    /// <summary>
    /// Template for the end-of-worksheet marker; contains a "{number}" placeholder.
    /// </summary>
    public string EndOfWorksheetMarkerTemplate { get; set; } = "\n# End of worksheet {number}";

    /// <summary>
    /// Text placed before each row (assumed from the name; empty by default).
    /// </summary>
    public string RowPrefix { get; set; } = "";

    /// <summary>
    /// Text placed between columns (assumed from the name; ", " by default).
    /// </summary>
    public string ColumnSeparator { get; set; } = ", ";

    /// <summary>
    /// Text placed after each row (assumed from the name; empty by default).
    /// </summary>
    public string RowSuffix { get; set; } = "";

    /// <summary>
    /// Value written for missing, blank, or empty-text cells.
    /// </summary>
    public string BlankCellValue { get; set; } = "";

    /// <summary>
    /// Value written for boolean cells holding true.
    /// </summary>
    public string BooleanTrueValue { get; set; } = "TRUE";

    /// <summary>
    /// Value written for boolean cells holding false.
    /// </summary>
    public string BooleanFalseValue { get; set; } = "FALSE";

    /// <summary>
    /// Format string passed to <see cref="TimeSpan.ToString(string, IFormatProvider)"/> for time span cells.
    /// </summary>
    public string TimeSpanFormat { get; set; } = "g";

    /// <summary>
    /// Format provider used for time span cells; defaults to the culture that is
    /// current when this object is created.
    /// </summary>
    public IFormatProvider TimeSpanProvider { get; set; } = CultureInfo.CurrentCulture;

    /// <summary>
    /// Format string passed to <see cref="DateTime.ToString(string, IFormatProvider)"/> for date cells.
    /// </summary>
    public string DateFormat { get; set; } = "d";

    /// <summary>
    /// Format provider used for date cells; defaults to the culture that is
    /// current when this object is created.
    /// </summary>
    public IFormatProvider DateFormatProvider { get; set; } = CultureInfo.CurrentCulture;
}
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,10 @@
<Content Include="file2-largePDF.pdf">
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
</Content>
<None Remove="file3-data.xlsx" />
<Content Include="file3-data.xlsx">
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
</Content>
</ItemGroup>

</Project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
// Copyright (c) Microsoft. All rights reserved.

using Microsoft.KernelMemory.DataFormats;
using Microsoft.KernelMemory.DataFormats.Office;
using Microsoft.TestHelpers;
using Xunit.Abstractions;

namespace Microsoft.Core.FunctionalTests.DataFormats.Office;

/// <summary>
/// Tests for <see cref="MsExcelDecoder"/>, run against the "file3-data.xlsx" fixture.
/// </summary>
public class MsExcelDecoderTest : BaseFunctionalTestCase
{
    public MsExcelDecoderTest(IConfiguration cfg, ITestOutputHelper output) : base(cfg, output)
    {
    }

    /// <summary>
    /// Decodes the fixture workbook with the default decoder settings and checks that
    /// numbers, text, dates and booleans all appear, quoted, in the extracted content.
    /// </summary>
    [Fact]
    [Trait("Category", "UnitTest")]
    [Trait("Category", "DataFormats")]
    public async Task ItExtractsAllTypes()
    {
        // Arrange
        const string file = "file3-data.xlsx";

        // The expected strings below follow en-US conventions ("1/12/2009", "0.5"), while
        // the default decoder config reads CultureInfo.CurrentCulture when it is created and
        // numbers are formatted with the current culture. Pin the culture before the decoder
        // is built so the test gives the same result on machines with another locale.
        var previousCulture = System.Globalization.CultureInfo.CurrentCulture;
        System.Globalization.CultureInfo.CurrentCulture = System.Globalization.CultureInfo.GetCultureInfo("en-US");
        try
        {
            var decoder = new MsExcelDecoder();

            // Act
            FileContent result = await decoder.DecodeAsync(file);

            // One line per section; string.Concat avoids the repeated re-allocation of
            // building the text through Aggregate and string concatenation.
            string content = string.Concat(result.Sections.Select(s => s.Content + "\n"));
            Console.WriteLine(content);

            // Assert
            Assert.Contains("\"0.5\"", content); // 50% percentage
            Assert.Contains("\"512.99\"", content); // number
            Assert.Contains("\"3.99999999\"", content); // number
            Assert.Contains("\"0.25\"", content); // fraction
            Assert.Contains("\"123.6\"", content); // currency
            Assert.Contains("\"4518\"", content); // currency
            Assert.Contains("\"444666\"", content); // currency
            Assert.Contains("\"United States of America\"", content); // text
            Assert.Contains("\"Rome\", \"\", \"Tokyo\"", content); // text with empty columns
            Assert.Contains("\"1/12/2009\"", content); // date
            Assert.Contains("\"12/25/2090\"", content); // date
            Assert.Contains("\"98001\"", content); // zip code
            Assert.Contains("\"15554000600\"", content); // phone number
            Assert.Contains("\"TRUE\"", content); // boolean
        }
        finally
        {
            // Leave the ambient culture as it was for whatever runs next on this context.
            System.Globalization.CultureInfo.CurrentCulture = previousCulture;
        }
    }
}
Binary file not shown.

0 comments on commit 3d7d1fd

Please sign in to comment.