From 3d7d1fd0a9d83ae68efc7e81a0360e78eac017ee Mon Sep 17 00:00:00 2001 From: Devis Lucato Date: Wed, 1 May 2024 14:22:53 -0700 Subject: [PATCH] Fix bug 447: MsExcelDecoder.DecodeAsync only works on text data types (#450) See #447 Fix Excel decoder to better support cell types and export numbers and other values. The solution is not perfect for Dates, Currencies and Percentages, due to limitations of the underlying lib and more investigation required to work around these. --- .../TikTokenTokenizersTest.cs | 2 + .../Core/DataFormats/Office/MsExcelDecoder.cs | 48 ++++++++++++++++-- .../Office/MsExcelDecoderConfig.cs | 17 ++++--- .../Core.FunctionalTests.csproj | 4 ++ .../DataFormats/Office/MsExcelDecoderTest.cs | 46 +++++++++++++++++ .../Core.FunctionalTests/file3-data.xlsx | Bin 0 -> 10189 bytes 6 files changed, 106 insertions(+), 11 deletions(-) create mode 100644 service/tests/Core.FunctionalTests/DataFormats/Office/MsExcelDecoderTest.cs create mode 100644 service/tests/Core.FunctionalTests/file3-data.xlsx diff --git a/extensions/TikToken/TikToken.UnitTests/TikTokenTokenizersTest.cs b/extensions/TikToken/TikToken.UnitTests/TikTokenTokenizersTest.cs index 54285e7a6..4de9f1129 100644 --- a/extensions/TikToken/TikToken.UnitTests/TikTokenTokenizersTest.cs +++ b/extensions/TikToken/TikToken.UnitTests/TikTokenTokenizersTest.cs @@ -15,6 +15,8 @@ public TikTokenTokenizers(ITestOutputHelper output) : base(output) } [Fact] + [Trait("Category", "UnitTest")] + [Trait("Category", "AI")] public void TheyCountTokens() { const string text = "{'bos_token': '<|endoftext|>',\n 'eos_token': '<|endoftext|>',\n 'unk_token': '<|endoftext|>'}"; diff --git a/service/Core/DataFormats/Office/MsExcelDecoder.cs b/service/Core/DataFormats/Office/MsExcelDecoder.cs index 2b56b590b..49d69ccee 100644 --- a/service/Core/DataFormats/Office/MsExcelDecoder.cs +++ b/service/Core/DataFormats/Office/MsExcelDecoder.cs @@ -73,12 +73,54 @@ public Task DecodeAsync(Stream data, CancellationToken cancellation { IXLCell? cell = cells[i]; + /* Note: some data types are not well supported; for example the values below + * are extracted incorrectly regardless of the cell configuration. + * In this cases using Text cell type might be better. + * + * - Date: "Monday, December 25, 2090" => "69757" + * - Time: "12:55:00" => "0.5381944444444444" + * - Time: "12:55" => "12/31/1899" + * - Currency symbols are not extracted + */ if (this._config.WithQuotes) { sb.Append('"'); - sb.Append(cell is { Value.IsText: true } - ? cell.Value.GetText().Replace("\"", "\"\"", StringComparison.Ordinal) - : this._config.BlankCellValue); + if (cell == null || cell.Value.IsBlank) + { + sb.Append(this._config.BlankCellValue); + } + else if (cell.Value.IsTimeSpan) + { + sb.Append(cell.Value.GetTimeSpan().ToString(this._config.TimeSpanFormat, this._config.TimeSpanProvider)); + } + else if (cell.Value.IsDateTime) + { + // TODO: check cell.Style.DateFormat.Format + sb.Append(cell.Value.GetDateTime().ToString(this._config.DateFormat, this._config.DateFormatProvider)); + } + else if (cell.Value.IsBoolean) + { + sb.Append(cell.Value.GetBoolean() ? this._config.BooleanTrueValue : this._config.BooleanFalseValue); + } + else if (cell.Value.IsText) + { + var value = cell.Value.GetText().Replace("\"", "\"\"", StringComparison.Ordinal); + sb.Append(string.IsNullOrEmpty(value) ? this._config.BlankCellValue : value); + } + else if (cell.Value.IsNumber) + { + // TODO: check cell.Style.NumberFormat.Format and cell.Style.DateFormat.Format to detect dates, currency symbols, phone numbers + sb.Append(cell.Value.GetNumber()); + } + else if (cell.Value.IsUnifiedNumber) + { + sb.Append(cell.Value.GetUnifiedNumber()); + } + else if (cell.Value.IsError) + { + sb.Append(cell.Value.GetError().ToString().Replace("\"", "\"\"", StringComparison.Ordinal)); + } + sb.Append('"'); } else diff --git a/service/Core/DataFormats/Office/MsExcelDecoderConfig.cs b/service/Core/DataFormats/Office/MsExcelDecoderConfig.cs index 5588fddce..57a6a61d5 100644 --- a/service/Core/DataFormats/Office/MsExcelDecoderConfig.cs +++ b/service/Core/DataFormats/Office/MsExcelDecoderConfig.cs @@ -1,24 +1,25 @@ // Copyright (c) Microsoft. All rights reserved. +using System; +using System.Globalization; + namespace Microsoft.KernelMemory.DataFormats.Office; public class MsExcelDecoderConfig { public bool WithWorksheetNumber { get; set; } = true; - public bool WithEndOfWorksheetMarker { get; set; } = false; - public bool WithQuotes { get; set; } = true; - public string WorksheetNumberTemplate { get; set; } = "\n# Worksheet {number}\n"; - public string EndOfWorksheetMarkerTemplate { get; set; } = "\n# End of worksheet {number}"; - public string RowPrefix { get; set; } = string.Empty; - public string ColumnSeparator { get; set; } = ", "; - public string RowSuffix { get; set; } = string.Empty; - public string BlankCellValue { get; set; } = string.Empty; + public string BooleanTrueValue { get; set; } = "TRUE"; + public string BooleanFalseValue { get; set; } = "FALSE"; + public string TimeSpanFormat { get; set; } = "g"; + public IFormatProvider TimeSpanProvider { get; set; } = CultureInfo.CurrentCulture; + public string DateFormat { get; set; } = "d"; + public IFormatProvider DateFormatProvider { get; set; } = CultureInfo.CurrentCulture; } diff --git a/service/tests/Core.FunctionalTests/Core.FunctionalTests.csproj b/service/tests/Core.FunctionalTests/Core.FunctionalTests.csproj index b44950561..7fccfc4f7 100644 --- a/service/tests/Core.FunctionalTests/Core.FunctionalTests.csproj +++ b/service/tests/Core.FunctionalTests/Core.FunctionalTests.csproj @@ -45,6 +45,10 @@ Always + + + Always + diff --git a/service/tests/Core.FunctionalTests/DataFormats/Office/MsExcelDecoderTest.cs b/service/tests/Core.FunctionalTests/DataFormats/Office/MsExcelDecoderTest.cs new file mode 100644 index 000000000..ca8d46388 --- /dev/null +++ b/service/tests/Core.FunctionalTests/DataFormats/Office/MsExcelDecoderTest.cs @@ -0,0 +1,46 @@ +// Copyright (c) Microsoft. All rights reserved. + +using Microsoft.KernelMemory.DataFormats; +using Microsoft.KernelMemory.DataFormats.Office; +using Microsoft.TestHelpers; +using Xunit.Abstractions; + +namespace Microsoft.Core.FunctionalTests.DataFormats.Office; + +public class MsExcelDecoderTest : BaseFunctionalTestCase +{ + public MsExcelDecoderTest(IConfiguration cfg, ITestOutputHelper output) : base(cfg, output) + { + } + + [Fact] + [Trait("Category", "UnitTest")] + [Trait("Category", "DataFormats")] + public async Task ItExtractsAllTypes() + { + // Arrange + const string file = "file3-data.xlsx"; + var decoder = new MsExcelDecoder(); + + // Act + FileContent result = await decoder.DecodeAsync(file); + string content = result.Sections.Aggregate("", (current, s) => current + (s.Content + "\n")); + Console.WriteLine(content); + + // Assert + Assert.Contains("\"0.5\"", content); // 50% percentage + Assert.Contains("\"512.99\"", content); // number + Assert.Contains("\"3.99999999\"", content); // number + Assert.Contains("\"0.25\"", content); // fraction + Assert.Contains("\"123.6\"", content); // currency + Assert.Contains("\"4518\"", content); // currency + Assert.Contains("\"444666\"", content); // currency + Assert.Contains("\"United States of America\"", content); // text + Assert.Contains("\"Rome\", \"\", \"Tokyo\"", content); // text with empty columns + Assert.Contains("\"1/12/2009\"", content); // date + Assert.Contains("\"12/25/2090\"", content); // date + Assert.Contains("\"98001\"", content); // zip code + Assert.Contains("\"15554000600\"", content); // phone number + Assert.Contains("\"TRUE\"", content); // boolean + } +} diff --git a/service/tests/Core.FunctionalTests/file3-data.xlsx b/service/tests/Core.FunctionalTests/file3-data.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..c3bfe42f4d56ff3817c139c0c652a9c6045de606 GIT binary patch literal 10189 zcmeHtg;!il_H_dd!QCN20t9yt?!h6rTcZt)J56wx5L^QU*Py}O-3byT5Zo>J$IHx{ zH!m~aU+|miTKBGA_wLiT>z;FV)jp*x2M5FjAOH{n000HROz&_pozxIE41WMzF6uVe4C9h=e#5b5^m#Rb% zxeomMu<2EV+PgpW7Mtp3T3J12hTUR{W#d`%e8wJI@?t+3v8=MQt@97FL|2ZX+OybJWp3`iM&+!bW6h7-gOYNOq|2QstCb6O}C`w#361diOnJp-%&A zrMCgKSf`M$f3ag#QR5pH?_kjs=JyGlX=^JwFN3r1OmBb@lwM2tt_FF^6k2cTOjH{j zZ<6F%ooK&s%ov%%eq#-eZ9!iiT|sDS<$0mNNnBMFeqOB88*9VR=B?-L*+v^3ajtyR z$c<-d$fTPo;z!{>IS){_iKxLZW49ag=@!Pzl^;6pBwxOzUc({nULi4S%#4XOQzWFc zWn9hc%al;!m_o&(T>Ws0<{2d!&M z41LL$!fd&Lm6uXeCla4A$Cx#Vk#O=zgYgAZ{B(L1w7;4BEQgs9dwWn8Tv^ZaareVm zn(tg<{tlXOD4+cPWC~utlZnOGa_@dH#lF2 zVtT(SB`5Yz_A%)`>W>$`IyIapLm3|3JV;$7{l^3T5oCqmo*3#sgT!ovT`UU@0Jwjm zGK{B@aR;-wK^$#NAP}2Bc&k)hACk+9>tks32)CZG@Ppx9YIJ8(o$Bl2N%^K*wT4w( z-1bs7g<|W6TRm}^#R2Pt1F!@bY)F>nb=2_VZm7Jx#)76Xj>jn@-B{mLq4%uHBXw^@ zqiAhM=atomsc;@}d0VC8qvOdfWJ{NsJ0=~%39xKzFp&9*fas##VZD%XlE_tA^&=-A zJ9r(*rOb%)2)Wlxp905}V5SCqjzxi4V03)ZIIJ~X-D)01Dhezq1m5VY)GY^USQNC+ zsDW>0``wpeyG^wdPCQR9wv z`99Q|BGPPrlgaM51$o1NZOR*q zl~a~)Wx0?`VBE_dkPvxc;KOq+1IB`TbuxUczsPgPs)#xiGu6!#w#O8Av>It25IJRl z@0CKk-C!3$6P;MTW2ZyGyFGLl)yweh5q0KqTHa`_kc+vQvbr=N0e^$v(wDn-L`4%Q{06hQ9B{y{BLX)A!Bt)Y7m~@fNJW(OC zhcYaZR+fS>`}uokZcMQwLNp#0Rua!LqT)@Bj|@m{f6*T?eH6 zyqE-4(7$ie__fE(K!nq_XfpNqOKyYVN{F~8V_&7<3a7v#dsKh)NOuag12#Xs;O-FC z9of~EGFiv@8&yUW`VHA}FdSJGwQfiaoM03>%-EHgcpu7Vc-({dPr(Y_;J|Df62)|# z(a3I<^-mD9gxfu(Zw4=jT{-Bz3c>2rUA)S+jah>7<$Ym3nL_afm67V^mrai&$N0?^ z^I^R^%VEJ2b9*xvVRf1VyK6#%Gwu%0dn2X}2!=M-E7VW7oPjre?{X;Cil3YDFMTLx zWQIOP>VL+vpHO_#+0&KAC&D8JAi_Mw@;^B5-y`}@{)2g%jy`3C|J$upRY9(s6|)89 zA%xW}%@q&pt1}DbzWN>p;y?}EA`Jxx^n8VczENLmR*nS@66|p_(&u``g}x4lbJ4+2 z7>*6}!~q|@hKG!tjldzA9F)EAmxjl|-rwCTIY2>Ab;4;7itA_0#O3|*la`t&BsZT@ z;=ExpnA3Wc)rH;KY5e)%Q~n>S*?4yo_;$a{zox#PJeAIT^RYpE@e(1+--B%8j0owC z0GbD2Vtbl`;b({BCE{uGoweN7lBzreqS(xg@v|d7v!RE%|oCickz*JkmN_Hmp!%M`DWylV1$kHljv+0}3{S#185kFqS`#S0 z(y2U@x5}#S{VKbTNkY7bahJ#DGuLy4Q(gPGoe{gm@3MASpD`0E`w(OW2CBEz%Q$t_ zlCS(OiLTIzD-e&qpiGq-O2+xyfTkyJWYBd+=5`81iCLjE`dhv@!IyAoES*H|qLPpg z63;9&oc($rfz%es3`wJ|Il}`L?y0!d*WQ!4)g}wM0niB&sel^Iz_S5PqT$Jq^PPl9 z={%7TeT~dukYc9g<$D?`!kAQq8)hTOL$X&QQ>zI45-dV?x6^%tvh{avq3^1pGE-oC zncgyYKO{AEMdT%0D7PqbDxPiZDk(^yakwe;EWbq93etfpuXZ6G?;2%%avoo4)-(v! zxWypXu!$%^GCx{)#SrtBmGl;W^XL1apc3SipXZZMTnQ1^M{ImU5l^c|3O+gmd26Zx zh{Ed1tmpBl8y$i}(?W}MZ%i7*c3<+TjNlc)c<(Gix^u@Z)cZckEf?97n>#E-E4`fd zF%Fy4=Y-@B&->+m@kxtzi%;_{L+ayulYvB~ARMDioU3zeD|{nOTa{&;UUE7<3|j6yPjyP3A6d}=EG2{ z@Py!KORLpvP3A1~Bln)d_u^J=_Pn2}rKlWq^@rU-48CXLVv=)or@iC~-1jXsBy?x`*jUT|@- zs#vH_ss5h0ybjRp3zCd0X>bqD*ra}AzOR81YopVqMY1#6(Bo29$Ln?`f)i}S(I_Yy zt?t*IuT9>wbXyi_&PM~s>mR&f3@4tB|Y`RA{LK%^*g=hGU! zf_)gKlzct^V6coq3DYGNbyhwZiYL%ggz5I!f>FUjF-Ye=-8{~SgYeA`UA~;H7L7r; z$)b_aC`Rf{r-FCh^C`1ngys2)JCRmcBlo7^kZbCCS++lvJB<=1>lQzMve4 z9J8w0_f(?!F1skMx@Hzlq$D$^i?ie7c=SEya2*cseKL8h(gw_-=X=#1(OP;7*2vB0 znQ=gO>-Pn&iWn}R_6D&e${4&!#@$1shZbm&0Id6@ZCm|v!4>yM#(XVrjyE;jfgCYB?UeNiYX~t$4(9a*}twOuC zo$rHqK88+gcoQsxYZpgITY?yQUpsfc9?4%J{&5ToKT#RQEZF1!%G$8B=O+}^#$N9eji@fQhk`nSPzFJv7$sZno@F-5j#?(J_OjL{f~%x`QA+9eKAt%PJo zfePw#UI3jI?2S#Hm1g&ci-@g9kCPq(mdr<#=M$h$?COb26pD$U2C|2TzQ_9ux9zsa zheZs-hk>Xh&|Qh|!^2Gd%6+`2mUR%F!NYZlrlIe}DU0vV2Qowcnk!&t;u;PyPMBgp zwdb&@yND@rns7>Z+8cRFc_t)MKf93K_u-7I1H>Ez{F{UD!apvN zC$3hXrSH33y_IEjYZJl}@B?b*s{unFK-8qI%}QTdUd)00!Wf3mCBUG$%#Wkn+ZBl# zmSk9+$JCiP{+ZspM&nyWpfmsT0K}?63J3UEIB;Kat5Rqn`l~_gUVs&+`S>z-GIkL$!7jXJz+p^DWRSQqU>JdXH7SG`4fRzgLaYo>gF3i`<+Kcx{(a?VT5MtH2D`=HQono;wEboG@4 zwrdEqk)@um@(1?9vv3%JZFe5lSr{4@U+~)GTB=zpNDWFHFV~Bhqd7Ns6{zMs5f22vP4N|`R^OUtT1oHwk_LM_`6SofZY z`(Djaq25ZUY~567Y#}4RZNq{lUJLI#6pmQwD{oD)TCiQ+sptqqe7RCTRbwysU1AGT z*rXYESZaWT4aZMn>_(?csFpJ+N|#ZX*WfT~`but`b1g9iSox zFrGcng-d~bk)%?ZNi{JtaoJ_aajhcdoFkmvV2VHWHukJLV#KI8N4jeZ{CU(GyCQ}A zC+Jlt7%7B>gSrauk-LZ*PZjNRBzr_!>4)oA`QeD&mP;4sXOJ#GYML;!4gN}J-=Yn% zmQ|?LM*Ch}(twGmdGY?$WN4kUD+-;E9R8?b)X&9$S}H@orm>uB>eQ+P_ehFv;xZ90 zH$`evGGI!{7Vc*_~g(&)aZ z%1Kf{`Qa6DfC|imy;_~hP8%O;*$h?3bTy*$~zr27#PBtC}R{DAr+eG*}6IO^pd-h(%B)Q z{UFgE#tUfM=v`(AeUzkM3NF>f?59(M80ej`)(+Q8O(!u6^c-Q;fDt;Z5!`5Yq(P~iY9fHn zqI9Lfwhojatsg7I+ElEa`7<5E$X+oIu4D<_o{Ru%4Qh4PR0xGwF}T%yF2=pDs$F9+ zwCme(&JTmM85=!}zUjmJn%qzwG#%y7SSxt{w91e-NWB~*KS?|X7&y{~unl%UdwDJC za5{AF=B^>C82y9}EJ6JF6%j8nL%tJV1;7?2QWg<9!ouDIbK#YbGZlP*qE>M^8+3FoX zpYi_S#-{Kf+`hVYgkJw~?+^`MG7W8&6~Dk7+fNtH5dDBC@!0FO?>XuXx~th=X&eue zta(V*di={2Vv`S!JPRU6fv=)DiPvojM)GpV{AL1`(7yvmVR=tDM{IC&kBjeU5;7Gc zw^f*%Ow)&`v#=AsPF-n7;R>4z`>?Dn+|$Srxa?Eh)EeCx^hWSsmfDrNUnWD!Pg0o- za=Yp#kSb`>*-SX1>AjZ0U`9?|eoYI`S@&i0S8?TK(Zn`_nxS0k%{TWO69XF$CCg4r z!M13C`Uf2=2W72&Ss>L%R=E~Ar)4wc0Xw^E%f?UgZyl6WV3K>R4fE%B71KgUbldke z{LFM?u~|}DzoF%SYrSWn+l~v^;nr)L&@Ez;*gaT;C~)Dq$Wu#c+}OgybKT|UE)g^99m-`s+|p_vz5yaKl+m)_@SG18Z{DB+QXf`x$F=s zpEZ>7N)HsMgplgkSvY@Vy_e08mkwO^RiT{sq?{L~j(+&B;y&v{%K+ok6}Klr@F}bO zq3$?Y8asl_-a0#4*;)L`Ea12iyQkGo;8DU2I>SoGU1;4$s=~aSdgKKAwK^Ak6Rx_q zUT-@V=kR0_NWB*!0+ajmUr)`5e^QXuR5uV}NW=EiYg%~?_mt~lWCDRwy2jrbH`nDw z86$3+7=gjtwG;$gPEE1wg{i2)*){`Go2$*D(YeBdpG0lpY3H;@ zd(KrvEH2!nOHc^MB;Uj9o9TJ;il=N)e(*gDN;urxSN&3*TIA34@-4vaorSmo!+y$6 zH=Wl@xff};kbZ`$$UX!At5NSbrK1Y|BU7KP__kgdI$bX{QeCC%2q!dS45g5>y*FsO zd8Sf6Tire(7gk%~$E(S4(nLn{Q@22ncn#=%uN` zRRG~UIjSL~Es1;fez1GV*rj&38{4S(OixZ0X){OyH z=Kt32T|+N!iP49(H_OZA=Cii1uwqCMio+5sT-l+V6t@Mzcsbalc%CHaG zgkxgtEFa=~Ib~<@z^h>iTSgIeXN4t{ak+RK12fW3aVZ%{Zyo1(%X=lG{0-Xlh zh_O$eNAgs-!~A#8Gq$(?pW{7w+&^1-oC4$zx$y|@h75I?gwIw5)vwe!VfTH{ZbIC&HmgPV4WD*E8(v;8r-DMtOwzly17bo;M;sB$E_ur(IN>le(?F zAY7UAFi=~>$DX}@rno1BH*NaO%rRh~5x%YOM*qysls6!0XZGUMRmC9b+R%cFTDV;2 z@n8EhU|^Y^nCQQ6oci;v{dxW`+o+V~{%+v!TRi>*{&7xv3XOl;@bNqF_pBjFC{!XHQ;Q@fFCrS1{DD`*v-!tLA h!yTUV(tpAKnHQDi;Gf