Skip to content

Commit

Permalink
Fixes for compress & overwrite tests
Browse files Browse the repository at this point in the history
Signed-off-by: Ilhan Gelle <[email protected]>
  • Loading branch information
ilhan2316 committed Aug 16, 2024
1 parent fb1c62b commit 27da5da
Show file tree
Hide file tree
Showing 3 changed files with 88 additions and 65 deletions.
59 changes: 59 additions & 0 deletions testing/regress/ecl/key/parquet_compress.xml
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
<Dataset name='BooleanData'>
<Row><testid>0</testid><testname>aaa</testname><value>true</value></Row>
<Row><testid>1</testid><testname>aab</testname><value>false</value></Row>
<Row><testid>2</testid><testname>aac</testname><value>true</value></Row>
<Row><testid>3</testid><testname>aad</testname><value>false</value></Row>
<Row><testid>4</testid><testname>aae</testname><value>true</value></Row>
<Row><testid>0</testid><testname>aaa</testname><value>false</value></Row>
<Row><testid>1</testid><testname>aab</testname><value>false</value></Row>
<Row><testid>2</testid><testname>aac</testname><value>true</value></Row>
Expand All @@ -7,18 +12,52 @@
<Row><testid>10</testid><testname>aai</testname><value>123</value></Row>
<Row><testid>11</testid><testname>aaj</testname><value>-987</value></Row>
<Row><testid>12</testid><testname>aak</testname><value>0</value></Row>
<Row><testid>300</testid><testname>afa</testname><value>32767</value></Row>
<Row><testid>301</testid><testname>afb</testname><value>2147483647</value></Row>
<Row><testid>302</testid><testname>afc</testname><value>9223372036854775807</value></Row>
<Row><testid>10</testid><testname>aai</testname><value>123</value></Row>
<Row><testid>11</testid><testname>aaj</testname><value>-987</value></Row>
<Row><testid>12</testid><testname>aak</testname><value>456</value></Row>
<Row><testid>13</testid><testname>aal</testname><value>789</value></Row>
<Row><testid>14</testid><testname>aam</testname><value>-321</value></Row>
<Row><testid>340</testid><testname>afp</testname><value>127</value></Row>
<Row><testid>341</testid><testname>afq</testname><value>-128</value></Row>
<Row><testid>342</testid><testname>afr</testname><value>0</value></Row>
</Dataset>
<Dataset name='RealData'>
<Row><testid>20</testid><testname>aas</testname><value>3.14</value></Row>
<Row><testid>21</testid><testname>aat</testname><value>-0.5</value></Row>
<Row><testid>22</testid><testname>aau</testname><value>123.456</value></Row>
<Row><testid>170</testid><testname>adk</testname><value>1.23</value></Row>
<Row><testid>171</testid><testname>adl</testname><value>-9.869999999999999</value></Row>
<Row><testid>172</testid><testname>adm</testname><value>3.14159265358979</value></Row>
<Row><testid>173</testid><testname>adn</testname><value>2.71828182845904</value></Row>
<Row><testid>174</testid><testname>ado</testname><value>-1.41421356237309</value></Row>
<Row><testid>320</testid><testname>afg</testname><value>1.230000019073486</value></Row>
<Row><testid>321</testid><testname>afh</testname><value>-9.869999885559082</value></Row>
<Row><testid>322</testid><testname>afi</testname><value>3.141590118408203</value></Row>
<Row><testid>30</testid><testname>aas</testname><value>1.23</value></Row>
<Row><testid>31</testid><testname>aat</testname><value>-9.869999999999999</value></Row>
<Row><testid>32</testid><testname>aau</testname><value>45.67</value></Row>
<Row><testid>33</testid><testname>aav</testname><value>78.90000000000001</value></Row>
<Row><testid>34</testid><testname>aaw</testname><value>-32.1</value></Row>
</Dataset>
<Dataset name='DecimalData'>
<Row><testid>30</testid><testname>abc</testname><value>123.456789</value></Row>
<Row><testid>31</testid><testname>abd</testname><value>-987.6543209999999</value></Row>
<Row><testid>32</testid><testname>abe</testname><value>0.000001</value></Row>
<Row><testid>40</testid><testname>aax</testname><value>12.34</value></Row>
<Row><testid>41</testid><testname>aay</testname><value>-56.78</value></Row>
<Row><testid>42</testid><testname>aaz</testname><value>90.12</value></Row>
<Row><testid>43</testid><testname>aba</testname><value>34.56</value></Row>
<Row><testid>44</testid><testname>abb</testname><value>-78.9</value></Row>
</Dataset>
<Dataset name='StringData'>
<Row><testid>50</testid><testname>abc</testname><value>Hello</value></Row>
<Row><testid>51</testid><testname>abd</testname><value>World</value></Row>
<Row><testid>52</testid><testname>abe</testname><value>Test</value></Row>
<Row><testid>53</testid><testname>abf</testname><value>String</value></Row>
<Row><testid>54</testid><testname>abg</testname><value>Types</value></Row>
<Row><testid>40</testid><testname>abm</testname><value>Hello, World!</value></Row>
<Row><testid>41</testid><testname>abn</testname><value>Data Science</value></Row>
<Row><testid>42</testid><testname>abo</testname><value>12345</value></Row>
Expand All @@ -27,13 +66,28 @@
<Row><testid>50</testid><testname>abw</testname><value>THIS IS A &quot;Q&quot; STRING.</value></Row>
<Row><testid>51</testid><testname>abx</testname><value>ANOTHER &quot;EXAMPLE&quot; HERE.</value></Row>
<Row><testid>52</testid><testname>aby</testname><value>QSTRINGS ARE USEFUL!</value></Row>
<Row><testid>80</testid><testname>abr</testname><value>QSTR1</value></Row>
<Row><testid>81</testid><testname>abs</testname><value>QSTR2</value></Row>
<Row><testid>82</testid><testname>abt</testname><value>QSTR3</value></Row>
<Row><testid>83</testid><testname>abu</testname><value>QSTR4</value></Row>
<Row><testid>84</testid><testname>abv</testname><value>QSTR5</value></Row>
</Dataset>
<Dataset name='UnicodeData'>
<Row><testid>60</testid><testname>acg</testname><value>こんにちは、世界!</value></Row>
<Row><testid>61</testid><testname>ach</testname><value>Unicode characters: ḸḹḾ</value></Row>
<Row><testid>62</testid><testname>aci</testname><value>Ṏ Beautiful Unicode Ṙ</value></Row>
<Row><testid>100</testid><testname>acb</testname><value>Unicode1</value></Row>
<Row><testid>101</testid><testname>acc</testname><value>Unicode2</value></Row>
<Row><testid>102</testid><testname>acd</testname><value>Unicode3</value></Row>
<Row><testid>103</testid><testname>ace</testname><value>Unicode4</value></Row>
<Row><testid>104</testid><testname>acf</testname><value>Unicode5</value></Row>
</Dataset>
<Dataset name='UTF8Data'>
<Row><testid>90</testid><testname>abw</testname><value>UTF8_1</value></Row>
<Row><testid>91</testid><testname>abx</testname><value>UTF8_2</value></Row>
<Row><testid>92</testid><testname>aby</testname><value>UTF8_3</value></Row>
<Row><testid>93</testid><testname>abz</testname><value>UTF8_4</value></Row>
<Row><testid>94</testid><testname>aca</testname><value>UTF8_5</value></Row>
<Row><testid>70</testid><testname>acq</testname><value>Café au lait ☕</value></Row>
<Row><testid>71</testid><testname>acr</testname><value>🎉 UTF-8 Characters 🎉 </value></Row>
<Row><testid>72</testid><testname>acs</testname><value>Special characters: ©®™</value></Row>
Expand All @@ -42,6 +96,11 @@
<Row><testid>80</testid><testname>ada</testname><value>01A48D8414D848E900</value></Row>
<Row><testid>81</testid><testname>adb</testname><value>01F48AB446A76F8923</value></Row>
<Row><testid>82</testid><testname>adc</testname><value>01A48EC793A76F9400</value></Row>
<Row><testid>60</testid><testname>abh</testname><value>01234567C289C2ABC38DC3AF</value></Row>
<Row><testid>61</testid><testname>abi</testname><value>C3BEC39CC2BAC29876543210</value></Row>
<Row><testid>62</testid><testname>abj</testname><value>00C3BF00C3BF00C3BF00C3BF</value></Row>
<Row><testid>63</testid><testname>abk</testname><value>C3BF00C3BF00C3BF00C3BF00</value></Row>
<Row><testid>64</testid><testname>abl</testname><value>12345678C290C2ABC38DC3AF</value></Row>
</Dataset>
<Dataset name='VarstringData'>
<Row><testid>90</testid><testname>adk</testname><value>Short text</value></Row>
Expand Down
88 changes: 27 additions & 61 deletions testing/regress/ecl/parquet_compress.ecl
Original file line number Diff line number Diff line change
Expand Up @@ -24,67 +24,33 @@ compressionType := #IFDEFINED(root.compressionType, 'Snappy');

IMPORT Parquet;

// Define datasets
BooleanData := DATASET([{000, 'aaa', 0},
{001, 'aab', false},
{002, 'aac', 1}], {UNSIGNED testid, STRING3 testname, BOOLEAN value});

IntegerData := DATASET([{010, 'aai', 123},
{011, 'aaj', -987},
{012, 'aak', 0}], {UNSIGNED testid, STRING3 testname, INTEGER value});

RealData := DATASET([{020, 'aas', 3.14},
{021, 'aat', -0.5},
{022, 'aau', 123.456}], {UNSIGNED testid, STRING3 testname, REAL value});

DecimalData := DATASET([{030, 'abc', 123.456789},
{031, 'abd', -987.654321},
{032, 'abe', 0.000001}], {UNSIGNED testid, STRING3 testname, DECIMAL value});

StringData := DATASET([{040, 'abm', 'Hello, World!'},
{041, 'abn', 'Data Science'},
{042, 'abo', '12345'}], {UNSIGNED testid, STRING3 testname, STRING value});

QStringData := DATASET([{050, 'abw', 'This is a "Q" string.'},
{051, 'abx', 'Another "example" here.'},
{052, 'aby', 'Qstrings are useful!'}], {UNSIGNED testid, STRING3 testname, QSTRING value});

UnicodeData := DATASET([{060, 'acg', U'こんにちは、世界!'},
{061, 'ach', U'Unicode characters: ḸḹḾ'},
{062, 'aci', U'Ṏ Beautiful Unicode Ṙ'}], {UNSIGNED testid, STRING3 testname, UNICODE value});

UTF8Data := DATASET([{070, 'acq', U'Café au lait ☕'},
{071, 'acr', U'🎉 UTF-8 Characters 🎉'},
{072, 'acs', U'Special characters: ©®™'}], {UNSIGNED testid, STRING3 testname, UTF8 value});

DataData := DATASET([{080, 'ada', x'01a48d8414d848e900'},
{081, 'adb', x'01f48ab446a76f8923'},
{082, 'adc', x'01a48ec793a76f9400'}], {UNSIGNED testid, STRING3 testname, DATA value});

VarstringData := DATASET([{090, 'adk', U'Short text'},
{091, 'adl', U'A longer variable-length string'},
{092, 'adm', U'Strings are flexible!'}], {UNSIGNED testid, STRING3 testname, VARSTRING value});

VarunicodeData := DATASET([{100, 'adu', U'Variable-length Unicode: こんにちは、世界!'},
{101, 'adv', U'🌟 Variable-length Unicode Symbols 🌟'},
{102, 'adw', U'Unicode flexibility is awesome!'}], {UNSIGNED testid, STRING3 testname, VARUNICODE value});

// Write datasets to Parquet files
PARALLEL(
ParquetIO.write(BooleanData, '/var/lib/HPCCSystems/mydropzone/Boolean.parquet', TRUE, compressionType),
ParquetIO.write(IntegerData, '/var/lib/HPCCSystems/mydropzone/Integer.parquet', TRUE, compressionType),
ParquetIO.write(RealData, '/var/lib/HPCCSystems/mydropzone/Real.parquet', TRUE, compressionType),
ParquetIO.write(DecimalData, '/var/lib/HPCCSystems/mydropzone/Decimal.parquet', TRUE, compressionType),
ParquetIO.write(StringData, '/var/lib/HPCCSystems/mydropzone/String.parquet', TRUE, compressionType),
ParquetIO.write(QStringData, '/var/lib/HPCCSystems/mydropzone/QString.parquet', TRUE, compressionType),
ParquetIO.write(UnicodeData, '/var/lib/HPCCSystems/mydropzone/Unicode.parquet', TRUE, compressionType),
ParquetIO.write(UTF8Data, '/var/lib/HPCCSystems/mydropzone/UTF8.parquet', TRUE, compressionType),
ParquetIO.write(DataData, '/var/lib/HPCCSystems/mydropzone/Data.parquet', TRUE, compressionType),
ParquetIO.write(VarstringData, '/var/lib/HPCCSystems/mydropzone/Varstring.parquet', TRUE, compressionType),
ParquetIO.write(VarunicodeData, '/var/lib/HPCCSystems/mydropzone/Varunicode.parquet', TRUE, compressionType)
);

// Output datasets
// Define record structures.
// One layout per value type under test. Every layout has the same three
// fields: an UNSIGNED testid, a STRING3 testname, and a single `value` field
// whose ECL type is the one the layout is named after.
BooleanRec := RECORD UNSIGNED testid; STRING3 testname; BOOLEAN value; END;
IntegerRec := RECORD UNSIGNED testid; STRING3 testname; INTEGER value; END;
RealRec := RECORD UNSIGNED testid; STRING3 testname; REAL value; END;
DecimalRec := RECORD UNSIGNED testid; STRING3 testname; DECIMAL value; END;
StringRec := RECORD UNSIGNED testid; STRING3 testname; STRING value; END;
QStringRec := RECORD UNSIGNED testid; STRING3 testname; QSTRING value; END;
UnicodeRec := RECORD UNSIGNED testid; STRING3 testname; UNICODE value; END;
UTF8Rec := RECORD UNSIGNED testid; STRING3 testname; UTF8 value; END;
DataRec := RECORD UNSIGNED testid; STRING3 testname; DATA value; END;
VarstringRec := RECORD UNSIGNED testid; STRING3 testname; VARSTRING value; END;
VarunicodeRec := RECORD UNSIGNED testid; STRING3 testname; VARUNICODE value; END;

// Read datasets from Parquet files.
// Each attribute passes one record layout and one file path under
// /var/lib/HPCCSystems/mydropzone/ to ParquetIO.Read; the file name matches
// the value type of the layout.
// NOTE(review): these reads presume the .parquet files already exist at these
// paths when the test runs - confirm which step writes them.
BooleanData := ParquetIO.Read(BooleanRec, '/var/lib/HPCCSystems/mydropzone/Boolean.parquet');
IntegerData := ParquetIO.Read(IntegerRec, '/var/lib/HPCCSystems/mydropzone/Integer.parquet');
RealData := ParquetIO.Read(RealRec, '/var/lib/HPCCSystems/mydropzone/Real.parquet');
DecimalData := ParquetIO.Read(DecimalRec, '/var/lib/HPCCSystems/mydropzone/Decimal.parquet');
StringData := ParquetIO.Read(StringRec, '/var/lib/HPCCSystems/mydropzone/String.parquet');
QStringData := ParquetIO.Read(QStringRec, '/var/lib/HPCCSystems/mydropzone/QString.parquet');
UnicodeData := ParquetIO.Read(UnicodeRec, '/var/lib/HPCCSystems/mydropzone/Unicode.parquet');
UTF8Data := ParquetIO.Read(UTF8Rec, '/var/lib/HPCCSystems/mydropzone/UTF8.parquet');
DataData := ParquetIO.Read(DataRec, '/var/lib/HPCCSystems/mydropzone/Data.parquet');
VarstringData := ParquetIO.Read(VarstringRec, '/var/lib/HPCCSystems/mydropzone/Varstring.parquet');
VarunicodeData := ParquetIO.Read(VarunicodeRec, '/var/lib/HPCCSystems/mydropzone/Varunicode.parquet');

// Output datasets read from Parquet files
OUTPUT(BooleanData, NAMED('BooleanData'));
OUTPUT(IntegerData, NAMED('IntegerData'));
OUTPUT(RealData, NAMED('RealData'));
Expand Down
6 changes: 2 additions & 4 deletions testing/regress/ecl/parquet_overwrite.ecl
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,6 @@ IMPORT Parquet;

// One-row inline dataset with fields id, name and flag; it is the payload of the writes to SingleRowTest.parquet below.
SingleRowDataset := DATASET([{1, 'SingleRow', TRUE}], {UNSIGNED id, STRING name, BOOLEAN flag});

ParquetIO.write(SingleRowDataset, '/var/lib/HPCCSystems/mydropzone/SingleRowTest.parquet');
// Named write action so that the same write can be referenced more than once.
writeParquetFile := ParquetIO.write(SingleRowDataset, '/var/lib/HPCCSystems/mydropzone/SingleRowTest.parquet');

ConflictingDataset := DATASET([{2, 'OverwrittenRow', 123}], {UNSIGNED id, STRING name, INTEGER conflict}); // Schema conflict: third field is INTEGER conflict rather than BOOLEAN flag

ParquetIO.write(ConflictingDataset, '/var/lib/HPCCSystems/mydropzone/SingleRowTest.parquet'); // NOTE(review): no overwrite argument is passed in this call, although the earlier note here assumed TRUE would force an overwrite - confirm intent
// Lists the same write action twice in one SEQUENTIAL, so the second write
// targets the path the first one used.
// NOTE(review): presumably this checks behaviour when the file already exists
// and no overwrite option is given - confirm the expected result.
SEQUENTIAL(writeParquetFile, writeParquetFile);

0 comments on commit 27da5da

Please sign in to comment.