-
-
Notifications
You must be signed in to change notification settings - Fork 2.7k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* support xlsx files * lint * create separate docs for each xlsx sheet * lint * use node-xlsx pkg for parsing xlsx files * lint * update error handling --------- Co-authored-by: timothycarambat <[email protected]>
- Loading branch information
1 parent
93d6464
commit b658f50
Showing
4 changed files
with
132 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
const { v4 } = require("uuid"); | ||
const xlsx = require("node-xlsx").default; | ||
const path = require("path"); | ||
const fs = require("fs"); | ||
const { | ||
createdDate, | ||
trashFile, | ||
writeToServerDocuments, | ||
} = require("../../utils/files"); | ||
const { tokenizeString } = require("../../utils/tokenizer"); | ||
const { default: slugify } = require("slugify"); | ||
|
||
/**
 * Serializes a two-dimensional array of cell values into CSV text.
 *
 * Rows are joined with "\n" and cells with ",". `null`/`undefined` cells
 * become empty fields. A field is wrapped in double quotes when it contains
 * a comma, a double quote, or a line break, and embedded double quotes are
 * doubled, so the output can be parsed back unambiguously.
 *
 * @param {Array<Array<*>>} data - Rows of cell values.
 * @returns {string} CSV text; an empty string when `data` is not an array
 *   or has no rows.
 */
function convertToCSV(data) {
  if (!Array.isArray(data)) return "";

  // Characters that force a field to be quoted.
  const needsQuoting = /[",\r\n]/;

  const toField = (cell) => {
    if (cell === null || cell === undefined) return "";
    const text = String(cell);
    if (!needsQuoting.test(text)) return text;
    // Escape embedded quotes by doubling them before wrapping the field.
    return `"${text.replace(/"/g, '""')}"`;
  };

  return data
    .map((row) => (Array.isArray(row) ? row.map(toField).join(",") : ""))
    .join("\n");
}
|
||
/**
 * Converts an uploaded xlsx workbook into one document per sheet.
 *
 * The workbook is parsed with node-xlsx, each non-empty sheet is serialized
 * to CSV text and written via `writeToServerDocuments` into a per-upload
 * folder under the documents storage directory. The uploaded file is always
 * passed to `trashFile` once processing ends, whether it succeeded or not.
 *
 * @param {Object} params
 * @param {string} [params.fullFilePath=""] - Path of the uploaded xlsx file.
 * @param {string} [params.filename=""] - Name used for titles, the output
 *   folder slug, and log/error messages.
 * @returns {Promise<{success: boolean, reason: (string|null), documents: Array}>}
 *   `success` is false when the workbook cannot be processed or when no
 *   sheet produced a document.
 */
async function asXlsx({ fullFilePath = "", filename = "" }) {
  const documents = [];
  // Short random suffix keeps repeated uploads of the same filename apart.
  const folderName = slugify(`${path.basename(filename)}-${v4().slice(0, 4)}`, {
    lower: true,
    trim: true,
  });

  const outFolderPath =
    process.env.NODE_ENV === "development"
      ? path.resolve(
          __dirname,
          `../../../server/storage/documents/${folderName}`
        )
      : path.resolve(process.env.STORAGE_DIR, `documents/${folderName}`);

  try {
    const workSheetsFromFile = xlsx.parse(fullFilePath);
    if (!fs.existsSync(outFolderPath))
      fs.mkdirSync(outFolderPath, { recursive: true });

    for (const sheet of workSheetsFromFile) {
      // Destructure outside the inner try so `name` is still in scope in the
      // catch handler below; otherwise the handler itself would throw a
      // ReferenceError and abort the whole workbook instead of one sheet.
      const { name, data } = sheet;
      try {
        const content = convertToCSV(data);

        if (!content?.length) {
          console.warn(`Sheet "${name}" is empty. Skipping.`);
          continue;
        }

        console.log(`-- Processing sheet: ${name} --`);
        const sheetData = {
          id: v4(),
          url: `file://${path.join(outFolderPath, `${slugify(name)}.csv`)}`,
          title: `${filename} - Sheet:${name}`,
          docAuthor: "Unknown",
          description: `Spreadsheet data from sheet: ${name}`,
          docSource: "an xlsx file uploaded by the user.",
          chunkSource: "",
          published: createdDate(fullFilePath),
          wordCount: content.split(/\s+/).length,
          pageContent: content,
          token_count_estimate: tokenizeString(content).length,
        };

        const document = writeToServerDocuments(
          sheetData,
          `sheet-${slugify(name)}`,
          outFolderPath
        );
        documents.push(document);
        console.log(
          `[SUCCESS]: Sheet "${name}" converted & ready for embedding.`
        );
      } catch (err) {
        // Best-effort per sheet: log and move on to the next one.
        console.error(`Error processing sheet "${name}":`, err);
        continue;
      }
    }
  } catch (err) {
    console.error("Could not process xlsx file!", err);
    return {
      success: false,
      reason: `Error processing ${filename}: ${err.message}`,
      documents: [],
    };
  } finally {
    // Runs on every exit path, including the early return above.
    trashFile(fullFilePath);
  }

  if (documents.length === 0) {
    console.error(`No valid sheets found in ${filename}.`);
    return {
      success: false,
      reason: `No valid sheets found in ${filename}.`,
      documents: [],
    };
  }

  console.log(
    `[SUCCESS]: ${filename} fully processed. Created ${documents.length} document(s).\n`
  );
  return { success: true, reason: null, documents };
}
|
||
module.exports = asXlsx; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2326,6 +2326,13 @@ node-html-parser@^6.1.13: | |
css-select "^5.1.0" | ||
he "1.2.0" | ||
|
||
node-xlsx@^0.24.0: | ||
version "0.24.0" | ||
resolved "https://registry.yarnpkg.com/node-xlsx/-/node-xlsx-0.24.0.tgz#a6a365acb18ad37c66c2b254b6ebe0c22dc9dc6f" | ||
integrity sha512-1olwK48XK9nXZsyH/FCltvGrQYvXXZuxVitxXXv2GIuRm51aBi1+5KwR4rWM4KeO61sFU+00913WLZTD+AcXEg== | ||
dependencies: | ||
xlsx "https://cdn.sheetjs.com/xlsx-0.20.2/xlsx-0.20.2.tgz" | ||
|
||
[email protected]: | ||
version "6.9.13" | ||
resolved "https://registry.yarnpkg.com/nodemailer/-/nodemailer-6.9.13.tgz#5b292bf1e92645f4852ca872c56a6ba6c4a3d3d6" | ||
|
@@ -3528,6 +3535,10 @@ [email protected]: | |
resolved "https://registry.yarnpkg.com/ws/-/ws-8.14.2.tgz#6c249a806eb2db7a20d26d51e7709eab7b2e6c7f" | ||
integrity sha512-wEBG1ftX4jcglPxgFCMJmZ2PLtSbJ2Peg6TmpJFTbe9GZYOQCDPdMYu/Tm0/bGZkw8paZnJY45J4K2PZrLYq8g== | ||
|
||
"xlsx@https://cdn.sheetjs.com/xlsx-0.20.2/xlsx-0.20.2.tgz": | ||
version "0.20.2" | ||
resolved "https://cdn.sheetjs.com/xlsx-0.20.2/xlsx-0.20.2.tgz#0f64eeed3f1a46e64724620c3553f2dbd3cd2d7d" | ||
|
||
xml2js@^0.6.2: | ||
version "0.6.2" | ||
resolved "https://registry.yarnpkg.com/xml2js/-/xml2js-0.6.2.tgz#dd0b630083aa09c161e25a4d0901e2b2a929b499" | ||
|