Skip to content

Commit

Permalink
Merge branch 'master' of github.com:Mintplex-Labs/anything-llm
Browse files Browse the repository at this point in the history
  • Loading branch information
timothycarambat committed Oct 3, 2024
2 parents 4df4742 + b658f50 commit 31bc579
Show file tree
Hide file tree
Showing 4 changed files with 132 additions and 1 deletion.
3 changes: 2 additions & 1 deletion collector/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
"mime": "^3.0.0",
"moment": "^2.29.4",
"node-html-parser": "^6.1.13",
"node-xlsx": "^0.24.0",
"officeparser": "^4.0.5",
"openai": "4.38.5",
"pdf-parse": "^1.1.1",
Expand All @@ -48,4 +49,4 @@
"nodemon": "^2.0.22",
"prettier": "^2.4.1"
}
}
}
113 changes: 113 additions & 0 deletions collector/processSingleFile/convert/asXlsx.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
const { v4 } = require("uuid");
const xlsx = require("node-xlsx").default;
const path = require("path");
const fs = require("fs");
const {
createdDate,
trashFile,
writeToServerDocuments,
} = require("../../utils/files");
const { tokenizeString } = require("../../utils/tokenizer");
const { default: slugify } = require("slugify");

function convertToCSV(data) {
return data
.map((row) =>
row
.map((cell) => {
if (cell === null || cell === undefined) return "";
if (typeof cell === "string" && cell.includes(","))
return `"${cell}"`;
return cell;
})
.join(",")
)
.join("\n");
}

async function asXlsx({ fullFilePath = "", filename = "" }) {
const documents = [];
const folderName = slugify(`${path.basename(filename)}-${v4().slice(0, 4)}`, {
lower: true,
trim: true,
});

const outFolderPath =
process.env.NODE_ENV === "development"
? path.resolve(
__dirname,
`../../../server/storage/documents/${folderName}`
)
: path.resolve(process.env.STORAGE_DIR, `documents/${folderName}`);

try {
const workSheetsFromFile = xlsx.parse(fullFilePath);
if (!fs.existsSync(outFolderPath))
fs.mkdirSync(outFolderPath, { recursive: true });

for (const sheet of workSheetsFromFile) {
try {
const { name, data } = sheet;
const content = convertToCSV(data);

if (!content?.length) {
console.warn(`Sheet "${name}" is empty. Skipping.`);
continue;
}

console.log(`-- Processing sheet: ${name} --`);
const sheetData = {
id: v4(),
url: `file://${path.join(outFolderPath, `${slugify(name)}.csv`)}`,
title: `${filename} - Sheet:${name}`,
docAuthor: "Unknown",
description: `Spreadsheet data from sheet: ${name}`,
docSource: "an xlsx file uploaded by the user.",
chunkSource: "",
published: createdDate(fullFilePath),
wordCount: content.split(/\s+/).length,
pageContent: content,
token_count_estimate: tokenizeString(content).length,
};

const document = writeToServerDocuments(
sheetData,
`sheet-${slugify(name)}`,
outFolderPath
);
documents.push(document);
console.log(
`[SUCCESS]: Sheet "${name}" converted & ready for embedding.`
);
} catch (err) {
console.error(`Error processing sheet "${name}":`, err);
continue;
}
}
} catch (err) {
console.error("Could not process xlsx file!", err);
return {
success: false,
reason: `Error processing ${filename}: ${err.message}`,
documents: [],
};
} finally {
trashFile(fullFilePath);
}

if (documents.length === 0) {
console.error(`No valid sheets found in ${filename}.`);
return {
success: false,
reason: `No valid sheets found in ${filename}.`,
documents: [],
};
}

console.log(
`[SUCCESS]: ${filename} fully processed. Created ${documents.length} document(s).\n`
);
return { success: true, reason: null, documents };
}

module.exports = asXlsx;
6 changes: 6 additions & 0 deletions collector/utils/constants.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ const ACCEPTED_MIMES = {
".pptx",
],

"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": [
".xlsx",
],

"application/vnd.oasis.opendocument.text": [".odt"],
"application/vnd.oasis.opendocument.presentation": [".odp"],

Expand Down Expand Up @@ -41,6 +45,8 @@ const SUPPORTED_FILETYPE_CONVERTERS = {
".odt": "./convert/asOfficeMime.js",
".odp": "./convert/asOfficeMime.js",

".xlsx": "./convert/asXlsx.js",

".mbox": "./convert/asMbox.js",

".epub": "./convert/asEPub.js",
Expand Down
11 changes: 11 additions & 0 deletions collector/yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -2326,6 +2326,13 @@ node-html-parser@^6.1.13:
css-select "^5.1.0"
he "1.2.0"

node-xlsx@^0.24.0:
version "0.24.0"
resolved "https://registry.yarnpkg.com/node-xlsx/-/node-xlsx-0.24.0.tgz#a6a365acb18ad37c66c2b254b6ebe0c22dc9dc6f"
integrity sha512-1olwK48XK9nXZsyH/FCltvGrQYvXXZuxVitxXXv2GIuRm51aBi1+5KwR4rWM4KeO61sFU+00913WLZTD+AcXEg==
dependencies:
xlsx "https://cdn.sheetjs.com/xlsx-0.20.2/xlsx-0.20.2.tgz"

[email protected]:
version "6.9.13"
resolved "https://registry.yarnpkg.com/nodemailer/-/nodemailer-6.9.13.tgz#5b292bf1e92645f4852ca872c56a6ba6c4a3d3d6"
Expand Down Expand Up @@ -3528,6 +3535,10 @@ [email protected]:
resolved "https://registry.yarnpkg.com/ws/-/ws-8.14.2.tgz#6c249a806eb2db7a20d26d51e7709eab7b2e6c7f"
integrity sha512-wEBG1ftX4jcglPxgFCMJmZ2PLtSbJ2Peg6TmpJFTbe9GZYOQCDPdMYu/Tm0/bGZkw8paZnJY45J4K2PZrLYq8g==

"xlsx@https://cdn.sheetjs.com/xlsx-0.20.2/xlsx-0.20.2.tgz":
version "0.20.2"
resolved "https://cdn.sheetjs.com/xlsx-0.20.2/xlsx-0.20.2.tgz#0f64eeed3f1a46e64724620c3553f2dbd3cd2d7d"

xml2js@^0.6.2:
version "0.6.2"
resolved "https://registry.yarnpkg.com/xml2js/-/xml2js-0.6.2.tgz#dd0b630083aa09c161e25a4d0901e2b2a929b499"
Expand Down

0 comments on commit 31bc579

Please sign in to comment.