Skip to content

Commit

Permalink
570 document api return object (#608)
Browse files Browse the repository at this point in the history
* Add support for fetching single document in documents folder

* Add document object to upload + support link scraping via API

* Hotfixes for documentation

* Update API docs
  • Loading branch information
timothycarambat authored Jan 17, 2024
1 parent c61cbd1 commit b35feed
Show file tree
Hide file tree
Showing 14 changed files with 324 additions and 43 deletions.
16 changes: 12 additions & 4 deletions collector/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -29,14 +29,21 @@ app.post("/process", async function (request, response) {
const targetFilename = path
.normalize(filename)
.replace(/^(\.\.(\/|\\|$))+/, "");
const { success, reason } = await processSingleFile(targetFilename);
response.status(200).json({ filename: targetFilename, success, reason });
const {
success,
reason,
documents = [],
} = await processSingleFile(targetFilename);
response
.status(200)
.json({ filename: targetFilename, success, reason, documents });
} catch (e) {
console.error(e);
response.status(200).json({
filename: filename,
success: false,
reason: "A processing error occurred.",
documents: [],
});
}
return;
Expand All @@ -45,14 +52,15 @@ app.post("/process", async function (request, response) {
app.post("/process-link", async function (request, response) {
const { link } = reqBody(request);
try {
const { success, reason } = await processLink(link);
response.status(200).json({ url: link, success, reason });
const { success, reason, documents = [] } = await processLink(link);
response.status(200).json({ url: link, success, reason, documents });
} catch (e) {
console.error(e);
response.status(200).json({
url: link,
success: false,
reason: "A processing error occurred.",
documents: [],
});
}
return;
Expand Down
13 changes: 10 additions & 3 deletions collector/processLink/convert/generic.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,11 @@ async function scrapeGenericUrl(link) {

if (!content.length) {
console.error(`Resulting URL content was empty at ${link}.`);
return { success: false, reason: `No URL content found at ${link}.` };
return {
success: false,
reason: `No URL content found at ${link}.`,
documents: [],
};
}

const url = new URL(link);
Expand All @@ -32,9 +36,12 @@ async function scrapeGenericUrl(link) {
token_count_estimate: tokenizeString(content).length,
};

writeToServerDocuments(data, `url-${slugify(filename)}-${data.id}`);
const document = writeToServerDocuments(
data,
`url-${slugify(filename)}-${data.id}`
);
console.log(`[SUCCESS]: URL ${link} converted & ready for embedding.\n`);
return { success: true, reason: null };
return { success: true, reason: null, documents: [document] };
}

async function getPageContent(link) {
Expand Down
14 changes: 11 additions & 3 deletions collector/processSingleFile/convert/asAudio.js
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ async function asAudio({ fullFilePath = "", filename = "" }) {
return {
success: false,
reason: `Failed to parse content from ${filename}.`,
documents: [],
};
}

Expand All @@ -43,7 +44,11 @@ async function asAudio({ fullFilePath = "", filename = "" }) {
if (!content.length) {
console.error(`Resulting text content was empty for ${filename}.`);
trashFile(fullFilePath);
return { success: false, reason: `No text content found in ${filename}.` };
return {
success: false,
reason: `No text content found in ${filename}.`,
documents: [],
};
}

const data = {
Expand All @@ -60,12 +65,15 @@ async function asAudio({ fullFilePath = "", filename = "" }) {
token_count_estimate: tokenizeString(content).length,
};

writeToServerDocuments(data, `${slugify(filename)}-${data.id}`);
const document = writeToServerDocuments(
data,
`${slugify(filename)}-${data.id}`
);
trashFile(fullFilePath);
console.log(
`[SUCCESS]: ${filename} transcribed, converted & ready for embedding.\n`
);
return { success: true, reason: null };
return { success: true, reason: null, documents: [document] };
}

async function convertToWavAudioData(sourcePath) {
Expand Down
13 changes: 10 additions & 3 deletions collector/processSingleFile/convert/asDocx.js
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,11 @@ async function asDocX({ fullFilePath = "", filename = "" }) {
if (!pageContent.length) {
console.error(`Resulting text content was empty for ${filename}.`);
trashFile(fullFilePath);
return { success: false, reason: `No text content found in ${filename}.` };
return {
success: false,
reason: `No text content found in ${filename}.`,
documents: [],
};
}

const content = pageContent.join("");
Expand All @@ -42,10 +46,13 @@ async function asDocX({ fullFilePath = "", filename = "" }) {
token_count_estimate: tokenizeString(content).length,
};

writeToServerDocuments(data, `${slugify(filename)}-${data.id}`);
const document = writeToServerDocuments(
data,
`${slugify(filename)}-${data.id}`
);
trashFile(fullFilePath);
console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`);
return { success: true, reason: null };
return { success: true, reason: null, documents: [document] };
}

module.exports = asDocX;
15 changes: 12 additions & 3 deletions collector/processSingleFile/convert/asMbox.js
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,15 @@ async function asMbox({ fullFilePath = "", filename = "" }) {
if (!mails.length) {
console.error(`Resulting mail items was empty for ${filename}.`);
trashFile(fullFilePath);
return { success: false, reason: `No mail items found in ${filename}.` };
return {
success: false,
reason: `No mail items found in ${filename}.`,
documents: [],
};
}

let item = 1;
const documents = [];
for (const mail of mails) {
if (!mail.hasOwnProperty("text")) continue;

Expand All @@ -52,14 +57,18 @@ async function asMbox({ fullFilePath = "", filename = "" }) {
};

item++;
writeToServerDocuments(data, `${slugify(filename)}-${data.id}-msg-${item}`);
const document = writeToServerDocuments(
data,
`${slugify(filename)}-${data.id}-msg-${item}`
);
documents.push(document);
}

trashFile(fullFilePath);
console.log(
`[SUCCESS]: ${filename} messages converted & ready for embedding.\n`
);
return { success: true, reason: null };
return { success: true, reason: null, documents };
}

module.exports = asMbox;
13 changes: 10 additions & 3 deletions collector/processSingleFile/convert/asOfficeMime.js
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,11 @@ async function asOfficeMime({ fullFilePath = "", filename = "" }) {
if (!content.length) {
console.error(`Resulting text content was empty for ${filename}.`);
trashFile(fullFilePath);
return { success: false, reason: `No text content found in ${filename}.` };
return {
success: false,
reason: `No text content found in ${filename}.`,
documents: [],
};
}

const data = {
Expand All @@ -37,10 +41,13 @@ async function asOfficeMime({ fullFilePath = "", filename = "" }) {
token_count_estimate: tokenizeString(content).length,
};

writeToServerDocuments(data, `${slugify(filename)}-${data.id}`);
const document = writeToServerDocuments(
data,
`${slugify(filename)}-${data.id}`
);
trashFile(fullFilePath);
console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`);
return { success: true, reason: null };
return { success: true, reason: null, documents: [document] };
}

module.exports = asOfficeMime;
13 changes: 10 additions & 3 deletions collector/processSingleFile/convert/asPDF.js
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,11 @@ async function asPDF({ fullFilePath = "", filename = "" }) {
if (!pageContent.length) {
console.error(`Resulting text content was empty for ${filename}.`);
trashFile(fullFilePath);
return { success: false, reason: `No text content found in ${filename}.` };
return {
success: false,
reason: `No text content found in ${filename}.`,
documents: [],
};
}

const content = pageContent.join("");
Expand All @@ -47,10 +51,13 @@ async function asPDF({ fullFilePath = "", filename = "" }) {
token_count_estimate: tokenizeString(content).length,
};

writeToServerDocuments(data, `${slugify(filename)}-${data.id}`);
const document = writeToServerDocuments(
data,
`${slugify(filename)}-${data.id}`
);
trashFile(fullFilePath);
console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`);
return { success: true, reason: null };
return { success: true, reason: null, documents: [document] };
}

module.exports = asPDF;
13 changes: 10 additions & 3 deletions collector/processSingleFile/convert/asTxt.js
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,11 @@ async function asTxt({ fullFilePath = "", filename = "" }) {
if (!content?.length) {
console.error(`Resulting text content was empty for ${filename}.`);
trashFile(fullFilePath);
return { success: false, reason: `No text content found in ${filename}.` };
return {
success: false,
reason: `No text content found in ${filename}.`,
documents: [],
};
}

console.log(`-- Working ${filename} --`);
Expand All @@ -37,10 +41,13 @@ async function asTxt({ fullFilePath = "", filename = "" }) {
token_count_estimate: tokenizeString(content).length,
};

writeToServerDocuments(data, `${slugify(filename)}-${data.id}`);
const document = writeToServerDocuments(
data,
`${slugify(filename)}-${data.id}`
);
trashFile(fullFilePath);
console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`);
return { success: true, reason: null };
return { success: true, reason: null, documents: [document] };
}

module.exports = asTxt;
4 changes: 4 additions & 0 deletions collector/processSingleFile/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,18 +13,21 @@ async function processSingleFile(targetFilename) {
return {
success: false,
reason: "Filename is a reserved filename and cannot be processed.",
documents: [],
};
if (!fs.existsSync(fullFilePath))
return {
success: false,
reason: "File does not exist in upload directory.",
documents: [],
};

const fileExtension = path.extname(fullFilePath).toLowerCase();
if (!fileExtension) {
return {
success: false,
reason: `No file extension found. This file cannot be processed.`,
documents: [],
};
}

Expand All @@ -33,6 +36,7 @@ async function processSingleFile(targetFilename) {
return {
success: false,
reason: `File extension ${fileExtension} not supported for parsing.`,
documents: [],
};
}

Expand Down
19 changes: 12 additions & 7 deletions collector/utils/files/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -38,14 +38,19 @@ function writeToServerDocuments(
);
if (!fs.existsSync(destination))
fs.mkdirSync(destination, { recursive: true });
const destinationFilePath = path.resolve(destination, filename);
const destinationFilePath = path.resolve(destination, filename) + ".json";

fs.writeFileSync(
destinationFilePath + ".json",
JSON.stringify(data, null, 4),
{ encoding: "utf-8" }
);
return;
fs.writeFileSync(destinationFilePath, JSON.stringify(data, null, 4), {
encoding: "utf-8",
});

return {
...data,
// Relative location string ("<folder>/<file>.json") that can be passed to the
// /update-embeddings API. It resolves because the file was just written above and
// documents are only stored in 1-level deep folders. This still works for integrations like GitHub and YouTube.
// NOTE(review): splitting on "/" assumes POSIX-style path separators — confirm behavior on Windows.
location: destinationFilePath.split("/").slice(-2).join("/"),
};
}

// When required we can wipe the entire collector hotdir and tmp storage in case
Expand Down
Loading

0 comments on commit b35feed

Please sign in to comment.