Skip to content

Commit

Permalink
Moving docx2text and pptx2text to a shared workerpool (#6087)
Browse files Browse the repository at this point in the history
  • Loading branch information
lasryaric authored Jul 4, 2024
1 parent 3814d16 commit 20e43c9
Show file tree
Hide file tree
Showing 7 changed files with 36 additions and 5 deletions.
6 changes: 6 additions & 0 deletions connectors/package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions connectors/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@
"tsconfig-paths-webpack-plugin": "^4.1.0",
"turndown": "^7.1.2",
"uuid": "^9.0.0",
"workerpool": "^9.1.3",
"yargs": "^17.7.2"
},
"devDependencies": {
Expand Down
2 changes: 1 addition & 1 deletion connectors/src/connectors/google_drive/temporal/file.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ import {
sectionLength,
upsertToDatasource,
} from "@connectors/lib/data_sources";
import { docx2text } from "@connectors/lib/docx2text.";
import { docx2text } from "@connectors/lib/docx2text";
import { dpdf2text } from "@connectors/lib/dpdf2text";
import {
GoogleDriveConfig,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@ import tracer from "dd-trace";
import mammoth from "mammoth";
import turndown from "turndown";

export async function docx2text(fileContent: Buffer, filename: string) {
import { getWorkerPool } from "@connectors/lib/workerpool";

async function _docx2text(fileContent: Buffer, filename: string) {
return tracer.trace(
`gdrive`,
{
Expand All @@ -23,3 +25,7 @@ export async function docx2text(fileContent: Buffer, filename: string) {
}
);
}

/**
 * Public entry point for docx conversion: hands `_docx2text` to the shared
 * worker pool and resolves with whatever that execution returns.
 *
 * NOTE(review): workerpool documents that a function passed to `exec` is
 * stringified and evaluated inside the worker, so it must not depend on
 * variables from its surrounding scope. `_docx2text` uses the module-level
 * `tracer` import — confirm this actually runs in the worker, or move the
 * conversion into a dedicated worker script.
 *
 * @param fileContent - Raw bytes of the document, forwarded to `_docx2text`.
 * @param filename - Name of the document, forwarded to `_docx2text`.
 * @returns The result of `_docx2text` as executed by the pool.
 */
export async function docx2text(fileContent: Buffer, filename: string) {
  const pool = getWorkerPool();
  return pool.exec(_docx2text, [fileContent, filename]);
}
11 changes: 10 additions & 1 deletion connectors/src/lib/pptx2text.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,15 @@ import tracer from "dd-trace";
import JSZip from "jszip";
import turndown from "turndown";

import { getWorkerPool } from "@connectors/lib/workerpool";

type PPTXDocument = {
pages: {
content: string;
}[];
};

export async function PPTX2Text(
async function _PPTX2Text(
fileBuffer: Buffer,
filename?: string
): Promise<PPTXDocument> {
Expand Down Expand Up @@ -70,3 +72,10 @@ export async function PPTX2Text(
}
);
}

/**
 * Public entry point for pptx conversion: hands `_PPTX2Text` to the shared
 * worker pool and resolves with the document it produces.
 *
 * NOTE(review): workerpool documents that a function passed to `exec` is
 * stringified and evaluated inside the worker, so it must not depend on
 * variables from its surrounding scope. If `_PPTX2Text` relies on this
 * module's imports, confirm it actually runs in the worker, or move the
 * conversion into a dedicated worker script.
 *
 * @param fileBuffer - Raw bytes of the presentation, forwarded to `_PPTX2Text`.
 * @param filename - Optional name of the presentation, forwarded as-is.
 * @returns The `PPTXDocument` produced by `_PPTX2Text` in the pool.
 */
export async function PPTX2Text(
  fileBuffer: Buffer,
  filename?: string
): Promise<PPTXDocument> {
  const pool = getWorkerPool();
  return pool.exec(_PPTX2Text, [fileBuffer, filename]);
}
9 changes: 9 additions & 0 deletions connectors/src/lib/workerpool.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
import workerpool from "workerpool";

// Module-level cache for the pool; stays null until the first call to
// getWorkerPool().
let POOL: ReturnType<typeof workerpool.pool> | null = null;

/**
 * Returns the shared worker pool, creating it on first use.
 *
 * The pool is created with `maxWorkers: 3` and then cached in `POOL`, so
 * every later call returns the same instance.
 */
export function getWorkerPool() {
  if (POOL !== null) {
    return POOL;
  }
  POOL = workerpool.pool({ maxWorkers: 3 });
  return POOL;
}
Original file line number Diff line number Diff line change
Expand Up @@ -47,12 +47,12 @@ spec:

resources:
requests:
cpu: 2000m
cpu: 3000m
memory: 4Gi
ephemeral-storage: 4Gi

limits:
cpu: 2000m
cpu: 3000m
memory: 4Gi
ephemeral-storage: 4Gi

Expand Down

0 comments on commit 20e43c9

Please sign in to comment.