From 20e43c94ac9f70e78bf5d8755d0559cdccda5e7e Mon Sep 17 00:00:00 2001 From: Aric Lasry Date: Thu, 4 Jul 2024 18:35:53 +0200 Subject: [PATCH] Moving docx2text and pptx2text to a shared workerpool (#6087) --- connectors/package-lock.json | 6 ++++++ connectors/package.json | 1 + .../src/connectors/google_drive/temporal/file.ts | 2 +- connectors/src/lib/{docx2text..ts => docx2text.ts} | 8 +++++++- connectors/src/lib/pptx2text.ts | 11 ++++++++++- connectors/src/lib/workerpool.ts | 9 +++++++++ .../connectors-worker-google-drive-deployment.yaml | 4 ++-- 7 files changed, 36 insertions(+), 5 deletions(-) rename connectors/src/lib/{docx2text..ts => docx2text.ts} (76%) create mode 100644 connectors/src/lib/workerpool.ts diff --git a/connectors/package-lock.json b/connectors/package-lock.json index 32db69afa0ea..f44c38c14fb7 100644 --- a/connectors/package-lock.json +++ b/connectors/package-lock.json @@ -60,6 +60,7 @@ "tsconfig-paths-webpack-plugin": "^4.1.0", "turndown": "^7.1.2", "uuid": "^9.0.0", + "workerpool": "^9.1.3", "yargs": "^17.7.2" }, "devDependencies": { @@ -12162,6 +12163,11 @@ "@types/node": "*" } }, + "node_modules/workerpool": { + "version": "9.1.3", + "resolved": "https://registry.npmjs.org/workerpool/-/workerpool-9.1.3.tgz", + "integrity": "sha512-LhUrk4tbxJRDQmRrrFWA9EnboXI79fe0ZNTy3u8m+dqPN1EkVSIsQYAB8OF/fkyhG8Rtup+c/bzj/+bzbG8fqg==" + }, "node_modules/wrap-ansi": { "version": "7.0.0", "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-7.0.0.tgz", diff --git a/connectors/package.json b/connectors/package.json index 654374c88857..5006b058e3ad 100644 --- a/connectors/package.json +++ b/connectors/package.json @@ -66,6 +66,7 @@ "tsconfig-paths-webpack-plugin": "^4.1.0", "turndown": "^7.1.2", "uuid": "^9.0.0", + "workerpool": "^9.1.3", "yargs": "^17.7.2" }, "devDependencies": { diff --git a/connectors/src/connectors/google_drive/temporal/file.ts b/connectors/src/connectors/google_drive/temporal/file.ts index 7989b71091da..55e326fe1a35 100644 --- 
a/connectors/src/connectors/google_drive/temporal/file.ts +++ b/connectors/src/connectors/google_drive/temporal/file.ts @@ -25,7 +25,7 @@ import { sectionLength, upsertToDatasource, } from "@connectors/lib/data_sources"; -import { docx2text } from "@connectors/lib/docx2text."; +import { docx2text } from "@connectors/lib/docx2text"; import { dpdf2text } from "@connectors/lib/dpdf2text"; import { GoogleDriveConfig, diff --git a/connectors/src/lib/docx2text..ts b/connectors/src/lib/docx2text.ts similarity index 76% rename from connectors/src/lib/docx2text..ts rename to connectors/src/lib/docx2text.ts index d8f4f5e7233a..71681a893809 100644 --- a/connectors/src/lib/docx2text..ts +++ b/connectors/src/lib/docx2text.ts @@ -2,7 +2,9 @@ import tracer from "dd-trace"; import mammoth from "mammoth"; import turndown from "turndown"; -export async function docx2text(fileContent: Buffer, filename: string) { +import { getWorkerPool } from "@connectors/lib/workerpool"; + +async function _docx2text(fileContent: Buffer, filename: string) { return tracer.trace( `gdrive`, { @@ -23,3 +25,7 @@ export async function docx2text(fileContent: Buffer, filename: string) { } ); } + +export async function docx2text(fileContent: Buffer, filename: string) { + return getWorkerPool().exec(_docx2text, [fileContent, filename]); +} diff --git a/connectors/src/lib/pptx2text.ts b/connectors/src/lib/pptx2text.ts index 3cd531f6f439..650d9ef598ef 100644 --- a/connectors/src/lib/pptx2text.ts +++ b/connectors/src/lib/pptx2text.ts @@ -2,13 +2,15 @@ import tracer from "dd-trace"; import JSZip from "jszip"; import turndown from "turndown"; +import { getWorkerPool } from "@connectors/lib/workerpool"; + type PPTXDocument = { pages: { content: string; }[]; }; -export async function PPTX2Text( +async function _PPTX2Text( fileBuffer: Buffer, filename?: string ): Promise<PPTXDocument> { @@ -70,3 +72,10 @@ export async function PPTX2Text( } ); } + +export async function PPTX2Text( + fileBuffer: Buffer, + filename?: string +): 
Promise<PPTXDocument> { + return getWorkerPool().exec(_PPTX2Text, [fileBuffer, filename]); +} diff --git a/connectors/src/lib/workerpool.ts b/connectors/src/lib/workerpool.ts new file mode 100644 index 000000000000..ca2ed623c9d2 --- /dev/null +++ b/connectors/src/lib/workerpool.ts @@ -0,0 +1,9 @@ +import workerpool from "workerpool"; + +let POOL: ReturnType<typeof workerpool.pool> | null = null; +export function getWorkerPool() { + if (!POOL) { + POOL = workerpool.pool({ maxWorkers: 3 }); + } + return POOL; +} diff --git a/k8s/deployments/connectors-worker-google-drive-deployment.yaml b/k8s/deployments/connectors-worker-google-drive-deployment.yaml index fc64d63afd01..c7e9a35b4aaa 100644 --- a/k8s/deployments/connectors-worker-google-drive-deployment.yaml +++ b/k8s/deployments/connectors-worker-google-drive-deployment.yaml @@ -47,12 +47,12 @@ spec: resources: requests: - cpu: 2000m + cpu: 3000m memory: 4Gi ephemeral-storage: 4Gi limits: - cpu: 2000m + cpu: 3000m memory: 4Gi ephemeral-storage: 4Gi