From 56470a225a835ce8f0693cb83738bcf02e2927b9 Mon Sep 17 00:00:00 2001 From: Flavien David Date: Wed, 17 Jul 2024 11:27:22 +0200 Subject: [PATCH] Flav/refactor text extraction (#6270) * Refactor text extraction to use streaming * :scissors: * :book: --- types/package-lock.json | 102 ++------------------------ types/package.json | 2 +- types/src/shared/text_extraction.ts | 106 +++++++++++++++++++++++----- 3 files changed, 95 insertions(+), 115 deletions(-) diff --git a/types/package-lock.json b/types/package-lock.json index ee42602ac348..e794ff6b0cdb 100644 --- a/types/package-lock.json +++ b/types/package-lock.json @@ -9,11 +9,11 @@ "version": "0.1.0", "dependencies": { "@notionhq/client": "^2.2.4", - "cheerio": "^1.0.0-rc.12", "dts-cli": "^2.0.5", "eslint-plugin-simple-import-sort": "^12.1.0", "eventsource-parser": "^1.1.1", "hot-shots": "^10.0.0", + "htmlparser2": "^9.1.0", "io-ts": "^2.2.20", "io-ts-reporters": "^2.0.1", "io-ts-types": "^0.5.19", @@ -3528,11 +3528,6 @@ "readable-stream": "^3.4.0" } }, - "node_modules/boolbase": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/boolbase/-/boolbase-1.0.0.tgz", - "integrity": "sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww==" - }, "node_modules/brace-expansion": { "version": "1.1.11", "license": "MIT", @@ -3713,42 +3708,6 @@ "node": ">=10" } }, - "node_modules/cheerio": { - "version": "1.0.0-rc.12", - "resolved": "https://registry.npmjs.org/cheerio/-/cheerio-1.0.0-rc.12.tgz", - "integrity": "sha512-VqR8m68vM46BNnuZ5NtnGBKIE/DfN0cRIzg9n40EIq9NOv90ayxLBXA8fXC5gquFRGJSTRqBq25Jt2ECLR431Q==", - "dependencies": { - "cheerio-select": "^2.1.0", - "dom-serializer": "^2.0.0", - "domhandler": "^5.0.3", - "domutils": "^3.0.1", - "htmlparser2": "^8.0.1", - "parse5": "^7.0.0", - "parse5-htmlparser2-tree-adapter": "^7.0.0" - }, - "engines": { - "node": ">= 6" - }, - "funding": { - "url": "https://github.com/cheeriojs/cheerio?sponsor=1" - } - }, - "node_modules/cheerio-select": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/cheerio-select/-/cheerio-select-2.1.0.tgz", - "integrity": "sha512-9v9kG0LvzrlcungtnJtpGNxY+fzECQKhK4EGJX2vByejiMX84MFNQw4UxPJl3bFbTMw+Dfs37XaIkCwTZfLh4g==", - "dependencies": { - "boolbase": "^1.0.0", - "css-select": "^5.1.0", - "css-what": "^6.1.0", - "domelementtype": "^2.3.0", - "domhandler": "^5.0.3", - "domutils": "^3.0.1" - }, - "funding": { - "url": "https://github.com/sponsors/fb55" - } - }, "node_modules/ci-info": { "version": "3.9.0", "funding": [ @@ -3975,32 +3934,6 @@ "node": ">= 8" } }, - "node_modules/css-select": { - "version": "5.1.0", - "resolved": "https://registry.npmjs.org/css-select/-/css-select-5.1.0.tgz", - "integrity": "sha512-nwoRF1rvRRnnCqqY7updORDsuqKzqYJ28+oSMaJMMgOauh3fvwHqMS7EZpIPqK8GL+g9mKxF1vP/ZjSeNjEVHg==", - "dependencies": { - "boolbase": "^1.0.0", - "css-what": "^6.1.0", - "domhandler": "^5.0.2", - "domutils": "^3.0.1", - "nth-check": "^2.0.1" - }, - "funding": { - "url": "https://github.com/sponsors/fb55" - } - }, - "node_modules/css-what": { - "version": "6.1.0", - "resolved": "https://registry.npmjs.org/css-what/-/css-what-6.1.0.tgz", - "integrity": "sha512-HTUrgRJ7r4dsZKU6GjmpfRK1O76h97Z8MfS1G0FozR+oF2kG6Vfe8JE6zwrkbxigziPHinCJ+gCPjA9EaBDtRw==", - "engines": { - "node": ">= 6" - }, - "funding": { - "url": "https://github.com/sponsors/fb55" - } - }, "node_modules/cssom": { "version": "0.5.0", "license": "MIT" @@ -5766,9 +5699,9 @@ "license": "MIT" }, "node_modules/htmlparser2": { - "version": "8.0.2", - "resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-8.0.2.tgz", - "integrity": "sha512-GYdjWKDkbRLkZ5geuHs5NY1puJ+PXwP7+fHPRz06Eirsb9ugf6d8kkXav6ADhcODhFFPMIXyxkxSuMf3D6NCFA==", + "version": "9.1.0", + "resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-9.1.0.tgz", + "integrity": "sha512-5zfg6mHUoaer/97TxnGpxmbR7zJtPwIYFMZ/H5ucTlPZhKvtum05yiPK3Mgai3a0DyVxv7qYqoweaEd2nrYQzQ==", "funding": [ "https://github.com/fb55/htmlparser2?sponsor=1", { @@ -5779,8 +5712,8 @@ "dependencies": { "domelementtype": "^2.3.0", "domhandler": "^5.0.3", - "domutils": "^3.0.1", - "entities": "^4.4.0" + "domutils": "^3.1.0", + "entities": "^4.5.0" } }, "node_modules/http-proxy-agent": { @@ -7668,17 +7601,6 @@ "node": ">=8" } }, - "node_modules/nth-check": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/nth-check/-/nth-check-2.1.1.tgz", - "integrity": "sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w==", - "dependencies": { - "boolbase": "^1.0.0" - }, - "funding": { - "url": "https://github.com/fb55/nth-check?sponsor=1" - } - }, "node_modules/nwsapi": { "version": "2.2.7", "license": "MIT" @@ -7938,18 +7860,6 @@ "url": "https://github.com/inikulin/parse5?sponsor=1" } }, - "node_modules/parse5-htmlparser2-tree-adapter": { - "version": "7.0.0", - "resolved": "https://registry.npmjs.org/parse5-htmlparser2-tree-adapter/-/parse5-htmlparser2-tree-adapter-7.0.0.tgz", - "integrity": "sha512-B77tOZrqqfUfnVcOrUvfdLbz4pu4RopLD/4vmu3HUPswwTA8OH0EMW9BlWR2B0RCoiZRAHEUu7IxeP1Pd1UU+g==", - "dependencies": { - "domhandler": "^5.0.2", - "parse5": "^7.0.0" - }, - "funding": { - "url": "https://github.com/inikulin/parse5?sponsor=1" - } - }, "node_modules/pascal-case": { "version": "3.1.2", "license": "MIT", diff --git a/types/package.json b/types/package.json index cc59a0553098..49b4a33ac660 100644 --- a/types/package.json +++ b/types/package.json @@ -27,11 +27,11 @@ }, "dependencies": { "@notionhq/client": "^2.2.4", - "cheerio": "^1.0.0-rc.12", "dts-cli": "^2.0.5", "eslint-plugin-simple-import-sort": "^12.1.0", "eventsource-parser": "^1.1.1", "hot-shots": "^10.0.0", + "htmlparser2": "^9.1.0", "io-ts": "^2.2.20", "io-ts-reporters": "^2.0.1", "io-ts-types": "^0.5.19", diff --git a/types/src/shared/text_extraction.ts b/types/src/shared/text_extraction.ts index 721929b77929..5703c100ac04 100644 --- a/types/src/shared/text_extraction.ts +++ b/types/src/shared/text_extraction.ts @@ -1,7 +1,8 @@ -import * as cheerio from "cheerio"; import { isLeft } from "fp-ts/Either"; +import { Parser } from "htmlparser2"; import * as t from "io-ts"; import * as reporter from "io-ts-reporters"; +import { Readable } from "stream"; import { Err, Ok, Result } from "./result"; @@ -24,10 +25,10 @@ interface ContentTypeConfig { } const contentTypeConfig: ContentTypeConfig = { - "application/pdf": { handler: "html", pageSelector: ".page" }, + "application/pdf": { handler: "html", pageSelector: "page" }, "application/vnd.openxmlformats-officedocument.presentationml.presentation": { handler: "html", - pageSelector: ".slide-content", + pageSelector: "slide-content", }, }; const DEFAULT_HANDLER = "text"; @@ -45,7 +46,7 @@ export class TextExtraction { return response; } - return new Ok(this.processResponse(response.value)); + return this.processResponse(response.value); } // Query the Tika server and return the response data. @@ -72,7 +73,7 @@ export class TextExtraction { }); if (!response.ok) { - return new Err(new Error(`HTTP error! status: ${response.status}`)); + return new Err(new Error(`HTTP error status: ${response.status}`)); } const data = await response.json(); @@ -92,7 +93,9 @@ export class TextExtraction { } // Process the Tika response and return an array of PageContent. - private processResponse(response: TikaResponse): PageContent[] { + private processResponse( + response: TikaResponse + ): Promise> { const contentType = response["Content-Type"]; const pageSelector = contentTypeConfig[contentType]?.pageSelector; @@ -107,24 +110,91 @@ export class TextExtraction { private processContentBySelector( response: TikaResponse, contentSelector: string - ): PageContent[] { + ): Promise> { const html = response["X-TIKA:content"]; - const $ = cheerio.load(html); - const contentDivs = $(contentSelector); - - return contentDivs - .map((index, div) => ({ - pageNumber: index + 1, - content: $(div).text()?.trim() || "", - })) - .get(); + + const stream = Readable.from(html); + + // This logic extract the content of the page based on the selector. + // We use a streaming parser to avoid loading the entire content in memory. + return new Promise>((resolve) => { + const contentDivs: PageContent[] = []; + let currentPageContent = ""; + let insidePage = false; + let pageNumber = 0; + let pageDepth = 0; + + const parser = new Parser( + { + onopentag(name, attribs) { + // Check if the current tag is the page selector. + // If it is, we are inside a page. + // This assumes that we don't have nested pages. + if (name === "div" && attribs.class === contentSelector) { + insidePage = true; + pageNumber++; + currentPageContent = ""; + pageDepth = 1; + } else if (insidePage) { + // If we are inside a page, increment the page depth to handle nested divs. + // This is required to know when we are done with the page. + pageDepth++; + } + }, + ontext(text) { + // If we are inside a page, append the text to the current page content. + if (insidePage) { + currentPageContent += text.trim() + " "; + } + }, + onclosetag() { + // If we are inside a page, decrement the page depth. + if (insidePage) { + pageDepth--; + // If the page depth is 0, we are done with the page. + if (pageDepth === 0) { + insidePage = false; + if (currentPageContent.trim()) { + contentDivs.push({ + pageNumber: pageNumber, + content: currentPageContent.trim(), + }); + } + currentPageContent = ""; + } + } + }, + onerror(err) { + return resolve(new Err(err)); + }, + }, + { decodeEntities: true } + ); + + stream.on("data", (chunk: Buffer) => { + parser.write(chunk.toString()); + }); + + stream.on("end", () => { + parser.end(); + return resolve(new Ok(contentDivs)); + }); + + stream.on("error", (err) => { + return resolve(new Err(err)); + }); + }); } // Process default response. - private processDefaultResponse(response: TikaResponse): PageContent[] { + private processDefaultResponse( + response: TikaResponse + ): Promise> { const content = response["X-TIKA:content"]; // Treat the entire content as a single page. - return [{ pageNumber: 1, content: content.trim() }]; + return Promise.resolve( + new Ok([{ pageNumber: 1, content: content.trim() }]) + ); } }