Flav/refactor text extraction (#6270)
* Refactor text extraction to use streaming

* ✂️

* 📖
flvndvd authored Jul 17, 2024
1 parent 230dbe3 commit 56470a2
Showing 3 changed files with 95 additions and 115 deletions.
102 changes: 6 additions & 96 deletions types/package-lock.json

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion types/package.json
@@ -27,11 +27,11 @@
},
"dependencies": {
"@notionhq/client": "^2.2.4",
"cheerio": "^1.0.0-rc.12",
"dts-cli": "^2.0.5",
"eslint-plugin-simple-import-sort": "^12.1.0",
"eventsource-parser": "^1.1.1",
"hot-shots": "^10.0.0",
"htmlparser2": "^9.1.0",
"io-ts": "^2.2.20",
"io-ts-reporters": "^2.0.1",
"io-ts-types": "^0.5.19",
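For context on the dependency swap above: cheerio parses the entire document into an in-memory DOM before it can be queried, while htmlparser2 exposes a SAX-style parser that is fed chunks and fires callbacks as tags and text are encountered. A minimal illustrative sketch of the difference (not taken from the repository; the sample HTML and variable names are placeholders):

import * as cheerio from "cheerio";
import { Parser } from "htmlparser2";

// cheerio: the whole document is materialised as a DOM, then queried.
const $ = cheerio.load("<div class='page'>Hello</div>");
const pages = $(".page").map((_, el) => $(el).text()).get(); // ["Hello"]

// htmlparser2: a streaming parser that accepts chunks and emits events,
// so the document never has to be held in memory as a DOM.
let streamed = "";
const parser = new Parser({ ontext: (text) => (streamed += text) });
parser.write("<div class='page'>Hel");
parser.write("lo</div>");
parser.end(); // streamed === "Hello"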
106 changes: 88 additions & 18 deletions types/src/shared/text_extraction.ts
@@ -1,7 +1,8 @@
import * as cheerio from "cheerio";
import { isLeft } from "fp-ts/Either";
import { Parser } from "htmlparser2";
import * as t from "io-ts";
import * as reporter from "io-ts-reporters";
import { Readable } from "stream";

import { Err, Ok, Result } from "./result";

@@ -24,10 +25,10 @@ interface ContentTypeConfig {
}

const contentTypeConfig: ContentTypeConfig = {
"application/pdf": { handler: "html", pageSelector: ".page" },
"application/pdf": { handler: "html", pageSelector: "page" },
"application/vnd.openxmlformats-officedocument.presentationml.presentation": {
handler: "html",
pageSelector: ".slide-content",
pageSelector: "slide-content",
},
};
const DEFAULT_HANDLER = "text";
@@ -45,7 +46,7 @@ export class TextExtraction {
return response;
}

return new Ok(this.processResponse(response.value));
return this.processResponse(response.value);
}

// Query the Tika server and return the response data.
@@ -72,7 +73,7 @@ });
});

if (!response.ok) {
return new Err(new Error(`HTTP error! status: ${response.status}`));
return new Err(new Error(`HTTP error status: ${response.status}`));
}

const data = await response.json();
@@ -92,7 +93,9 @@ }
}

// Process the Tika response and return an array of PageContent.
private processResponse(response: TikaResponse): PageContent[] {
private processResponse(
response: TikaResponse
): Promise<Result<PageContent[], Error>> {
const contentType = response["Content-Type"];

const pageSelector = contentTypeConfig[contentType]?.pageSelector;
@@ -107,24 +110,91 @@
private processContentBySelector(
response: TikaResponse,
contentSelector: string
): PageContent[] {
): Promise<Result<PageContent[], Error>> {
const html = response["X-TIKA:content"];
const $ = cheerio.load(html);
const contentDivs = $(contentSelector);

return contentDivs
.map((index, div) => ({
pageNumber: index + 1,
content: $(div).text()?.trim() || "",
}))
.get();

const stream = Readable.from(html);

// This logic extracts the content of the page based on the selector.
// We use a streaming parser to avoid loading the entire content in memory.
return new Promise<Result<PageContent[], Error>>((resolve) => {
const contentDivs: PageContent[] = [];
let currentPageContent = "";
let insidePage = false;
let pageNumber = 0;
let pageDepth = 0;

const parser = new Parser(
{
onopentag(name, attribs) {
// Check if the current tag is the page selector.
// If it is, we are inside a page.
// This assumes that we don't have nested pages.
if (name === "div" && attribs.class === contentSelector) {
insidePage = true;
pageNumber++;
currentPageContent = "";
pageDepth = 1;
} else if (insidePage) {
// If we are inside a page, increment the page depth to handle nested divs.
// This is required to know when we are done with the page.
pageDepth++;
}
},
ontext(text) {
// If we are inside a page, append the text to the current page content.
if (insidePage) {
currentPageContent += text.trim() + " ";
}
},
onclosetag() {
// If we are inside a page, decrement the page depth.
if (insidePage) {
pageDepth--;
// If the page depth is 0, we are done with the page.
if (pageDepth === 0) {
insidePage = false;
if (currentPageContent.trim()) {
contentDivs.push({
pageNumber: pageNumber,
content: currentPageContent.trim(),
});
}
currentPageContent = "";
}
}
},
onerror(err) {
return resolve(new Err(err));
},
},
{ decodeEntities: true }
);

stream.on("data", (chunk: Buffer) => {
parser.write(chunk.toString());
});

stream.on("end", () => {
parser.end();
return resolve(new Ok(contentDivs));
});

stream.on("error", (err) => {
return resolve(new Err(err));
});
});
}

// Process default response.
private processDefaultResponse(response: TikaResponse): PageContent[] {
private processDefaultResponse(
response: TikaResponse
): Promise<Result<PageContent[], Error>> {
const content = response["X-TIKA:content"];

// Treat the entire content as a single page.
return [{ pageNumber: 1, content: content.trim() }];
return Promise.resolve(
new Ok([{ pageNumber: 1, content: content.trim() }])
);
}
}
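Since both handlers now resolve to a Promise<Result<PageContent[], Error>>, callers of the class have to await the result and unwrap it. A hedged sketch of what that might look like; the method name extractPages, its arguments, the Tika server URL, and the isErr()/error/value accessors on Result are assumptions for illustration, not confirmed by this diff:

// Hypothetical usage; names and signatures below are illustrative assumptions.
async function getPdfPages(fileBuffer: Buffer): Promise<void> {
  const extraction = new TextExtraction("http://localhost:9998"); // assumed Tika server URL
  const res = await extraction.extractPages(fileBuffer, "application/pdf"); // assumed public method

  if (res.isErr()) {
    console.error("Text extraction failed:", res.error);
    return;
  }

  for (const { pageNumber, content } of res.value) {
    console.log(`page ${pageNumber}: ${content.slice(0, 80)}`);
  }
}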
