Skip to content

Commit

Permalink
Flav/apache tika setup (#6274)
Browse files Browse the repository at this point in the history
* Add apache-tika in docker-compose

* ✨
  • Loading branch information
flvndvd authored Jul 17, 2024
1 parent 58af3e0 commit c7d175e
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 17 deletions.
14 changes: 6 additions & 8 deletions connectors/src/connectors/google_drive/temporal/file.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
import type { CoreAPIDataSourceDocumentSection, ModelId } from "@dust-tt/types";
import { slugify, TextExtraction } from "@dust-tt/types";
import {
isTextExtractionSupportedContentType,
slugify,
TextExtraction,
} from "@dust-tt/types";
import tracer from "dd-trace";
import type { OAuth2Client } from "googleapis-common";
import type { CreationAttributes } from "sequelize";
Expand Down Expand Up @@ -234,13 +238,7 @@ export async function syncOneFile(
"Unexpected GDrive export response type"
);
}
} else if (
[
"application/pdf",
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
].includes(file.mimeType)
) {
} else if (isTextExtractionSupportedContentType(file.mimeType)) {
if (!(res.data instanceof ArrayBuffer)) {
localLogger.error(
{ mimeType: file.mimeType },
Expand Down
4 changes: 4 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,10 @@ services:
image: redis
ports:
- 6379:6379
apache-tika:
image: apache/tika:2.9.2.1
ports:
- "9998:9998"

volumes:
pgsql:
Expand Down
36 changes: 27 additions & 9 deletions types/src/shared/text_extraction.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,22 @@ interface PageContent {
content: string;
}

// Per-content-type configuration: every string key maps to an entry holding a
// handler name and a page selector, both required.
interface ContentTypeConfig {
[key: string]: { handler: string; pageSelector: string };
}
// Content types accepted for text extraction. All of these are supported by
// the Tika server; before adding a new content type, make sure to test it.
// Declared `as const` so the literal strings can be reused as a type below.
const supportedContentTypes = [
"application/pdf",
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
] as const;

// Union of the string literals listed in `supportedContentTypes`.
type SupportedContentTypes = (typeof supportedContentTypes)[number];

// Optional per-content-type settings, keyed by supported content type. Each
// entry names the handler ("html" or "text") and may carry a page selector.
// A content type with no entry is allowed.
type ContentTypeConfig = Partial<
  Record<
    SupportedContentTypes,
    {
      handler: "html" | "text";
      pageSelector?: string;
    }
  >
>;

const contentTypeConfig: ContentTypeConfig = {
"application/pdf": { handler: "html", pageSelector: "page" },
Expand All @@ -39,7 +52,7 @@ export class TextExtraction {
// Method to extract text from a buffer.
async fromBuffer(
fileBuffer: Buffer,
contentType: string
contentType: SupportedContentTypes
): Promise<Result<PageContent[], Error>> {
const response = await this.queryTika(fileBuffer, contentType);
if (response.isErr()) {
Expand All @@ -52,15 +65,13 @@ export class TextExtraction {
// Query the Tika server and return the response data.
private async queryTika(
fileBuffer: Buffer,
contentType: string
contentType: SupportedContentTypes
): Promise<Result<TikaResponse, Error>> {
// Determine the handler type based on the content type.
// The HTML handler preserves the structural information of the document
// like page structure, etc. The text handler does not.
const handlerType =
contentType in contentTypeConfig
? contentTypeConfig[contentType].handler
: DEFAULT_HANDLER;
contentTypeConfig[contentType]?.handler ?? DEFAULT_HANDLER;

try {
const response = await fetch(`${this.url}/tika/${handlerType}`, {
Expand Down Expand Up @@ -98,7 +109,8 @@ export class TextExtraction {
): Promise<Result<PageContent[], Error>> {
const contentType = response["Content-Type"];

const pageSelector = contentTypeConfig[contentType]?.pageSelector;
const pageSelector =
contentTypeConfig[contentType as SupportedContentTypes]?.pageSelector;
if (pageSelector) {
return this.processContentBySelector(response, pageSelector);
}
Expand Down Expand Up @@ -198,3 +210,9 @@ export class TextExtraction {
);
}
}

/**
 * Type guard reporting whether `contentType` is one of the entries of
 * `supportedContentTypes`.
 *
 * @param contentType - Arbitrary content-type string to check.
 * @returns `true` when the string is listed, in which case the compiler
 * narrows it to `SupportedContentTypes`; `false` otherwise.
 */
export function isTextExtractionSupportedContentType(
  contentType: string
): contentType is SupportedContentTypes {
  // View the literal tuple as plain strings so the lookup accepts any input
  // without asserting the argument's type.
  const knownContentTypes: readonly string[] = supportedContentTypes;
  return knownContentTypes.includes(contentType);
}

0 comments on commit c7d175e

Please sign in to comment.