From cdf5ee64e2865d611fa093a58dd08f67a00266b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daphn=C3=A9=20Popin?= Date: Mon, 22 Jul 2024 18:31:36 +0200 Subject: [PATCH] Lower size of docs from webcrawler (#6411) * Increase upsertToDatasource request timeout * Attempt to reduce size of content from webcrawler --- connectors/src/connectors/webcrawler/temporal/activities.ts | 4 ++-- connectors/src/lib/data_sources.ts | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/connectors/src/connectors/webcrawler/temporal/activities.ts b/connectors/src/connectors/webcrawler/temporal/activities.ts index ef69a1245243..51be583fecf4 100644 --- a/connectors/src/connectors/webcrawler/temporal/activities.ts +++ b/connectors/src/connectors/webcrawler/temporal/activities.ts @@ -23,7 +23,7 @@ import { import { dataSourceConfigFromConnector } from "@connectors/lib/api/data_source_config"; import { deleteFromDataSource, - MAX_DOCUMENT_TXT_LEN, + MAX_SMALL_DOCUMENT_TXT_LEN, upsertToDatasource, } from "@connectors/lib/data_sources"; import { @@ -274,7 +274,7 @@ export async function crawlWebsiteByConnectorId(connectorId: ModelId) { try { if ( extracted.length > 0 && - extracted.length <= MAX_DOCUMENT_TXT_LEN + extracted.length <= MAX_SMALL_DOCUMENT_TXT_LEN ) { await upsertToDatasource({ dataSourceConfig, diff --git a/connectors/src/lib/data_sources.ts b/connectors/src/lib/data_sources.ts index 68afd1433ab2..1ba937a48b9f 100644 --- a/connectors/src/lib/data_sources.ts +++ b/connectors/src/lib/data_sources.ts @@ -29,6 +29,7 @@ import type { DataSourceConfig } from "@connectors/types/data_source_config"; const axiosWithTimeout = axios.create({ timeout: 60000, // Ensure client timeout is lower than the target server timeout. + // See --keepAliveTimeout in next start command from front. 
httpAgent: new http.Agent({ keepAlive: true, keepAliveMsecs: 4000 }), httpsAgent: new https.Agent({ keepAlive: true, keepAliveMsecs: 4000 }), }); @@ -41,6 +42,8 @@ if (!DUST_FRONT_API) { // We limit the document size we support. Beyond a certain size, upsert is simply too slow (>300s) // and large files are generally less useful anyway. export const MAX_DOCUMENT_TXT_LEN = 750000; +// For some data sources (e.g. the webcrawler) we only allow smaller documents to be processed. +export const MAX_SMALL_DOCUMENT_TXT_LEN = 500000; // For some data sources we allow large documents (5mb) to be processed (behind flag). export const MAX_LARGE_DOCUMENT_TXT_LEN = 5000000;