From 95cd0259b64418d57c5d1168c47a6cdfb13b9359 Mon Sep 17 00:00:00 2001
From: filou
Date: Tue, 17 Dec 2024 23:01:12 +0100
Subject: [PATCH 1/5] folders update

---
 .../webcrawler/temporal/activities.ts         | 37 +++++++++++++++++--
 1 file changed, 33 insertions(+), 4 deletions(-)

diff --git a/connectors/src/connectors/webcrawler/temporal/activities.ts b/connectors/src/connectors/webcrawler/temporal/activities.ts
index 1c26d76ef531..8d71fb8e86d2 100644
--- a/connectors/src/connectors/webcrawler/temporal/activities.ts
+++ b/connectors/src/connectors/webcrawler/temporal/activities.ts
@@ -251,8 +251,13 @@ export async function crawlWebsiteByConnectorId(connectorId: ModelId) {
         totalExtracted += extracted.length;
         const pageTitle = $("title").text();
 
-        const folders = getAllFoldersForUrl(request.url);
-        for (const folder of folders) {
+        // note that parentFolderUrls.length === parentFolderIds.length -1
+        // since parentFolderIds includes the page as first element
+        // and parentFolderUrls does not
+        const parentFolderUrls = getAllFoldersForUrl(request.url);
+        const parentFolderIds = getParentsForPage(request.url, false);
+
+        for (const [index, folder] of parentFolderUrls.entries()) {
           if (createdFolders.has(folder)) {
             continue;
           }
@@ -260,7 +265,7 @@ export async function crawlWebsiteByConnectorId(connectorId: ModelId) {
           const logicalParent = isTopFolder(request.url)
             ? null
             : getFolderForUrl(folder);
-          await WebCrawlerFolder.upsert({
+          const [webCrawlerFolder] = await WebCrawlerFolder.upsert({
             url: folder,
             parentUrl: logicalParent,
             connectorId: connector.id,
@@ -272,6 +277,18 @@ export async function crawlWebsiteByConnectorId(connectorId: ModelId) {
             lastSeenAt: new Date(),
           });
 
+          await upsertFolderNode({
+            dataSourceConfig,
+            folderId: webCrawlerFolder.internalId,
+            timestampMs: webCrawlerFolder.updatedAt.getTime(),
+
+            // parent folder ids of the page are in hierarchy order from the
+            // page to the root so for the current folder, its parents start at
+            // index+1 (including itself as first parent) and end at the root
+            parents: parentFolderIds.slice(index + 1),
+            title: folder,
+          });
+
           createdFolders.add(folder);
         }
         const documentId = stableIdForUrl({
@@ -342,7 +359,7 @@ export async function crawlWebsiteByConnectorId(connectorId: ModelId) {
           documentUrl: validatedUrl.standardized,
           timestampMs: new Date().getTime(),
           tags: [`title:${stripNullBytes(pageTitle)}`],
-          parents: getParentsForPage(request.url, false),
+          parents: parentFolderIds,
           upsertContext: {
             sync_type: "batch",
           },
@@ -560,3 +577,15 @@ export async function webCrawlerGarbageCollector(
 export async function getConnectorIdsForWebsitesToCrawl() {
   return WebCrawlerConfigurationResource.getConnectorIdsForWebsitesToCrawl();
 }
+function upsertFolderNode(arg0: {
+  dataSourceConfig: import("../../../types/data_source_config").DataSourceConfig;
+  folderId: string;
+  timestampMs: number;
+  // parent folder ids of the page are in hierarchy order from the
+  // page to the root so for the current folder, its parents start at
+  // index+1 (including itself as first parent) and end at the root
+  parents: string[];
+  title: string;
+}) {
+  throw new Error("Function not implemented.");
+}

From 93a0e1edbbb24f076b8b6e340c41804057dc9d43 Mon Sep 17 00:00:00 2001
From: filou
Date: Wed, 18 Dec 2024 17:43:49 +0100
Subject: [PATCH 2/5] delete folders

---
 .../webcrawler/temporal/activities.ts         | 21 +++++++------------
 1 file changed, 8 insertions(+), 13 deletions(-)

diff --git a/connectors/src/connectors/webcrawler/temporal/activities.ts b/connectors/src/connectors/webcrawler/temporal/activities.ts
index 8d71fb8e86d2..cea57969e776 100644
--- a/connectors/src/connectors/webcrawler/temporal/activities.ts
+++ b/connectors/src/connectors/webcrawler/temporal/activities.ts
@@ -30,8 +30,10 @@ import {
 import { dataSourceConfigFromConnector } from "@connectors/lib/api/data_source_config";
 import {
   deleteDataSourceDocument,
+  deleteDataSourceFolder,
   MAX_SMALL_DOCUMENT_TXT_LEN,
   upsertDataSourceDocument,
+  upsertDataSourceFolder,
 } from "@connectors/lib/data_sources";
 import {
   WebCrawlerFolder,
@@ -277,7 +279,7 @@ export async function crawlWebsiteByConnectorId(connectorId: ModelId) {
             lastSeenAt: new Date(),
           });
 
-          await upsertFolderNode({
+          await upsertDataSourceFolder({
             dataSourceConfig,
             folderId: webCrawlerFolder.internalId,
             timestampMs: webCrawlerFolder.updatedAt.getTime(),
@@ -287,6 +289,7 @@ export async function crawlWebsiteByConnectorId(connectorId: ModelId) {
             // index+1 (including itself as first parent) and end at the root
             parents: parentFolderIds.slice(index + 1),
             title: folder,
+            mimeType: "application/vnd.dust.webcrawler.folder",
           });
 
           createdFolders.add(folder);
@@ -569,6 +572,10 @@ export async function webCrawlerGarbageCollector(
       type: "delete_folder",
     });
     for (const folder of foldersToDelete) {
+      await deleteDataSourceFolder({
+        dataSourceConfig,
+        folderId: folder.internalId,
+      });
       await folder.destroy();
     }
   } while (foldersToDelete.length > 0);
@@ -577,15 +584,3 @@ export async function webCrawlerGarbageCollector(
 export async function getConnectorIdsForWebsitesToCrawl() {
   return WebCrawlerConfigurationResource.getConnectorIdsForWebsitesToCrawl();
 }
-function upsertFolderNode(arg0: {
-  dataSourceConfig: import("../../../types/data_source_config").DataSourceConfig;
-  folderId: string;
-  timestampMs: number;
-  // parent folder ids of the page are in hierarchy order from the
-  // page to the root so for the current folder, its parents start at
-  // index+1 (including itself as first parent) and end at the root
-  parents: string[];
-  title: string;
-}) {
-  throw new Error("Function not implemented.");
-}

From 8fac453a858936ca4627a4660305643202100d06 Mon Sep 17 00:00:00 2001
From: filou
Date: Wed, 18 Dec 2024 18:14:34 +0100
Subject: [PATCH 3/5] backfill

---
 .../20241218_backfill_webcrawler_folders.ts   | 86 +++++++++++++++++++
 1 file changed, 86 insertions(+)
 create mode 100644 connectors/migrations/20241218_backfill_webcrawler_folders.ts

diff --git a/connectors/migrations/20241218_backfill_webcrawler_folders.ts b/connectors/migrations/20241218_backfill_webcrawler_folders.ts
new file mode 100644
index 000000000000..81a9c5264d83
--- /dev/null
+++ b/connectors/migrations/20241218_backfill_webcrawler_folders.ts
@@ -0,0 +1,86 @@
+import { makeScript } from "scripts/helpers";
+
+import { dataSourceConfigFromConnector } from "@connectors/lib/api/data_source_config";
+import { concurrentExecutor } from "@dust-tt/types";
+import { upsertDataSourceFolder } from "@connectors/lib/data_sources";
+import { ConnectorResource } from "@connectors/resources/connector_resource";
+import { WebCrawlerFolder } from "@connectors/lib/models/webcrawler";
+import _ from "lodash";
+import assert from "node:assert";
+
+makeScript(
+  {
+    nextConnectorId: {
+      type: "number",
+      required: false,
+      default: 0,
+    },
+    connectorId: {
+      type: "number",
+      required: false,
+      default: 0,
+    },
+  },
+  async ({ execute, nextConnectorId }, logger) => {
+    logger.info(
+      {
+        nextConnectorId,
+      },
+      "Starting backfill"
+    );
+
+    const connectors = await ConnectorResource.listByType("webcrawler", {});
+
+    // sort connectors by id and start from nextConnectorId
+    const sortedConnectors = connectors
+      .sort((a, b) => a.id - b.id)
+      .filter((_, idx) => idx >= nextConnectorId);
+
+    for (const connector of sortedConnectors) {
+      const dataSourceConfig = dataSourceConfigFromConnector(connector);
+      const connectorId = connector.id;
+
+      const folders = await WebCrawlerFolder.findAll({
+        where: {
+          connectorId,
+        },
+      });
+
+      const foldersByUrl = _.keyBy(folders, "url");
+
+      const getParents = (folder: WebCrawlerFolder): string[] => {
+        assert(
+          folder.parentUrl === null || foldersByUrl[folder.parentUrl],
+          "Parent folder not found"
+        );
+        const parentFolder = folder.parentUrl
+          ? foldersByUrl[folder.parentUrl]
+          : null;
+        return [
+          folder.internalId,
+          ...(parentFolder ? getParents(parentFolder) : []),
+        ];
+      };
+      await concurrentExecutor(
+        folders,
+        async (folder) => {
+          logger.info({
+            folder,
+            execute,
+          });
+          if (execute) {
+            await upsertDataSourceFolder({
+              dataSourceConfig,
+              folderId: folder.internalId,
+              timestampMs: folder.updatedAt.getTime(),
+              parents: getParents(folder),
+              title: folder.url,
+              mimeType: "application/vnd.dust.webcrawler.folder",
+            });
+          }
+        },
+        { concurrency: 8 }
+      );
+    }
+  }
+);

From d04888339d75e466856d676d5bff235001d2d7d3 Mon Sep 17 00:00:00 2001
From: filou
Date: Wed, 18 Dec 2024 18:36:32 +0100
Subject: [PATCH 4/5] clean

---
 .../migrations/20241218_backfill_webcrawler_folders.ts | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/connectors/migrations/20241218_backfill_webcrawler_folders.ts b/connectors/migrations/20241218_backfill_webcrawler_folders.ts
index 81a9c5264d83..906765003a28 100644
--- a/connectors/migrations/20241218_backfill_webcrawler_folders.ts
+++ b/connectors/migrations/20241218_backfill_webcrawler_folders.ts
@@ -1,12 +1,13 @@
+import assert from "node:assert";
+
+import { concurrentExecutor } from "@dust-tt/types";
+import _ from "lodash";
 import { makeScript } from "scripts/helpers";
 
 import { dataSourceConfigFromConnector } from "@connectors/lib/api/data_source_config";
-import { concurrentExecutor } from "@dust-tt/types";
 import { upsertDataSourceFolder } from "@connectors/lib/data_sources";
-import { ConnectorResource } from "@connectors/resources/connector_resource";
 import { WebCrawlerFolder } from "@connectors/lib/models/webcrawler";
-import _ from "lodash";
-import assert from "node:assert";
+import { ConnectorResource } from "@connectors/resources/connector_resource";
 
 makeScript(
   {

From 80ca9cd6e507bf49fa5d58c16ed71e6a123a2931 Mon Sep 17 00:00:00 2001
From: filou
Date: Wed, 18 Dec 2024 18:37:55 +0100
Subject: [PATCH 5/5] log

---
 .../migrations/20241218_backfill_webcrawler_folders.ts | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/connectors/migrations/20241218_backfill_webcrawler_folders.ts b/connectors/migrations/20241218_backfill_webcrawler_folders.ts
index 906765003a28..edbaacd52132 100644
--- a/connectors/migrations/20241218_backfill_webcrawler_folders.ts
+++ b/connectors/migrations/20241218_backfill_webcrawler_folders.ts
@@ -66,11 +66,12 @@ makeScript(
         folders,
         async (folder) => {
           logger.info({
-            folder,
+            folderId: folder.internalId,
+            folderUrl: folder.url,
             execute,
           });
           if (execute) {
-            await upsertDataSourceFolder({
+            const result = await upsertDataSourceFolder({
               dataSourceConfig,
               folderId: folder.internalId,
               timestampMs: folder.updatedAt.getTime(),
               parents: getParents(folder),
               title: folder.url,
               mimeType: "application/vnd.dust.webcrawler.folder",
             });
+            logger.info({
+              result,
+              folderId: folder.internalId,
+              folderUrl: folder.url,
+            });
           }
         },
         { concurrency: 8 }