diff --git a/connectors/migrations/20241218_backfill_webcrawler_folders.ts b/connectors/migrations/20241218_backfill_webcrawler_folders.ts
new file mode 100644
index 000000000000..edbaacd52132
--- /dev/null
+++ b/connectors/migrations/20241218_backfill_webcrawler_folders.ts
@@ -0,0 +1,93 @@
+import assert from "node:assert";
+
+import { concurrentExecutor } from "@dust-tt/types";
+import _ from "lodash";
+import { makeScript } from "scripts/helpers";
+
+import { dataSourceConfigFromConnector } from "@connectors/lib/api/data_source_config";
+import { upsertDataSourceFolder } from "@connectors/lib/data_sources";
+import { WebCrawlerFolder } from "@connectors/lib/models/webcrawler";
+import { ConnectorResource } from "@connectors/resources/connector_resource";
+
+makeScript(
+  {
+    nextConnectorId: {
+      type: "number",
+      required: false,
+      default: 0,
+    },
+    connectorId: {
+      type: "number",
+      required: false,
+      default: 0,
+    },
+  },
+  async ({ execute, nextConnectorId }, logger) => {
+    logger.info(
+      {
+        nextConnectorId,
+      },
+      "Starting backfill"
+    );
+
+    const connectors = await ConnectorResource.listByType("webcrawler", {});
+
+    // sort connectors by id and start from nextConnectorId
+    const sortedConnectors = connectors
+      .sort((a, b) => a.id - b.id)
+      .filter((_, idx) => idx >= nextConnectorId);
+
+    for (const connector of sortedConnectors) {
+      const dataSourceConfig = dataSourceConfigFromConnector(connector);
+      const connectorId = connector.id;
+
+      const folders = await WebCrawlerFolder.findAll({
+        where: {
+          connectorId,
+        },
+      });
+
+      const foldersByUrl = _.keyBy(folders, "url");
+
+      const getParents = (folder: WebCrawlerFolder): string[] => {
+        assert(
+          folder.parentUrl === null || foldersByUrl[folder.parentUrl],
+          "Parent folder not found"
+        );
+        const parentFolder = folder.parentUrl
+          ? foldersByUrl[folder.parentUrl]
+          : null;
+        return [
+          folder.internalId,
+          ...(parentFolder ? getParents(parentFolder) : []),
+        ];
+      };
+      await concurrentExecutor(
+        folders,
+        async (folder) => {
+          logger.info({
+            folderId: folder.internalId,
+            folderUrl: folder.url,
+            execute,
+          });
+          if (execute) {
+            const result = await upsertDataSourceFolder({
+              dataSourceConfig,
+              folderId: folder.internalId,
+              timestampMs: folder.updatedAt.getTime(),
+              parents: getParents(folder),
+              title: folder.url,
+              mimeType: "application/vnd.dust.webcrawler.folder",
+            });
+            logger.info({
+              result,
+              folderId: folder.internalId,
+              folderUrl: folder.url,
+            });
+          }
+        },
+        { concurrency: 8 }
+      );
+    }
+  }
+);
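For clarity, a minimal sketch of the parent-chain convention the migration's getParents helper relies on: each folder lists its own internalId first, then its ancestors in order up to the root. The FolderRow shape, URLs, and internalId values below are invented for illustration; the real records are WebCrawlerFolder rows keyed by url via lodash keyBy.

type FolderRow = { internalId: string; url: string; parentUrl: string | null };

// Hypothetical rows standing in for the WebCrawlerFolder records of one connector.
const rows: FolderRow[] = [
  { internalId: "folder-root", url: "https://example.com", parentUrl: null },
  { internalId: "folder-docs", url: "https://example.com/docs", parentUrl: "https://example.com" },
  { internalId: "folder-api", url: "https://example.com/docs/api", parentUrl: "https://example.com/docs" },
];

const foldersByUrl = new Map<string, FolderRow>(rows.map((r) => [r.url, r]));

// Same recursion as getParents above: the folder's own internalId comes first,
// followed by each ancestor's internalId, ending at the root.
function getParents(folder: FolderRow): string[] {
  const parent = folder.parentUrl ? foldersByUrl.get(folder.parentUrl) : null;
  return [folder.internalId, ...(parent ? getParents(parent) : [])];
}

console.log(getParents(rows[2]!));
// -> ["folder-api", "folder-docs", "folder-root"]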
diff --git a/connectors/src/connectors/webcrawler/temporal/activities.ts b/connectors/src/connectors/webcrawler/temporal/activities.ts
index 1c26d76ef531..cea57969e776 100644
--- a/connectors/src/connectors/webcrawler/temporal/activities.ts
+++ b/connectors/src/connectors/webcrawler/temporal/activities.ts
@@ -30,8 +30,10 @@ import {
 import { dataSourceConfigFromConnector } from "@connectors/lib/api/data_source_config";
 import {
   deleteDataSourceDocument,
+  deleteDataSourceFolder,
   MAX_SMALL_DOCUMENT_TXT_LEN,
   upsertDataSourceDocument,
+  upsertDataSourceFolder,
 } from "@connectors/lib/data_sources";
 import {
   WebCrawlerFolder,
@@ -251,8 +253,13 @@ export async function crawlWebsiteByConnectorId(connectorId: ModelId) {
         totalExtracted += extracted.length;
         const pageTitle = $("title").text();
 
-        const folders = getAllFoldersForUrl(request.url);
-        for (const folder of folders) {
+        // note that parentFolderUrls.length === parentFolderIds.length -1
+        // since parentFolderIds includes the page as first element
+        // and parentFolderUrls does not
+        const parentFolderUrls = getAllFoldersForUrl(request.url);
+        const parentFolderIds = getParentsForPage(request.url, false);
+
+        for (const [index, folder] of parentFolderUrls.entries()) {
           if (createdFolders.has(folder)) {
             continue;
           }
@@ -260,7 +267,7 @@ export async function crawlWebsiteByConnectorId(connectorId: ModelId) {
           const logicalParent = isTopFolder(request.url)
             ? null
             : getFolderForUrl(folder);
-          await WebCrawlerFolder.upsert({
+          const [webCrawlerFolder] = await WebCrawlerFolder.upsert({
             url: folder,
             parentUrl: logicalParent,
             connectorId: connector.id,
@@ -272,6 +279,19 @@ export async function crawlWebsiteByConnectorId(connectorId: ModelId) {
             lastSeenAt: new Date(),
           });
 
+          await upsertDataSourceFolder({
+            dataSourceConfig,
+            folderId: webCrawlerFolder.internalId,
+            timestampMs: webCrawlerFolder.updatedAt.getTime(),
+
+            // parent folder ids of the page are in hierarchy order from the
+            // page to the root so for the current folder, its parents start at
+            // index+1 (including itself as first parent) and end at the root
+            parents: parentFolderIds.slice(index + 1),
+            title: folder,
+            mimeType: "application/vnd.dust.webcrawler.folder",
+          });
+
           createdFolders.add(folder);
         }
         const documentId = stableIdForUrl({
@@ -342,7 +362,7 @@ export async function crawlWebsiteByConnectorId(connectorId: ModelId) {
           documentUrl: validatedUrl.standardized,
           timestampMs: new Date().getTime(),
           tags: [`title:${stripNullBytes(pageTitle)}`],
-          parents: getParentsForPage(request.url, false),
+          parents: parentFolderIds,
           upsertContext: {
             sync_type: "batch",
           },
@@ -552,6 +572,10 @@ export async function webCrawlerGarbageCollector(
       type: "delete_folder",
     });
     for (const folder of foldersToDelete) {
+      await deleteDataSourceFolder({
+        dataSourceConfig,
+        folderId: folder.internalId,
+      });
       await folder.destroy();
     }
   } while (foldersToDelete.length > 0);
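To make the index arithmetic in the crawl hunk concrete: per the comments above, getParentsForPage returns ids ordered from the page itself to the root, while getAllFoldersForUrl returns only the folder URLs, so the folder at position index finds its own chain (itself included) at parentFolderIds.slice(index + 1). The URLs and ids below are invented for illustration; the real values come from those two helpers and stableIdForUrl.

// Hypothetical arrays for a page under https://example.com/docs/api/;
// the real values come from getAllFoldersForUrl / getParentsForPage.
const parentFolderUrls = [
  "https://example.com/docs/api", // index 0: closest folder to the page
  "https://example.com/docs",     // index 1
  "https://example.com",          // index 2: top folder
];
const parentFolderIds = [
  "page-id",        // the page itself is the first element
  "folder-api-id",  // id for parentFolderUrls[0]
  "folder-docs-id", // id for parentFolderUrls[1]
  "folder-root-id", // id for parentFolderUrls[2]
];

// parentFolderUrls.length === parentFolderIds.length - 1, so for the folder at
// `index` the slice below yields [its own id, ...ancestor ids up to the root].
for (const [index, folderUrl] of parentFolderUrls.entries()) {
  console.log(folderUrl, parentFolderIds.slice(index + 1));
}
// https://example.com/docs/api [ 'folder-api-id', 'folder-docs-id', 'folder-root-id' ]
// https://example.com/docs     [ 'folder-docs-id', 'folder-root-id' ]
// https://example.com          [ 'folder-root-id' ]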