[Keyword search] Webcrawler folders update & backfill #9515
File 1 (new): webcrawler folders backfill script

@@ -0,0 +1,93 @@
import assert from "node:assert";

import { concurrentExecutor } from "@dust-tt/types";
import _ from "lodash";
import { makeScript } from "scripts/helpers";

import { dataSourceConfigFromConnector } from "@connectors/lib/api/data_source_config";
import { upsertDataSourceFolder } from "@connectors/lib/data_sources";
import { WebCrawlerFolder } from "@connectors/lib/models/webcrawler";
import { ConnectorResource } from "@connectors/resources/connector_resource";

makeScript(
  {
    nextConnectorId: {
      type: "number",
      required: false,
      default: 0,
    },
    connectorId: {
      type: "number",
      required: false,
      default: 0,
    },
  },
  async ({ execute, nextConnectorId }, logger) => {
    logger.info(
      {
        nextConnectorId,
      },
      "Starting backfill"
    );

    const connectors = await ConnectorResource.listByType("webcrawler", {});

    // sort connectors by id and start from nextConnectorId
    const sortedConnectors = connectors
      .sort((a, b) => a.id - b.id)
      .filter((_, idx) => idx >= nextConnectorId);

    for (const connector of sortedConnectors) {
      const dataSourceConfig = dataSourceConfigFromConnector(connector);
      const connectorId = connector.id;

      const folders = await WebCrawlerFolder.findAll({
        where: {
          connectorId,
        },
      });

      const foldersByUrl = _.keyBy(folders, "url");

      const getParents = (folder: WebCrawlerFolder): string[] => {
        assert(
          folder.parentUrl === null || foldersByUrl[folder.parentUrl],
          "Parent folder not found"
        );
        const parentFolder = folder.parentUrl
          ? foldersByUrl[folder.parentUrl]
          : null;
        return [
          folder.internalId,
          ...(parentFolder ? getParents(parentFolder) : []),
        ];
      };
      await concurrentExecutor(
        folders,
        async (folder) => {
          logger.info({
            folderId: folder.internalId,
            folderUrl: folder.url,
            execute,
          });
          if (execute) {
            const result = await upsertDataSourceFolder({
              dataSourceConfig,
              folderId: folder.internalId,
              timestampMs: folder.updatedAt.getTime(),
              parents: getParents(folder),
              title: folder.url,
              mimeType: "application/vnd.dust.webcrawler.folder",
            });
            logger.info({
              result,
              folderId: folder.internalId,
              folderUrl: folder.url,
            });
          }
        },
        { concurrency: 8 }
      );
    }
  }
);
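The upsert only runs when the script is invoked with the execute flag; otherwise each folder is just logged, which keeps dry runs cheap. For reference, here is a minimal standalone sketch (not part of the PR) of the ordering getParents produces, using hypothetical URLs and placeholder internalIds: the array starts with the folder itself and walks up to the root, matching the parent-ordering convention used in the crawler changes below.

// Sketch only: FakeFolder stands in for the WebCrawlerFolder model and the
// internalIds are made up.
type FakeFolder = { internalId: string; url: string; parentUrl: string | null };

const foldersByUrl: Record<string, FakeFolder> = {
  "https://example.com/": {
    internalId: "folder-root",
    url: "https://example.com/",
    parentUrl: null,
  },
  "https://example.com/docs/": {
    internalId: "folder-docs",
    url: "https://example.com/docs/",
    parentUrl: "https://example.com/",
  },
  "https://example.com/docs/api/": {
    internalId: "folder-api",
    url: "https://example.com/docs/api/",
    parentUrl: "https://example.com/docs/",
  },
};

const getParents = (folder: FakeFolder): string[] => {
  const parentFolder = folder.parentUrl ? foldersByUrl[folder.parentUrl] : null;
  return [
    folder.internalId,
    ...(parentFolder ? getParents(parentFolder) : []),
  ];
};

// Ordered from the folder itself up to the root:
// ["folder-api", "folder-docs", "folder-root"]
console.log(getParents(foldersByUrl["https://example.com/docs/api/"]));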
File 2 (modified): webcrawler sync code (crawlWebsiteByConnectorId, webCrawlerGarbageCollector)

@@ -30,8 +30,10 @@ import {
 import { dataSourceConfigFromConnector } from "@connectors/lib/api/data_source_config";
 import {
   deleteDataSourceDocument,
+  deleteDataSourceFolder,
   MAX_SMALL_DOCUMENT_TXT_LEN,
   upsertDataSourceDocument,
+  upsertDataSourceFolder,
 } from "@connectors/lib/data_sources";
 import {
   WebCrawlerFolder,
@@ -251,16 +253,21 @@ export async function crawlWebsiteByConnectorId(connectorId: ModelId) {
       totalExtracted += extracted.length;
       const pageTitle = $("title").text();
 
-      const folders = getAllFoldersForUrl(request.url);
-      for (const folder of folders) {
+      // note that parentFolderUrls.length === parentFolderIds.length - 1
+      // since parentFolderIds includes the page as first element
+      // and parentFolderUrls does not
+      const parentFolderUrls = getAllFoldersForUrl(request.url);
+      const parentFolderIds = getParentsForPage(request.url, false);
+
+      for (const [index, folder] of parentFolderUrls.entries()) {
        if (createdFolders.has(folder)) {
          continue;
        }
 
        const logicalParent = isTopFolder(request.url)
          ? null
          : getFolderForUrl(folder);
-        await WebCrawlerFolder.upsert({
+        const [webCrawlerFolder] = await WebCrawlerFolder.upsert({
          url: folder,
          parentUrl: logicalParent,
          connectorId: connector.id,

Review thread on the WebCrawlerFolder.upsert call:
- nit: in the internalId passed here I don't love the fact that we call stableIdForUrl again, since we could get it from getParentsForPage (in the original code).
- Fair point, but the alternative
- If you extracted the parents into a variable and did parents[0], that would make the best of both worlds, no? (Now possible because we do the slice.)
@@ -272,6 +279,19 @@ export async function crawlWebsiteByConnectorId(connectorId: ModelId) {
          lastSeenAt: new Date(),
        });
 
+        await upsertDataSourceFolder({
+          dataSourceConfig,
+          folderId: webCrawlerFolder.internalId,
+          timestampMs: webCrawlerFolder.updatedAt.getTime(),
+
+          // parent folder ids of the page are in hierarchy order from the
+          // page to the root so for the current folder, its parents start at
+          // index + 1 (including itself as first parent) and end at the root
+          parents: parentFolderIds.slice(index + 1),
+          title: folder,
+          mimeType: "application/vnd.dust.webcrawler.folder",
+        });
+
        createdFolders.add(folder);
       }
       const documentId = stableIdForUrl({

(aubin-tchoi marked a review conversation on the parents line as resolved.)
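To make the ordering concrete, here is a minimal standalone sketch (not part of the PR) with hypothetical URLs and placeholder ids; it only illustrates the relationship the comments above describe between parentFolderUrls, parentFolderIds, and the parents slice passed for each folder.

// Hypothetical values: the real ids come from stableIdForUrl.
const pageUrl = "https://example.com/docs/api/errors";

// getAllFoldersForUrl(pageUrl): parent folder URLs, closest first, page excluded.
const parentFolderUrls = [
  "https://example.com/docs/api/",
  "https://example.com/docs/",
  "https://example.com/",
];

// getParentsForPage(pageUrl, false): ids in hierarchy order, page itself first.
const parentFolderIds = [
  "id(https://example.com/docs/api/errors)",
  "id(https://example.com/docs/api/)",
  "id(https://example.com/docs/)",
  "id(https://example.com/)",
];

// The invariant noted in the diff comment:
console.log(parentFolderUrls.length === parentFolderIds.length - 1); // true

// For the folder at `index`, the slice is its own id followed by its ancestors:
for (const [index, folder] of parentFolderUrls.entries()) {
  console.log(folder, parentFolderIds.slice(index + 1));
}
// index 0 (…/docs/api/) -> [id(…/docs/api/), id(…/docs/), id(…/)]
// index 1 (…/docs/)     -> [id(…/docs/), id(…/)]
// index 2 (…/)          -> [id(…/)]

// The page document itself gets the full list, page id first:
console.log(pageUrl, parentFolderIds);

With this ordering, parentFolderIds.slice(index + 1)[0] is the current folder's own id, which is the parents[0] shortcut suggested in the review thread above.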
@@ -342,7 +362,7 @@ export async function crawlWebsiteByConnectorId(connectorId: ModelId) {
        documentUrl: validatedUrl.standardized,
        timestampMs: new Date().getTime(),
        tags: [`title:${stripNullBytes(pageTitle)}`],
-        parents: getParentsForPage(request.url, false),
+        parents: parentFolderIds,
        upsertContext: {
          sync_type: "batch",
        },
@@ -552,6 +572,10 @@ export async function webCrawlerGarbageCollector(
        type: "delete_folder",
      });
      for (const folder of foldersToDelete) {
+        await deleteDataSourceFolder({
+          dataSourceConfig,
+          folderId: folder.internalId,
+        });
        await folder.destroy();
      }
    } while (foldersToDelete.length > 0);
Review comment: nit: I'd log the result of getParents here to make sure nothing's going seriously wrong.
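In the backfill script above, that suggestion might look something like the fragment below (a sketch only, not part of the PR as shown): compute the parents once, include them in the existing log line, and reuse the variable in the upsert.

// Sketch: body of the concurrentExecutor callback in the backfill script.
const parents = getParents(folder);
logger.info({
  folderId: folder.internalId,
  folderUrl: folder.url,
  parents,
  execute,
});
if (execute) {
  const result = await upsertDataSourceFolder({
    dataSourceConfig,
    folderId: folder.internalId,
    timestampMs: folder.updatedAt.getTime(),
    parents,
    title: folder.url,
    mimeType: "application/vnd.dust.webcrawler.folder",
  });
  logger.info({ result, folderId: folder.internalId, folderUrl: folder.url });
}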