From 2a10253f401177c224b532d566a1d63222901f23 Mon Sep 17 00:00:00 2001 From: filou Date: Mon, 15 Jul 2024 17:00:23 +0200 Subject: [PATCH 1/6] comments on internal id for microsoft --- connectors/src/connectors/microsoft/lib/types.ts | 7 +++++++ core/src/data_sources/data_source.rs | 9 +++++++++ types/src/front/lib/connectors_api.ts | 10 ++++++++-- 3 files changed, 24 insertions(+), 2 deletions(-) diff --git a/connectors/src/connectors/microsoft/lib/types.ts b/connectors/src/connectors/microsoft/lib/types.ts index 151e4352d091..e76e258af759 100644 --- a/connectors/src/connectors/microsoft/lib/types.ts +++ b/connectors/src/connectors/microsoft/lib/types.ts @@ -19,6 +19,13 @@ export function isValidNodeType( return MICROSOFT_NODE_TYPES.includes(nodeType as MicrosoftNodeType); } +/* A specific situation for the Microsoft connector leads us to not use the + * externally provided id (although it exists and is unique), but to compute our + * own `internal id`. This is because the Microsoft API does not allow to query a document or + * list its children using its id alone. We compute an internal id that contains all + * information. More details + * [here](https://www.notion.so/dust-tt/Design-Doc-Microsoft-ids-parents-c27726652aae45abafaac587b971a41d?pvs=4) + */ export type MicrosoftNode = { nodeType: MicrosoftNodeType; name: string | null; diff --git a/core/src/data_sources/data_source.rs b/core/src/data_sources/data_source.rs index 51de374b4681..aa98bdd55d7f 100644 --- a/core/src/data_sources/data_source.rs +++ b/core/src/data_sources/data_source.rs @@ -326,6 +326,15 @@ pub struct Chunk { /// considered a parent, and has a proper external “repo id”, which is stored at /// 2nd place in the array /// +/// Additional note: in cases where selection of elements to sync is done on +/// Dust side and not on provider's side, we need to be able to list all the +/// children of a resource given its id (this is the case for google drive and +/// microsoft for instance). While google drive's API and ids allows it, this is +/// not the case for Microsoft. Therefore, for microsoft, instead of using the +/// provider id directly, we compute our own document id containing all +/// information for the querying the document using Microsoft's API. More details +/// [here](https://www.notion.so/dust-tt/Design-Doc-Microsoft-ids-parents-c27726652aae45abafaac587b971a41d?pvs=4) +/// /// Parents array /// ------------- /// At index 0 is the string id of the document itself, then at index 1 its diff --git a/types/src/front/lib/connectors_api.ts b/types/src/front/lib/connectors_api.ts index 51a18a5d21cd..59b0d540c294 100644 --- a/types/src/front/lib/connectors_api.ts +++ b/types/src/front/lib/connectors_api.ts @@ -86,8 +86,14 @@ export type ContentNodeType = "file" | "folder" | "database" | "channel"; * provided id when possible (e.g. Notion page id, Github repository id, * etc...). When not possible, such as for Github issues whose id is not * workspace-unique, a custom function to create a unique id is created, and - * used both in the parents field management code and the connectors node - * code. + * used both in the parents field management code and the connectors node code. + * + * A specific situation for the Microsoft connector leads us to not use the + * externally provided id (although it exists and is unique), but to compute our + * own. This is because the Microsoft API does not allow to query a document or + * list its children using its id alone. We compute an internal id that contains all + * information. More details here: + * https://www.notion.so/dust-tt/Design-Doc-Microsoft-ids-parents-c27726652aae45abafaac587b971a41d?pvs=4 */ export type ContentNode = { provider: ConnectorProvider; From f2eda447a5a8ab6854a79169cdf389a4150dc2eb Mon Sep 17 00:00:00 2001 From: filou Date: Mon, 15 Jul 2024 21:58:09 +0200 Subject: [PATCH 2/6] exclude folders whose parents are already in sync list --- .../src/connectors/microsoft/lib/graph_api.ts | 28 +++++++ .../microsoft/temporal/activities.ts | 76 ++++++++++++++++++- 2 files changed, 103 insertions(+), 1 deletion(-) diff --git a/connectors/src/connectors/microsoft/lib/graph_api.ts b/connectors/src/connectors/microsoft/lib/graph_api.ts index f9ce60fa0a08..74b79a6971da 100644 --- a/connectors/src/connectors/microsoft/lib/graph_api.ts +++ b/connectors/src/connectors/microsoft/lib/graph_api.ts @@ -373,6 +373,16 @@ export function getDriveItemAPIPath(item: MicrosoftGraph.DriveItem) { return `/drives/${parentReference.driveId}/items/${item.id}`; } +export function getParentReferenceAPIPath( + parentReference: MicrosoftGraph.ItemReference +) { + if (!parentReference.driveId) { + throw new Error("Unexpected: no drive id for item"); + } + + return `/drives/${parentReference.driveId}/items/${parentReference.id}`; +} + export function getWorksheetAPIPath( item: MicrosoftGraph.WorkbookWorksheet, parentInternalId: string @@ -391,6 +401,24 @@ export function getDriveAPIPath(drive: MicrosoftGraph.Drive) { return `/drives/${drive.id}`; } +export function getDriveAPIPathFromItem(item: MicrosoftGraph.DriveItem) { + if (!item.parentReference?.driveId) { + throw new Error("Unexpected: no drive id for item"); + } + + return `/drives/${item.parentReference.driveId}`; +} + +export function getDriveItemAPIPathFromReference( + parentReference: MicrosoftGraph.ItemReference +) { + if (!parentReference.driveId) { + throw new Error("Unexpected: no drive id for item"); + } + + return `/drives/${parentReference.driveId}/items/${parentReference.id}`; +} + export function getSiteAPIPath(site: MicrosoftGraph.Site) { return `/sites/${site.id}`; } diff --git a/connectors/src/connectors/microsoft/temporal/activities.ts b/connectors/src/connectors/microsoft/temporal/activities.ts index eedb7e3fba1c..2e62358715cd 100644 --- a/connectors/src/connectors/microsoft/temporal/activities.ts +++ b/connectors/src/connectors/microsoft/temporal/activities.ts @@ -7,7 +7,9 @@ import turndown from "turndown"; import { getClient } from "@connectors/connectors/microsoft"; import { getAllPaginatedEntities, + getDriveAPIPathFromItem, getDriveItemAPIPath, + getDriveItemAPIPathFromReference, getDrives, getFilesAndFolders, getItem, @@ -15,6 +17,7 @@ import { getSites, internalIdFromTypeAndPath, itemToMicrosoftNode, + typeAndPathFromInternalId, } from "@connectors/connectors/microsoft/lib/graph_api"; import type { MicrosoftNode } from "@connectors/connectors/microsoft/lib/types"; import { getMimeTypesToSync } from "@connectors/connectors/microsoft/temporal/mime_types"; @@ -40,6 +43,7 @@ import { MicrosoftRootResource, } from "@connectors/resources/microsoft_resource"; import type { DataSourceConfig } from "@connectors/types/data_source_config"; +import { Client } from "@microsoft/microsoft-graph-client"; const FILES_SYNC_CONCURRENCY = 10; @@ -117,14 +121,84 @@ export async function getSiteNodesToSync( [] as MicrosoftNode[] ); + // for all folders, check if a parent folder or drive is already in the list, + // in which case remove it this can happen because when a user selects a + // folder to sync, then a parent folder, both are storeed in Microsoft Roots + // table + + // Keeping them both in the sync list can result in various kinds of issues, + // e.g. if a child folder is synced before the parent, then the child folder's + // files' parents array will be incomplete, thus the need to prune the list + const nodesToSync = allNodes.filter( + async (node) => + !( + node.nodeType === "folder" && + (await isParentAlreadyInNodes({ + client, + nodes: allNodes, + folder: node, + })) + ) + ); + const nodeResources = await MicrosoftNodeResource.batchUpdateOrCreate( connectorId, - allNodes + nodesToSync ); return nodeResources.map((r) => r.internalId); } +async function isParentAlreadyInNodes({ + client, + nodes, + folder, +}: { + client: Client; + nodes: MicrosoftNode[]; + folder: MicrosoftNode; +}) { + const { itemAPIPath } = typeAndPathFromInternalId(folder.internalId); + let driveItem: microsoftgraph.DriveItem = await getItem(client, itemAPIPath); + + // check if the list already contains the drive of this folder + if ( + nodes.some( + (node) => + node.internalId === + internalIdFromTypeAndPath({ + nodeType: "drive", + itemAPIPath: getDriveAPIPathFromItem(driveItem), + }) + ) + ) { + return true; + } + + // check if the list already contains any parent of this folder + while (!driveItem.root) { + if (!driveItem.parentReference) { + return false; + } + + const parentAPIPath = getDriveItemAPIPathFromReference( + driveItem.parentReference + ); + + const parentInternalId = internalIdFromTypeAndPath({ + nodeType: "folder", + itemAPIPath: parentAPIPath, + }); + + if (nodes.some((node) => node.internalId === parentInternalId)) { + return true; + } + + driveItem = await getItem(client, parentAPIPath); + } + return false; +} + export async function markNodeAsVisited( connectorId: ModelId, internalId: string From cef2e5fbb599eb56f96db19161420f96bc762fe0 Mon Sep 17 00:00:00 2001 From: filou Date: Mon, 15 Jul 2024 22:18:18 +0200 Subject: [PATCH 3/6] parents implementation --- .../microsoft/temporal/activities.ts | 62 ++++++++++++++++++- 1 file changed, 59 insertions(+), 3 deletions(-) diff --git a/connectors/src/connectors/microsoft/temporal/activities.ts b/connectors/src/connectors/microsoft/temporal/activities.ts index 2e62358715cd..fe394783cf7c 100644 --- a/connectors/src/connectors/microsoft/temporal/activities.ts +++ b/connectors/src/connectors/microsoft/temporal/activities.ts @@ -44,6 +44,7 @@ import { } from "@connectors/resources/microsoft_resource"; import type { DataSourceConfig } from "@connectors/types/data_source_config"; import { Client } from "@microsoft/microsoft-graph-client"; +import { start } from "repl"; const FILES_SYNC_CONCURRENCY = 10; @@ -531,9 +532,12 @@ export async function syncOneFile({ const isInSizeRange = documentLength > 0 && documentLength < maxDocumentLen; if (isInSizeRange) { - // TODO(pr): add getParents implementation - const parents = []; - parents.push(documentId); + const parents = await getParents({ + connectorId, + internalId: documentId, + parentInternalId, + startSyncTs, + }); parents.reverse(); await upsertToDatasource({ @@ -579,6 +583,58 @@ export async function syncOneFile({ return isInSizeRange; } +async function getParents({ + connectorId, + internalId, + parentInternalId, + startSyncTs, +}: { + connectorId: ModelId; + internalId: string; + parentInternalId: string | null; + startSyncTs: number; +}): Promise { + if (!parentInternalId) { + return [internalId]; + } + + const parentParentInternalId = await getParentParentId( + connectorId, + parentInternalId, + startSyncTs + ); + + return [ + internalId, + ...(await getParents({ + connectorId, + internalId: parentInternalId, + parentInternalId: parentParentInternalId, + startSyncTs, + })), + ]; +} + +/* Fetching parent's parent id queries the db for a resource; since those + * fetches can be made a lot of times during a sync, cache for 10mins in a + * per-sync basis (given by startSyncTs) */ +const getParentParentId = cacheWithRedis( + async (connectorId, parentInternalId, startSyncTs) => { + const parent = await MicrosoftNodeResource.fetchByInternalId( + connectorId, + parentInternalId + ); + if (!parent) { + throw new Error(`Parent node not found: ${parentInternalId}`); + } + + return parent.parentInternalId; + }, + (connectorId, parentInternalId, startSyncTs) => + `microsoft-${connectorId}-parent-${parentInternalId}-syncms-${startSyncTs}`, + 10 * 60 * 1000 +); + async function handlePptxFile( data: ArrayBuffer, fileId: string | undefined, From 560cc45362c5d4d959831c95e032cef63b029667 Mon Sep 17 00:00:00 2001 From: filou Date: Mon, 15 Jul 2024 22:29:10 +0200 Subject: [PATCH 4/6] clean --- connectors/src/connectors/microsoft/temporal/activities.ts | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/connectors/src/connectors/microsoft/temporal/activities.ts b/connectors/src/connectors/microsoft/temporal/activities.ts index fe394783cf7c..ce4ae3df73c5 100644 --- a/connectors/src/connectors/microsoft/temporal/activities.ts +++ b/connectors/src/connectors/microsoft/temporal/activities.ts @@ -1,4 +1,6 @@ import type { CoreAPIDataSourceDocumentSection, ModelId } from "@dust-tt/types"; +import { cacheWithRedis } from "@dust-tt/types"; +import type { Client } from "@microsoft/microsoft-graph-client"; import axios from "axios"; import mammoth from "mammoth"; import type { Logger } from "pino"; @@ -43,8 +45,6 @@ import { MicrosoftRootResource, } from "@connectors/resources/microsoft_resource"; import type { DataSourceConfig } from "@connectors/types/data_source_config"; -import { Client } from "@microsoft/microsoft-graph-client"; -import { start } from "repl"; const FILES_SYNC_CONCURRENCY = 10; @@ -619,6 +619,7 @@ async function getParents({ * fetches can be made a lot of times during a sync, cache for 10mins in a * per-sync basis (given by startSyncTs) */ const getParentParentId = cacheWithRedis( + // eslint-disable-next-line @typescript-eslint/no-unused-vars async (connectorId, parentInternalId, startSyncTs) => { const parent = await MicrosoftNodeResource.fetchByInternalId( connectorId, From 04d09b38be7b8c0954731e5cc16d06b631005ff2 Mon Sep 17 00:00:00 2001 From: filou Date: Tue, 16 Jul 2024 14:34:37 +0200 Subject: [PATCH 5/6] fix filter/async --- .../src/connectors/microsoft/temporal/activities.ts | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/connectors/src/connectors/microsoft/temporal/activities.ts b/connectors/src/connectors/microsoft/temporal/activities.ts index ce4ae3df73c5..db5e1ca80701 100644 --- a/connectors/src/connectors/microsoft/temporal/activities.ts +++ b/connectors/src/connectors/microsoft/temporal/activities.ts @@ -130,8 +130,9 @@ export async function getSiteNodesToSync( // Keeping them both in the sync list can result in various kinds of issues, // e.g. if a child folder is synced before the parent, then the child folder's // files' parents array will be incomplete, thus the need to prune the list - const nodesToSync = allNodes.filter( - async (node) => + const nodesToSync = []; + for (const node of allNodes) { + if ( !( node.nodeType === "folder" && (await isParentAlreadyInNodes({ @@ -140,7 +141,10 @@ export async function getSiteNodesToSync( folder: node, })) ) - ); + ) { + nodesToSync.push(node); + } + } const nodeResources = await MicrosoftNodeResource.batchUpdateOrCreate( connectorId, From fc04f8a3dd13256919c4734789409b1503df1201 Mon Sep 17 00:00:00 2001 From: filou Date: Tue, 16 Jul 2024 14:54:06 +0200 Subject: [PATCH 6/6] review --- connectors/src/connectors/microsoft/temporal/activities.ts | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/connectors/src/connectors/microsoft/temporal/activities.ts b/connectors/src/connectors/microsoft/temporal/activities.ts index db5e1ca80701..dd6e13d92530 100644 --- a/connectors/src/connectors/microsoft/temporal/activities.ts +++ b/connectors/src/connectors/microsoft/temporal/activities.ts @@ -47,6 +47,7 @@ import { import type { DataSourceConfig } from "@connectors/types/data_source_config"; const FILES_SYNC_CONCURRENCY = 10; +const PARENT_SYNC_CACHE_TTL_MS = 10 * 60 * 1000; export async function getSiteNodesToSync( connectorId: ModelId @@ -123,8 +124,8 @@ export async function getSiteNodesToSync( ); // for all folders, check if a parent folder or drive is already in the list, - // in which case remove it this can happen because when a user selects a - // folder to sync, then a parent folder, both are storeed in Microsoft Roots + // in which case remove it. This can happen because when a user selects a + // folder to sync, then a parent folder, both are stored in Microsoft Roots // table // Keeping them both in the sync list can result in various kinds of issues, @@ -637,7 +638,7 @@ const getParentParentId = cacheWithRedis( }, (connectorId, parentInternalId, startSyncTs) => `microsoft-${connectorId}-parent-${parentInternalId}-syncms-${startSyncTs}`, - 10 * 60 * 1000 + PARENT_SYNC_CACHE_TTL_MS ); async function handlePptxFile(