Skip to content

Commit

Permalink
[MS Connector] Compute parents array for document upsertion (#6233)
Browse files Browse the repository at this point in the history
* comments on internal id for microsoft

* exclude folders whose parents are already in sync list

* parents implementation

* clean

* fix filter/async

* review
  • Loading branch information
philipperolet authored and albandum committed Aug 28, 2024
1 parent 1e3cc71 commit 7199d4b
Show file tree
Hide file tree
Showing 5 changed files with 192 additions and 6 deletions.
28 changes: 28 additions & 0 deletions connectors/src/connectors/microsoft/lib/graph_api.ts
Original file line number Diff line number Diff line change
Expand Up @@ -373,6 +373,16 @@ export function getDriveItemAPIPath(item: MicrosoftGraph.DriveItem) {
return `/drives/${parentReference.driveId}/items/${item.id}`;
}

export function getParentReferenceAPIPath(
parentReference: MicrosoftGraph.ItemReference
) {
if (!parentReference.driveId) {
throw new Error("Unexpected: no drive id for item");
}

return `/drives/${parentReference.driveId}/items/${parentReference.id}`;
}

export function getWorksheetAPIPath(
item: MicrosoftGraph.WorkbookWorksheet,
parentInternalId: string
Expand All @@ -391,6 +401,24 @@ export function getDriveAPIPath(drive: MicrosoftGraph.Drive) {
return `/drives/${drive.id}`;
}

export function getDriveAPIPathFromItem(item: MicrosoftGraph.DriveItem) {
if (!item.parentReference?.driveId) {
throw new Error("Unexpected: no drive id for item");
}

return `/drives/${item.parentReference.driveId}`;
}

export function getDriveItemAPIPathFromReference(
parentReference: MicrosoftGraph.ItemReference
) {
if (!parentReference.driveId) {
throw new Error("Unexpected: no drive id for item");
}

return `/drives/${parentReference.driveId}/items/${parentReference.id}`;
}

export function getSiteAPIPath(site: MicrosoftGraph.Site) {
return `/sites/${site.id}`;
}
7 changes: 7 additions & 0 deletions connectors/src/connectors/microsoft/lib/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,13 @@ export function isValidNodeType(
return MICROSOFT_NODE_TYPES.includes(nodeType as MicrosoftNodeType);
}

/* A specific situation for the Microsoft connector leads us to not use the
* externally provided id (although it exists and is unique), but to compute our
* own `internal id`. This is because the Microsoft API does not allow to query a document or
* list its children using its id alone. We compute an internal id that contains all
* information. More details
* [here](https://www.notion.so/dust-tt/Design-Doc-Microsoft-ids-parents-c27726652aae45abafaac587b971a41d?pvs=4)
*/
export type MicrosoftNode = {
nodeType: MicrosoftNodeType;
name: string | null;
Expand Down
144 changes: 140 additions & 4 deletions connectors/src/connectors/microsoft/temporal/activities.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import type { CoreAPIDataSourceDocumentSection, ModelId } from "@dust-tt/types";
import { cacheWithRedis } from "@dust-tt/types";
import type { Client } from "@microsoft/microsoft-graph-client";
import axios from "axios";
import mammoth from "mammoth";
import type { Logger } from "pino";
Expand All @@ -7,14 +9,17 @@ import turndown from "turndown";
import { getClient } from "@connectors/connectors/microsoft";
import {
getAllPaginatedEntities,
getDriveAPIPathFromItem,
getDriveItemAPIPath,
getDriveItemAPIPathFromReference,
getDrives,
getFilesAndFolders,
getItem,
getSiteAPIPath,
getSites,
internalIdFromTypeAndPath,
itemToMicrosoftNode,
typeAndPathFromInternalId,
} from "@connectors/connectors/microsoft/lib/graph_api";
import type { MicrosoftNode } from "@connectors/connectors/microsoft/lib/types";
import { getMimeTypesToSync } from "@connectors/connectors/microsoft/temporal/mime_types";
Expand Down Expand Up @@ -42,6 +47,7 @@ import {
import type { DataSourceConfig } from "@connectors/types/data_source_config";

const FILES_SYNC_CONCURRENCY = 10;
const PARENT_SYNC_CACHE_TTL_MS = 10 * 60 * 1000;

export async function getSiteNodesToSync(
connectorId: ModelId
Expand Down Expand Up @@ -117,14 +123,88 @@ export async function getSiteNodesToSync(
[] as MicrosoftNode[]
);

// for all folders, check if a parent folder or drive is already in the list,
// in which case remove it. This can happen because when a user selects a
// folder to sync, then a parent folder, both are stored in Microsoft Roots
// table

// Keeping them both in the sync list can result in various kinds of issues,
// e.g. if a child folder is synced before the parent, then the child folder's
// files' parents array will be incomplete, thus the need to prune the list
const nodesToSync = [];
for (const node of allNodes) {
if (
!(
node.nodeType === "folder" &&
(await isParentAlreadyInNodes({
client,
nodes: allNodes,
folder: node,
}))
)
) {
nodesToSync.push(node);
}
}

const nodeResources = await MicrosoftNodeResource.batchUpdateOrCreate(
connectorId,
allNodes
nodesToSync
);

return nodeResources.map((r) => r.internalId);
}

async function isParentAlreadyInNodes({
client,
nodes,
folder,
}: {
client: Client;
nodes: MicrosoftNode[];
folder: MicrosoftNode;
}) {
const { itemAPIPath } = typeAndPathFromInternalId(folder.internalId);
let driveItem: microsoftgraph.DriveItem = await getItem(client, itemAPIPath);

// check if the list already contains the drive of this folder
if (
nodes.some(
(node) =>
node.internalId ===
internalIdFromTypeAndPath({
nodeType: "drive",
itemAPIPath: getDriveAPIPathFromItem(driveItem),
})
)
) {
return true;
}

// check if the list already contains any parent of this folder
while (!driveItem.root) {
if (!driveItem.parentReference) {
return false;
}

const parentAPIPath = getDriveItemAPIPathFromReference(
driveItem.parentReference
);

const parentInternalId = internalIdFromTypeAndPath({
nodeType: "folder",
itemAPIPath: parentAPIPath,
});

if (nodes.some((node) => node.internalId === parentInternalId)) {
return true;
}

driveItem = await getItem(client, parentAPIPath);
}
return false;
}

export async function markNodeAsVisited(
connectorId: ModelId,
internalId: string
Expand Down Expand Up @@ -457,9 +537,12 @@ export async function syncOneFile({

const isInSizeRange = documentLength > 0 && documentLength < maxDocumentLen;
if (isInSizeRange) {
// TODO(pr): add getParents implementation
const parents = [];
parents.push(documentId);
const parents = await getParents({
connectorId,
internalId: documentId,
parentInternalId,
startSyncTs,
});
parents.reverse();

await upsertToDatasource({
Expand Down Expand Up @@ -505,6 +588,59 @@ export async function syncOneFile({
return isInSizeRange;
}

async function getParents({
connectorId,
internalId,
parentInternalId,
startSyncTs,
}: {
connectorId: ModelId;
internalId: string;
parentInternalId: string | null;
startSyncTs: number;
}): Promise<string[]> {
if (!parentInternalId) {
return [internalId];
}

const parentParentInternalId = await getParentParentId(
connectorId,
parentInternalId,
startSyncTs
);

return [
internalId,
...(await getParents({
connectorId,
internalId: parentInternalId,
parentInternalId: parentParentInternalId,
startSyncTs,
})),
];
}

/* Fetching parent's parent id queries the db for a resource; since those
* fetches can be made a lot of times during a sync, cache for 10mins in a
* per-sync basis (given by startSyncTs) */
const getParentParentId = cacheWithRedis(
// eslint-disable-next-line @typescript-eslint/no-unused-vars
async (connectorId, parentInternalId, startSyncTs) => {
const parent = await MicrosoftNodeResource.fetchByInternalId(
connectorId,
parentInternalId
);
if (!parent) {
throw new Error(`Parent node not found: ${parentInternalId}`);
}

return parent.parentInternalId;
},
(connectorId, parentInternalId, startSyncTs) =>
`microsoft-${connectorId}-parent-${parentInternalId}-syncms-${startSyncTs}`,
PARENT_SYNC_CACHE_TTL_MS
);

async function handlePptxFile(
data: ArrayBuffer,
fileId: string | undefined,
Expand Down
9 changes: 9 additions & 0 deletions core/src/data_sources/data_source.rs
Original file line number Diff line number Diff line change
Expand Up @@ -326,6 +326,15 @@ pub struct Chunk {
/// considered a parent, and has a proper external “repo id”, which is stored at
/// 2nd place in the array
///
/// Additional note: in cases where selection of elements to sync is done on
/// Dust side and not on provider's side, we need to be able to list all the
/// children of a resource given its id (this is the case for google drive and
/// microsoft for instance). While google drive's API and ids allows it, this is
/// not the case for Microsoft. Therefore, for microsoft, instead of using the
/// provider id directly, we compute our own document id containing all
/// information for the querying the document using Microsoft's API. More details
/// [here](https://www.notion.so/dust-tt/Design-Doc-Microsoft-ids-parents-c27726652aae45abafaac587b971a41d?pvs=4)
///
/// Parents array
/// -------------
/// At index 0 is the string id of the document itself, then at index 1 its
Expand Down
10 changes: 8 additions & 2 deletions types/src/front/lib/connectors_api.ts
Original file line number Diff line number Diff line change
Expand Up @@ -86,8 +86,14 @@ export type ContentNodeType = "file" | "folder" | "database" | "channel";
* provided id when possible (e.g. Notion page id, Github repository id,
* etc...). When not possible, such as for Github issues whose id is not
* workspace-unique, a custom function to create a unique id is created, and
* used both in the parents field management code and the connectors node
* code.
* used both in the parents field management code and the connectors node code.
*
* A specific situation for the Microsoft connector leads us to not use the
* externally provided id (although it exists and is unique), but to compute our
* own. This is because the Microsoft API does not allow to query a document or
* list its children using its id alone. We compute an internal id that contains all
* information. More details here:
* https://www.notion.so/dust-tt/Design-Doc-Microsoft-ids-parents-c27726652aae45abafaac587b971a41d?pvs=4
*/
export type ContentNode = {
provider: ConnectorProvider;
Expand Down

0 comments on commit 7199d4b

Please sign in to comment.