From 60f48a50e435dc155f139f9e3e009d8d80e1d242 Mon Sep 17 00:00:00 2001 From: Jules Belveze <32683010+JulesBelveze@users.noreply.github.com> Date: Fri, 19 Jul 2024 13:47:29 +0200 Subject: [PATCH] [connectors] - fix(microsoft): data sour selection tree (#6321) * [connectors/microsoft] - feature: extend Microsoft connector to include file nodes - Add support for listing file nodes alongside folder nodes in Microsoft connector responses - Implement permission filtering logic for both files and folders when constructing content nodes - Create a `getFileAsContentNode` function for mapping Microsoft DriveItem to ContentNode for files - Enhance permission mapping to account for parent permissions when assigning to content nodes * [connectors/microsoft] - feature: enhance node retrieval with read permission filter - Added support to use the "read" permission filter when retrieving child nodes - Removed getFileAsContentNode call and integrated getMicrosoftNodeAsContentNode for node retrieval - Implemented a new function to get content nodes from MicrosoftNodeModel - Refactored node type determination logic into getMicrosoftNodeAsContentNode function - Removed conditional file handling related to permission filtering within the content node retrieval process * [connectors] - feature: expand retrieval of child nodes with content inclusion flag - Add an additional flag to the `retrieveChildrenNodes` function to control the inclusion of additional content - Expose `retrieveChildrenNodes` function for potential external use by making it exportable * [connectors] - refactor: remove redundant fetch parameter in Microsoft connector - Simplified the node fetching logic by removing an unnecessary boolean parameter * fix: lint/format * [connectors] - feature: enhance filtering for children nodes retrieval in Microsoft connector - Include a new condition to filter only 'file', 'folder', and 'drive' node types when retrieving children nodes - Utilize Sequelize's Op.in to specify the multiple nodeType values in the query condition * [microsoft/temporal] - fix: adjust spreadsheet syncing logic - Remove early return for spreadsheet mimeType to align with new sync logic - Pass `parentInternalId` when syncing spreadsheets and worksheets to maintain hierarchy - Ensure spreadsheets have the `parentInternalId` when upserted to the database for relational integrity * [connectors/microsoft] - refactor: streamline children node retrieval in MicrosoftConnector - Replace direct usage of `MicrosoftNodeModel` with `MicrosoftNodeResource` - Introduce method `fetchNodesWithoutParents` to get nodes without parent references - Create `fetchChildren` method in `MicrosoftNodeResource` to encapsulate children node fetching logic - Adjust `retrieveChildrenNodes` function to utilize new `MicrosoftNodeResource` methods for improved readability and maintainability --------- Co-authored-by: Jules Co-authored-by: Jules --- connectors/src/connectors/microsoft/index.ts | 61 +++++++++++++++---- .../connectors/microsoft/lib/content_nodes.ts | 56 ++++++++++++++++- .../microsoft/temporal/activities.ts | 8 +-- .../microsoft/temporal/spreadsheets.ts | 25 ++++++-- .../src/resources/microsoft_resource.ts | 36 +++++++++-- 5 files changed, 155 insertions(+), 31 deletions(-) diff --git a/connectors/src/connectors/microsoft/index.ts b/connectors/src/connectors/microsoft/index.ts index a2c03ae57a89..2a2e4ec482f9 100644 --- a/connectors/src/connectors/microsoft/index.ts +++ b/connectors/src/connectors/microsoft/index.ts @@ -15,6 +15,7 @@ import { getChannelAsContentNode, getDriveAsContentNode, getFolderAsContentNode, + getMicrosoftNodeAsContentNode, getSiteAsContentNode, getTeamAsContentNode, } from "@connectors/connectors/microsoft/lib/content_nodes"; @@ -40,6 +41,7 @@ import logger from "@connectors/logger/logger"; import { ConnectorResource } from "@connectors/resources/connector_resource"; import { MicrosoftConfigurationResource, + MicrosoftNodeResource, MicrosoftRootResource, } from "@connectors/resources/microsoft_resource"; import type { DataSourceConfig } from "@connectors/types/data_source_config"; @@ -168,6 +170,24 @@ export class MicrosoftConnectorManager extends BaseConnectorManager { ); } + if (filterPermission === "read") { + if (!parentInternalId) { + const nodes = await MicrosoftNodeResource.fetchNodesWithoutParents(); + return new Ok( + nodes.map((node) => getMicrosoftNodeAsContentNode(node, false)) + ); + } + const node = await MicrosoftNodeResource.fetchByInternalId( + this.connectorId, + parentInternalId + ); + if (!node) { + return new Err( + new Error(`Could not find node with id ${parentInternalId}`) + ); + } + return retrieveChildrenNodes(node, false); + } const client = await getClient(connector.connectionId); const nodes = []; @@ -227,11 +247,10 @@ export class MicrosoftConnectorManager extends BaseConnectorManager { } case "drive": case "folder": { - const folders = ( - await getAllPaginatedEntities((nextLink) => - getFilesAndFolders(client, parentInternalId, nextLink) - ) - ).filter((n) => n.folder); + const filesAndFolders = await getAllPaginatedEntities((nextLink) => + getFilesAndFolders(client, parentInternalId, nextLink) + ); + const folders = filesAndFolders.filter((n) => n.folder); nodes.push( ...folders.map((n) => getFolderAsContentNode(n, parentInternalId)) ); @@ -250,12 +269,16 @@ export class MicrosoftConnectorManager extends BaseConnectorManager { } } - const nodesWithPermissions = nodes.map((res) => ({ - ...res, - permission: (selectedResources.includes(res.internalId) - ? "read" - : "none") as ConnectorPermission, - })); + const nodesWithPermissions = nodes.map((res) => { + return { + ...res, + permission: (selectedResources.includes(res.internalId) || + (res.parentInternalId && + selectedResources.includes(res.parentInternalId)) + ? "read" + : "none") as ConnectorPermission, + }; + }); if (filterPermission) { return new Ok( @@ -495,3 +518,19 @@ export async function getClient(connectionId: NangoConnectionId) { authProvider: (done) => done(null, msAccessToken), }); } + +export async function retrieveChildrenNodes( + microsoftNode: MicrosoftNodeResource, + expandWorksheet: boolean +): Promise> { + const childrenNodes = await microsoftNode.fetchChildren([ + "file", + "folder", + "drive", + ]); + return new Ok( + childrenNodes.map((node) => + getMicrosoftNodeAsContentNode(node, expandWorksheet) + ) + ); +} diff --git a/connectors/src/connectors/microsoft/lib/content_nodes.ts b/connectors/src/connectors/microsoft/lib/content_nodes.ts index f34e3e1376b3..0cd653a076a6 100644 --- a/connectors/src/connectors/microsoft/lib/content_nodes.ts +++ b/connectors/src/connectors/microsoft/lib/content_nodes.ts @@ -1,4 +1,4 @@ -import type { ContentNode } from "@dust-tt/types"; +import type { ContentNode, ContentNodeType } from "@dust-tt/types"; import { getDriveInternalId, @@ -7,6 +7,7 @@ import { internalIdFromTypeAndPath, typeAndPathFromInternalId, } from "@connectors/connectors/microsoft/lib/graph_api"; +import type { MicrosoftNodeResource } from "@connectors/resources/microsoft_resource"; export function getRootNodes(): ContentNode[] { return [getSitesRootAsContentNode(), getTeamsRootAsContentNode()]; @@ -155,3 +156,56 @@ export function getFolderAsContentNode( permission: "none", }; } + +export function getFileAsContentNode( + file: microsoftgraph.DriveItem, + parentInternalId: string +): ContentNode { + return { + provider: "microsoft", + internalId: getDriveItemInternalId(file), + parentInternalId, + type: "file", + title: file.name || "unnamed", + sourceUrl: "", + dustDocumentId: null, + lastUpdatedAt: null, + expandable: false, + permission: "none", + }; +} + +export function getMicrosoftNodeAsContentNode( + node: MicrosoftNodeResource, + expandWorksheet: boolean +): ContentNode { + // When table picking we want spreadsheets to expand to select the different + // sheets. While extracting data we want to treat them as regular files. + const isExpandable = + !node.mimeType || + (node.mimeType === + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" && + expandWorksheet); + let type: ContentNodeType; + if (["drive", "folder"].includes(node.nodeType)) { + type = "folder"; + } else if (node.nodeType === "worksheet") { + type = expandWorksheet ? "database" : "file"; + } else if (node.nodeType === "file") { + type = node.nodeType; + } else { + throw new Error(`Unsupported nodeType ${node.nodeType}.`); + } + return { + provider: "microsoft", + internalId: node.internalId, + parentInternalId: node.parentInternalId, + type, + title: node.name || "unnamed", + sourceUrl: "", + dustDocumentId: null, + lastUpdatedAt: null, + expandable: isExpandable, + permission: "none", + }; +} diff --git a/connectors/src/connectors/microsoft/temporal/activities.ts b/connectors/src/connectors/microsoft/temporal/activities.ts index b681b2ea6ce6..58c3c4850652 100644 --- a/connectors/src/connectors/microsoft/temporal/activities.ts +++ b/connectors/src/connectors/microsoft/temporal/activities.ts @@ -472,13 +472,6 @@ export async function syncOneFile({ return false; } - if ( - mimeType === - "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" - ) { - return (await syncSpreadSheet({ connectorId, file })).isSupported; - } - const maxDocumentLen = providerConfig.largeFilesEnabled ? MAX_LARGE_DOCUMENT_TXT_LEN : MAX_DOCUMENT_TXT_LEN; @@ -552,6 +545,7 @@ export async function syncOneFile({ const res = await syncSpreadSheet({ connectorId, file, + parentInternalId, }); if (!res.isSupported) { return false; diff --git a/connectors/src/connectors/microsoft/temporal/spreadsheets.ts b/connectors/src/connectors/microsoft/temporal/spreadsheets.ts index 1b94de02bc14..ff8505123b01 100644 --- a/connectors/src/connectors/microsoft/temporal/spreadsheets.ts +++ b/connectors/src/connectors/microsoft/temporal/spreadsheets.ts @@ -23,7 +23,8 @@ const MAXIMUM_NUMBER_OF_EXCEL_SHEET_ROWS = 50000; async function upsertSpreadsheetInDb( connector: ConnectorResource, internalId: string, - file: microsoftgraph.DriveItem + file: microsoftgraph.DriveItem, + parentInternalId: string ) { return MicrosoftNodeResource.upsert({ internalId, @@ -34,13 +35,15 @@ async function upsertSpreadsheetInDb( mimeType: "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", lastUpsertedTs: new Date(), + parentInternalId, }); } async function upsertWorksheetInDb( connector: ConnectorResource, internalId: string, - worksheet: microsoftgraph.WorkbookWorksheet + worksheet: microsoftgraph.WorkbookWorksheet, + parentInternalId: string ) { return MicrosoftNodeResource.upsert({ internalId, @@ -50,6 +53,7 @@ async function upsertWorksheetInDb( name: worksheet.name ?? "", mimeType: "text/csv", lastUpsertedTs: new Date(), + parentInternalId, }); } @@ -95,7 +99,8 @@ async function processSheet( connector: ConnectorResource, spreadsheet: microsoftgraph.DriveItem, internalId: string, - worksheet: microsoftgraph.WorkbookWorksheet + worksheet: microsoftgraph.WorkbookWorksheet, + spreadsheetId: string ): Promise { if (!worksheet.id) { return false; @@ -144,7 +149,7 @@ async function processSheet( loggerArgs ); - await upsertWorksheetInDb(connector, internalId, worksheet); + await upsertWorksheetInDb(connector, internalId, worksheet, spreadsheetId); return true; } @@ -160,9 +165,11 @@ async function processSheet( export async function syncSpreadSheet({ connectorId, file, + parentInternalId, }: { connectorId: number; file: microsoftgraph.DriveItem; + parentInternalId: string; }): Promise< | { isSupported: false; @@ -199,7 +206,12 @@ export async function syncSpreadSheet({ getWorksheets(client, documentId, nextLink) ); - const spreadsheet = await upsertSpreadsheetInDb(connector, documentId, file); + const spreadsheet = await upsertSpreadsheetInDb( + connector, + documentId, + file, + parentInternalId + ); // List synced sheets. const syncedWorksheets = await spreadsheet.fetchChildren(); @@ -213,7 +225,8 @@ export async function syncSpreadSheet({ connector, file, internalWorkSheetId, - worksheet + worksheet, + documentId ); if (isImported) { successfulSheetIdImports.push(internalWorkSheetId); diff --git a/connectors/src/resources/microsoft_resource.ts b/connectors/src/resources/microsoft_resource.ts index 3da8cd00edd9..9f1bccbc90a3 100644 --- a/connectors/src/resources/microsoft_resource.ts +++ b/connectors/src/resources/microsoft_resource.ts @@ -1,8 +1,12 @@ import type { ModelId, Result } from "@dust-tt/types"; import { Ok } from "@dust-tt/types"; import type { Attributes, ModelStatic, Transaction } from "sequelize"; +import { Op } from "sequelize"; -import type { MicrosoftNode } from "@connectors/connectors/microsoft/lib/types"; +import type { + MicrosoftNode, + MicrosoftNodeType, +} from "@connectors/connectors/microsoft/lib/types"; import { concurrentExecutor } from "@connectors/lib/async_utils"; import { MicrosoftConfigurationModel, @@ -280,6 +284,15 @@ export class MicrosoftNodeResource extends BaseResource { return resource; } + static async fetchNodesWithoutParents() { + const blobs = await this.model.findAll({ + where: { + parentInternalId: null, + }, + }); + return blobs.map((blob) => new this(this.model, blob.get())); + } + static async fetchByInternalIds(connectorId: ModelId, internalIds: string[]) { const blobs = await this.model.findAll({ where: { @@ -291,13 +304,24 @@ export class MicrosoftNodeResource extends BaseResource { return blobs.map((blob) => new this(this.model, blob.get())); } - async fetchChildren() { + async fetchChildren(nodeTypes?: MicrosoftNodeType[]) { + const whereClause: { + connectorId: number; + parentInternalId: string; + nodeType?: { [Op.in]: MicrosoftNodeType | MicrosoftNodeType[] }; + } = { + connectorId: this.connectorId, + parentInternalId: this.internalId, + }; + + if (nodeTypes) { + whereClause.nodeType = { [Op.in]: nodeTypes }; + } + const blobs = await this.model.findAll({ - where: { - connectorId: this.connectorId, - parentInternalId: this.internalId, - }, + where: whereClause, }); + if (!blobs) { return []; }