From 5d47c6efe77814fa939b8271a4b5a9c2d2ec9392 Mon Sep 17 00:00:00 2001 From: Stanislas Polu Date: Thu, 11 Jan 2024 16:01:44 +0100 Subject: [PATCH] connectors: renderDocumentTitleAndContent (#3151) * WIP refactor * wip * clean-up renderDocumentForTitleAndContent * updatedAt and createdAt * Adapt slack bot.ts * comments * nit * nit * nit * s/lastUpdatedAt/updatedAt * review pr * rename * fix title for non threaded messages in slack --- .../connectors/github/temporal/activities.ts | 51 ++++++------ .../google_drive/temporal/activities.ts | 15 ++-- .../connectors/notion/temporal/activities.ts | 22 +++--- connectors/src/connectors/slack/bot.ts | 14 ++-- .../connectors/slack/temporal/activities.ts | 76 +++++++++++------- connectors/src/lib/data_sources.ts | 77 ++++++++++--------- 6 files changed, 144 insertions(+), 111 deletions(-) diff --git a/connectors/src/connectors/github/temporal/activities.ts b/connectors/src/connectors/github/temporal/activities.ts index ab5e532eeaad..eb38864a43be 100644 --- a/connectors/src/connectors/github/temporal/activities.ts +++ b/connectors/src/connectors/github/temporal/activities.ts @@ -20,6 +20,7 @@ import { } from "@connectors/connectors/github/lib/github_api"; import { deleteFromDataSource, + renderDocumentTitleAndContent, renderMarkdownSection, upsertToDatasource, } from "@connectors/lib/data_sources"; @@ -108,11 +109,12 @@ async function renderIssue( const issue = await getIssue(installationId, repoName, login, issueNumber); - const content = renderMarkdownSection( - `Issue #${issue.number} [${repoName}]: ${issue.title}\n`, - issue.body || "", - { flavor: "gfm" } - ); + const content = renderDocumentTitleAndContent({ + title: `Issue #${issue.number} [${repoName}]: ${issue.title}`, + createdAt: issue.createdAt, + updatedAt: issue.updatedAt, + content: renderMarkdownSection(issue.body ?? "", { flavor: "gfm" }), + }); let resultPage = 1; let lastCommentUpdateTime: Date | null = null; @@ -147,11 +149,11 @@ async function renderIssue( for (const comment of comments) { if (comment.body) { - const c = renderMarkdownSection( - `>> ${renderGithubUser(comment.creator)}:\n`, - comment.body, - { flavor: "gfm" } - ); + const c = { + prefix: `>> ${renderGithubUser(comment.creator)}:\n`, + content: null, + sections: [renderMarkdownSection(comment.body, { flavor: "gfm" })], + }; content.sections.push(c); } if ( @@ -280,11 +282,10 @@ async function renderDiscussion( discussionNumber ); - const content = renderMarkdownSection( - `Discussion #${discussion.number} [${repoName}]: ${discussion.title}\n`, - discussion.bodyText, - { flavor: "gfm" } - ); + const content = renderDocumentTitleAndContent({ + title: `Discussion #${discussion.number} [${repoName}]: ${discussion.title}`, + content: renderMarkdownSection(discussion.bodyText, { flavor: "gfm" }), + }); let nextCursor: string | null = null; @@ -311,9 +312,11 @@ async function renderDiscussion( prefix += "[ACCEPTED ANSWER] "; } prefix += `${comment.author?.login || "Unknown author"}:\n`; - const c = renderMarkdownSection(prefix, comment.bodyText, { - flavor: "gfm", - }); + const c = { + prefix, + content: null, + sections: [renderMarkdownSection(comment.bodyText, { flavor: "gfm" })], + }; content.sections.push(c); let nextChildCursor: string | null = null; @@ -333,11 +336,13 @@ async function renderDiscussion( ); for (const childComment of childComments) { - const cc = renderMarkdownSection( - `>> ${childComment.author?.login || "Unknown author"}:\n`, - childComment.bodyText, - { flavor: "gfm" } - ); + const cc = { + prefix: `>> ${childComment.author?.login || "Unknown author"}:\n`, + content: null, + sections: [ + renderMarkdownSection(comment.bodyText, { flavor: "gfm" }), + ], + }; c.sections.push(cc); } diff --git a/connectors/src/connectors/google_drive/temporal/activities.ts b/connectors/src/connectors/google_drive/temporal/activities.ts index a4cd51936df7..40098eae74a6 100644 --- a/connectors/src/connectors/google_drive/temporal/activities.ts +++ b/connectors/src/connectors/google_drive/temporal/activities.ts @@ -7,7 +7,7 @@ import PQueue from "p-queue"; import { deleteFromDataSource, MAX_DOCUMENT_TXT_LEN, - renderSectionForTitleAndContent, + renderDocumentTitleAndContent, upsertToDatasource, } from "@connectors/lib/data_sources"; import { HTTPError } from "@connectors/lib/error"; @@ -509,11 +509,14 @@ async function syncOneFile( return false; } - // Add the title of the file to the beginning of the document. - const content = renderSectionForTitleAndContent( - file.name, - documentContent || null - ); + const content = renderDocumentTitleAndContent({ + title: file.name, + updatedAt: file.updatedAtMs ? new Date(file.updatedAtMs) : undefined, + createdAt: file.createdAtMs ? new Date(file.createdAtMs) : undefined, + content: documentContent + ? { prefix: null, content: documentContent, sections: [] } + : null, + }); if (documentContent === undefined) { logger.error( diff --git a/connectors/src/connectors/notion/temporal/activities.ts b/connectors/src/connectors/notion/temporal/activities.ts index b8389319db06..9523986ec94c 100644 --- a/connectors/src/connectors/notion/temporal/activities.ts +++ b/connectors/src/connectors/notion/temporal/activities.ts @@ -41,7 +41,7 @@ import { import { deleteFromDataSource, MAX_DOCUMENT_TXT_LEN, - renderSectionForTitleAndContent, + renderDocumentTitleAndContent, upsertToDatasource, } from "@connectors/lib/data_sources"; import { ExternalOauthTokenError } from "@connectors/lib/error"; @@ -1734,8 +1734,8 @@ export async function renderAndUpsertPageFromCache({ skipReason = "body_too_large"; } - const createdTime = new Date(pageCacheEntry.createdTime).getTime(); - const updatedTime = new Date(pageCacheEntry.lastEditedTime).getTime(); + const createdAt = new Date(pageCacheEntry.createdTime); + const updatedAt = new Date(pageCacheEntry.lastEditedTime); if (!pageHasBody) { localLogger.info( @@ -1753,10 +1753,12 @@ export async function renderAndUpsertPageFromCache({ runTimestamp.toString() ); - const content = renderSectionForTitleAndContent( - title || null, - renderedPage - ); + const content = renderDocumentTitleAndContent({ + title: title ?? null, + createdAt: createdAt, + updatedAt: updatedAt, + content: { prefix: null, content: renderedPage, sections: [] }, + }); localLogger.info( "notionRenderAndUpsertPageFromCache: Upserting to Data Source." @@ -1770,13 +1772,13 @@ export async function renderAndUpsertPageFromCache({ documentId, documentContent: content, documentUrl: pageCacheEntry.url, - timestampMs: updatedTime, + timestampMs: updatedAt.getTime(), tags: getTagsForPage({ title, author, lastEditor, - createdTime, - updatedTime, + createdTime: createdAt.getTime(), + updatedTime: updatedAt.getTime(), }), parents, retries: 3, diff --git a/connectors/src/connectors/slack/bot.ts b/connectors/src/connectors/slack/bot.ts index 983fbba39fc8..b0f36459974a 100644 --- a/connectors/src/connectors/slack/bot.ts +++ b/connectors/src/connectors/slack/bot.ts @@ -756,13 +756,13 @@ async function makeContentFragment( return new Err(new Error("Could not retrieve channel name")); } - const content = await formatMessagesForUpsert( - channelId, - channel.channel.name, - allMessages, - connector.id, - slackClient - ); + const content = await formatMessagesForUpsert({ + channelName: channel.channel.name, + messages: allMessages, + isThread: true, + connectorId: connector.id, + slackClient, + }); let url: string | null = null; if (allMessages[0]?.ts) { diff --git a/connectors/src/connectors/slack/temporal/activities.ts b/connectors/src/connectors/slack/temporal/activities.ts index 119334ea1fd8..dbc3a759d75d 100644 --- a/connectors/src/connectors/slack/temporal/activities.ts +++ b/connectors/src/connectors/slack/temporal/activities.ts @@ -30,6 +30,7 @@ import { dataSourceConfigFromConnector } from "@connectors/lib/api/data_source_c import { cacheGet, cacheSet } from "@connectors/lib/cache"; import { deleteFromDataSource, + renderDocumentTitleAndContent, upsertToDatasource, } from "@connectors/lib/data_sources"; import { WorkflowError } from "@connectors/lib/error"; @@ -422,13 +423,13 @@ export async function syncNonThreaded( } messages.reverse(); - const content = await formatMessagesForUpsert( - channelId, + const content = await formatMessagesForUpsert({ channelName, messages, + isThread: false, connectorId, - client - ); + slackClient: client, + }); const startDate = new Date(startTsMs); const endDate = new Date(endTsMs); @@ -598,13 +599,13 @@ export async function syncThread( return; } - const content = await formatMessagesForUpsert( - channelId, + const content = await formatMessagesForUpsert({ channelName, - allMessages, + messages: allMessages, + isThread: true, connectorId, - slackClient - ); + slackClient, + }); const firstMessage = allMessages[0]; let sourceUrl: string | undefined = undefined; @@ -670,13 +671,19 @@ async function processMessageForMentions( return message; } -export async function formatMessagesForUpsert( - channelId: string, - channelName: string, - messages: MessageElement[], - connectorId: ModelId, - slackClient: WebClient -): Promise { +export async function formatMessagesForUpsert({ + channelName, + messages, + isThread, + connectorId, + slackClient, +}: { + channelName: string; + messages: MessageElement[]; + isThread: boolean; + connectorId: ModelId; + slackClient: WebClient; +}): Promise { const data = await Promise.all( messages.map(async (message) => { const text = await processMessageForMentions( @@ -694,6 +701,7 @@ export async function formatMessagesForUpsert( const messageDateStr = formatDateForUpsert(messageDate); return { + messageDate, dateStr: messageDateStr, userName, text: text, @@ -703,22 +711,32 @@ export async function formatMessagesForUpsert( }) ); - const first = data[0]; - if (!first) { + const first = data.at(0); + const last = data.at(-1); + if (!last || !first) { throw new Error("Cannot format empty list of messages"); } - return { - prefix: `Thread in #${channelName} [${first.dateStr}]: ${ - first.text.replace(/\s+/g, " ").trim().substring(0, 128) + "..." - }\n`, - content: null, - sections: data.map((d) => ({ - prefix: `>> @${d.userName} [${d.dateStr}]:\n`, - content: d.text + "\n", - sections: [], - })), - }; + const title = isThread + ? `Thread in #${channelName}: ${ + first.text.replace(/\s+/g, " ").trim().substring(0, 128) + "..." + }` + : `Messages in #${channelName}`; + + return renderDocumentTitleAndContent({ + title, + createdAt: first.messageDate, + updatedAt: last.messageDate, + content: { + prefix: null, + content: null, + sections: data.map((d) => ({ + prefix: `>> @${d.userName} [${d.dateStr}]:\n`, + content: d.text + "\n", + sections: [], + })), + }, + }); } export async function fetchUsers(connectorId: ModelId) { diff --git a/connectors/src/lib/data_sources.ts b/connectors/src/lib/data_sources.ts index 2bbdd211043b..3c07a3719ef9 100644 --- a/connectors/src/lib/data_sources.ts +++ b/connectors/src/lib/data_sources.ts @@ -229,40 +229,12 @@ async function _updateDocumentParentsField({ } } -const MAX_SECTION_PREFIX_LENGTH = 128; - -export function renderSectionForTitleAndContent( - title: string | null, - content: string | null -): CoreAPIDataSourceDocumentSection { - if (!title || !title.trim()) { - return { - prefix: null, - content, - sections: [], - }; - } - - if (title.length > MAX_SECTION_PREFIX_LENGTH) { - return { - prefix: `$title: ${title.substring(0, MAX_SECTION_PREFIX_LENGTH)}...\n\n`, - content: `... ${title.substring(MAX_SECTION_PREFIX_LENGTH)}\n\n`, - sections: [ - { - prefix: null, - content, - sections: [], - }, - ], - }; - } - return { - prefix: `$title: ${title}\n\n`, - content, - sections: [], - }; -} +const MAX_SECTION_PREFIX_LENGTH = 192; +// The role of this function is to create a prefix from an arbitrary long string. The prefix +// provided will not be augmented with `\n`, so it should include appropriate carriage return. If +// the prefix is too long (>256 chars), it will be truncated. The remained will be returned as +// content of the resulting section. export function renderPrefixSection( prefix: string | null ): CoreAPIDataSourceDocumentSection { @@ -276,7 +248,7 @@ export function renderPrefixSection( if (prefix.length > MAX_SECTION_PREFIX_LENGTH) { return { prefix: prefix.substring(0, MAX_SECTION_PREFIX_LENGTH) + "...\n", - content: `... ${prefix.substring(MAX_SECTION_PREFIX_LENGTH)}\n`, + content: `... ${prefix.substring(MAX_SECTION_PREFIX_LENGTH)}`, sections: [], }; } @@ -291,7 +263,6 @@ export function renderPrefixSection( /// The top-level node is always with prefix and content null and can be edited to add a prefix or /// content. export function renderMarkdownSection( - prefix: string | null, markdown: string, { flavor }: { flavor?: "gfm" } = {} ): CoreAPIDataSourceDocumentSection { @@ -303,7 +274,7 @@ export function renderMarkdownSection( mdastExtensions: mdastExtensions, }); - const top = renderPrefixSection(prefix); + const top = { prefix: null, content: null, sections: [] }; let path: { depth: number; content: CoreAPIDataSourceDocumentSection }[] = [ { depth: 0, content: top }, @@ -342,3 +313,37 @@ export function renderMarkdownSection( return top; } + +// Will render the document based on title, optional createdAt and updatedAt and a structured +// content. The title, createdAt and updatedAt will be presented in a standardized way across +// connectors. The title should not include any `\n`. +// If the title is too long it will be truncated and the remainder of the title will be set as +// content of the top-level section. +export function renderDocumentTitleAndContent({ + title, + createdAt, + updatedAt, + content, +}: { + title: string | null; + createdAt?: Date; + updatedAt?: Date; + content: CoreAPIDataSourceDocumentSection | null; +}): CoreAPIDataSourceDocumentSection { + if (title && title.trim()) { + title = `$title: ${title}\n`; + } else { + title = null; + } + const c = renderPrefixSection(title); + if (createdAt) { + c.prefix += `$createdAt: ${createdAt.toISOString()}\n`; + } + if (updatedAt) { + c.prefix += `$updatedAt: ${updatedAt.toISOString()}\n`; + } + if (content) { + c.sections.push(content); + } + return c; +}