Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

enh(notion) change how we process dbs #4173

Merged
merged 11 commits into from
Mar 6, 2024
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions connectors/src/connectors/notion/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -410,8 +410,8 @@ export async function retrieveNotionConnectorPermissions({
sourceUrl: db.notionUrl || null,
expandable: true,
permission: "read",
dustDocumentId: null,
lastUpdatedAt: null,
dustDocumentId: `notion-database-${db.notionDatabaseId}`,
lastUpdatedAt: db.structuredDataUpsertedTs?.getTime() ?? null,
};
};

Expand Down
4 changes: 3 additions & 1 deletion connectors/src/connectors/notion/lib/notion_api.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1043,7 +1043,9 @@ export function parsePageBlock(block: BlockObjectResponse): ParsedNotionBlock {
case "child_database":
return {
...commonFields,
text: null,
text: `Child Database: ${
block.child_database.title ?? "Untitled Database"
}`,
childDatabaseTitle: block.child_database.title,
};

Expand Down
127 changes: 66 additions & 61 deletions connectors/src/connectors/notion/temporal/activities.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1245,7 +1245,6 @@ export async function cacheBlockChildren({
}): Promise<{
nextCursor: string | null;
blocksWithChildren: string[];
childDatabases: string[];
blocksCount: number;
}> {
const connector = await ConnectorResource.fetchById(connectorId);
Expand All @@ -1269,7 +1268,6 @@ export async function cacheBlockChildren({
nextCursor: null,
blocksWithChildren: [],
blocksCount: 0,
childDatabases: [],
};
}

Expand Down Expand Up @@ -1300,7 +1298,6 @@ export async function cacheBlockChildren({
nextCursor: null,
blocksWithChildren: [],
blocksCount: 0,
childDatabases: [],
};
}

Expand Down Expand Up @@ -1342,16 +1339,11 @@ export async function cacheBlockChildren({
.filter((b) => b.hasChildren)
.map((b) => b.id);

const childDatabases = parsedBlocks
.filter((b) => b.type === "child_database")
.map((b) => b.id);

localLogger.info(
{
blocksWithChildrenCount: blocksWithChildren.length,
childDatabasesCount: childDatabases.length,
},
"Found blocks with children and child databases."
"Found blocks with children."
);

localLogger.info(
Expand All @@ -1376,7 +1368,6 @@ export async function cacheBlockChildren({
);

return {
childDatabases,
blocksWithChildren,
blocksCount: parsedBlocks.length,
nextCursor: resultPage.next_cursor,
Expand Down Expand Up @@ -1730,46 +1721,6 @@ export async function renderAndUpsertPageFromCache({
];
}

const childDatabaseTitleById = blockCacheEntries
.filter((b) => b.blockType === "child_database")
.map((b) => ({
id: b.notionBlockId,
title:
// eslint-disable-next-line @typescript-eslint/no-non-null-assertion
b.childDatabaseTitle!,
}))
.reduce((acc, { id, title }) => {
acc[id] = title;
return acc;
}, {} as Record<string, string>);

localLogger.info(
"notionRenderAndUpsertPageFromCache: Retrieving child database pages from cache."
);
const childDbPagesCacheEntries = await NotionConnectorPageCacheEntry.findAll({
where: {
parentId: Object.keys(blocksByParentId),
connectorId: connector.id,
workflowId: topLevelWorkflowId,
},
});
const childDatabases: Record<string, NotionConnectorPageCacheEntry[]> = {};
for (const childDbPageCacheEntry of childDbPagesCacheEntries) {
childDatabases[childDbPageCacheEntry.parentId] = [
...(childDatabases[childDbPageCacheEntry.parentId] ?? []),
childDbPageCacheEntry,
];
}
const renderedChildDatabases: Record<string, string> = {};
for (const [databaseId, pages] of Object.entries(childDatabases)) {
renderedChildDatabases[databaseId] = await renderDatabaseFromPages({
databaseTitle: childDatabaseTitleById[databaseId] ?? null,
pagesProperties: pages.map(
(p) => JSON.parse(p.pagePropertiesText) as PageObjectProperties
),
});
}

localLogger.info("notionRenderAndUpsertPageFromCache: Rendering page.");
const renderedPageSection = await renderPageSection({
dsConfig,
Expand All @@ -1782,11 +1733,16 @@ export async function renderAndUpsertPageFromCache({

// Adding notion properties to the page rendering
// We skip the title as it is added separately as prefix to the top-level document section.
let maxPropertyLength = 0;
const parsedProperties = parsePageProperties(
JSON.parse(pageCacheEntry.pagePropertiesText) as PageObjectProperties
);
for (const p of parsedProperties.filter((p) => p.key !== "title" && p.text)) {
for (const p of parsedProperties.filter((p) => p.key !== "title")) {
if (!p.text) {
continue;
}
const propertyContent = `$${p.key}: ${p.text}\n`;
maxPropertyLength = Math.max(maxPropertyLength, p.text.length);
renderedPageSection.sections.unshift({
prefix: null,
content: propertyContent,
Expand Down Expand Up @@ -1893,9 +1849,10 @@ export async function renderAndUpsertPageFromCache({
const createdAt = new Date(pageCacheEntry.createdTime);
const updatedAt = new Date(pageCacheEntry.lastEditedTime);

if (documentLength === 0) {
if (documentLength === 0 && maxPropertyLength < 256) {
localLogger.info(
"notionRenderAndUpsertPageFromCache: Not upserting page without body."
{ maxPropertyLength },
"notionRenderAndUpsertPageFromCache: Not upserting page without body and free text properties."
);
} else if (!skipReason) {
upsertTs = new Date().getTime();
Expand Down Expand Up @@ -1937,7 +1894,6 @@ export async function renderAndUpsertPageFromCache({
updatedTime: updatedAt.getTime(),
parsedProperties,
}),

parents,
loggerArgs,
upsertContext: {
Expand Down Expand Up @@ -2307,11 +2263,13 @@ export async function upsertDatabaseStructuredDataFromCache({
connectorId,
topLevelWorkflowId,
loggerArgs,
runTimestamp,
}: {
databaseId: string;
connectorId: number;
topLevelWorkflowId: string;
loggerArgs: Record<string, string | number>;
runTimestamp: number;
}): Promise<void> {
const connector = await ConnectorResource.fetchById(connectorId);
if (!connector) {
Expand Down Expand Up @@ -2350,21 +2308,28 @@ export async function upsertDatabaseStructuredDataFromCache({
return;
}

const pagesProperties = pageCacheEntries.map(
(p) => JSON.parse(p.pagePropertiesText) as PageObjectProperties
);

const csv = await renderDatabaseFromPages({
databaseTitle: null,
pagesProperties: pageCacheEntries.map(
(p) => JSON.parse(p.pagePropertiesText) as PageObjectProperties
),
pagesProperties,
dustIdColumn: pageCacheEntries.map((p) => `notion-${p.notionPageId}`),
cellSeparator: ",",
rowBoundary: "",
});

const { tableId, tableName, tableDescription } =
const { databaseName, tableId, tableName, tableDescription } =
getTableInfoFromDatabase(dbModel);

const dataSourceConfig = dataSourceConfigFromConnector(connector);

const upsertAt = new Date();

localLogger.info("Upserting Notion Database as Table.");
await upsertTableFromCsv({
dataSourceConfig: dataSourceConfigFromConnector(connector),
dataSourceConfig,
tableId,
tableName,
tableDescription,
Expand All @@ -2373,10 +2338,50 @@ export async function upsertDatabaseStructuredDataFromCache({
// We overwrite the whole table since we just fetched all child pages.
truncate: true,
});
await dbModel.update({ structuredDataUpsertedTs: new Date() });
// Same as above, but without the `dustId` column
const csvForDocument = await renderDatabaseFromPages({
databaseTitle: null,
pagesProperties,
cellSeparator: ",",
rowBoundary: "",
});
const csvHeader = csvForDocument.split("\n")[0];
const csvRows = csvForDocument.split("\n").slice(1).join("\n");
if (csvHeader && csvRows.length) {
const parents = await getParents(
connector.id,
databaseId,
new Set<string>(),
runTimestamp.toString()
);
localLogger.info("Upserting Notion Database as Document.");
await upsertToDatasource({
dataSourceConfig,
documentId: `notion-database-${databaseId}`,
documentContent: {
prefix: csvHeader,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we want the DB name in the prefix, no?
Also, if csvHeader is larger than half the chunk size, the upsert will fail.

You can likely use the functions that have been built by @philipperolet that are meant to do that but maybe the numbers are a bit tight for this use case.

In any case, we want to tokenize and split at half the chunk size, i.e. 256 tokens.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is outdated, right?

content: csvRows,
sections: [],
},
documentUrl: dbModel.notionUrl ?? undefined,
// TODO: see if we actually want to use the Notion last edited time of the database
// we currently don't have it because we don't fetch the DB object from notion.
timestampMs: upsertAt.getTime(),
tags: [`title:${databaseName}`, "is_database:true"],
parents: parents,
loggerArgs,
upsertContext: {
sync_type: "batch",
},
async: true,
});
}

await dbModel.update({ structuredDataUpsertedTs: upsertAt });
}

function getTableInfoFromDatabase(database: NotionDatabase): {
databaseName: string;
tableId: string;
tableName: string;
tableDescription: string;
Expand All @@ -2390,5 +2395,5 @@ function getTableInfoFromDatabase(database: NotionDatabase): {
);

const tableDescription = `Structured data from Notion Database ${tableName}`;
return { tableId, tableName, tableDescription };
return { databaseName: name, tableId, tableName, tableDescription };
}
2 changes: 1 addition & 1 deletion connectors/src/connectors/notion/temporal/config.ts
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
export const WORKFLOW_VERSION = 33;
export const WORKFLOW_VERSION = 34;
export const QUEUE_NAME = `notion-queue-v${WORKFLOW_VERSION}`;
55 changes: 3 additions & 52 deletions connectors/src/connectors/notion/temporal/workflows.ts
Original file line number Diff line number Diff line change
Expand Up @@ -434,7 +434,7 @@ export async function upsertPageChildWorkflow({
let cursor: string | null = null;
let blockIndexInPage = 0;
do {
const { nextCursor, blocksWithChildren, childDatabases, blocksCount } =
const { nextCursor, blocksWithChildren, blocksCount } =
await cacheBlockChildren({
connectorId,
pageId,
Expand All @@ -458,17 +458,6 @@ export async function upsertPageChildWorkflow({
memo: workflowInfo().memo,
});
}
for (const databaseId of childDatabases) {
await executeChild(processChildDatabaseChildWorkflow, {
workflowId: `${topLevelWorkflowId}-page-${pageId}-child-database-${databaseId}`,
searchAttributes: {
connectorId: [connectorId],
},
args: [{ connectorId, databaseId, topLevelWorkflowId }],
parentClosePolicy: ParentClosePolicy.PARENT_CLOSE_POLICY_TERMINATE,
memo: workflowInfo().memo,
});
}
} while (cursor);

await renderAndUpsertPageFromCache({
Expand Down Expand Up @@ -504,7 +493,7 @@ export async function notionProcessBlockChildrenChildWorkflow({
let blockIndexInParent = 0;

do {
const { nextCursor, blocksWithChildren, childDatabases, blocksCount } =
const { nextCursor, blocksWithChildren, blocksCount } =
await cacheBlockChildren({
connectorId,
pageId,
Expand All @@ -528,45 +517,6 @@ export async function notionProcessBlockChildrenChildWorkflow({
memo: workflowInfo().memo,
});
}
for (const databaseId of childDatabases) {
await executeChild(processChildDatabaseChildWorkflow, {
workflowId: `${topLevelWorkflowId}-page-${pageId}-child-database-${databaseId}`,
searchAttributes: {
connectorId: [connectorId],
},
args: [{ connectorId, databaseId, topLevelWorkflowId }],
parentClosePolicy: ParentClosePolicy.PARENT_CLOSE_POLICY_TERMINATE,
memo: workflowInfo().memo,
});
}
} while (cursor);
}

/**
 * Child workflow that pages through the child pages of one Notion database.
 *
 * Calls `fetchDatabaseChildPages` repeatedly, feeding each returned
 * `nextCursor` into the next call, and stops once no cursor is returned.
 * Only `nextCursor` is read from each result; the workflow itself returns
 * nothing.
 *
 * @param connectorId - Connector id; forwarded to `fetchDatabaseChildPages`
 *   and used as the only entry of the logger arguments.
 * @param databaseId - Id of the database whose child pages are fetched.
 * @param topLevelWorkflowId - Forwarded unchanged to
 *   `fetchDatabaseChildPages`. NOTE(review): presumably scopes the cache
 *   entries written for this run — confirm against the callee.
 */
export async function processChildDatabaseChildWorkflow({
connectorId,
databaseId,
topLevelWorkflowId,
}: {
connectorId: ModelId;
databaseId: string;
topLevelWorkflowId: string;
}): Promise<void> {
const loggerArgs = {
connectorId,
};

// `null` requests the first page; afterwards the cursor is whatever the
// previous call returned.
let cursor: string | null = null;
do {
const { nextCursor } = await fetchDatabaseChildPages({
connectorId,
databaseId,
cursor,
loggerArgs,
topLevelWorkflowId,
// Both flags are always enabled on this code path.
storeInCache: true,
returnUpToDatePageIdsForExistingDatabase: true,
});
cursor = nextCursor;
// A falsy `nextCursor` ends the pagination loop.
} while (cursor);
}

Expand Down Expand Up @@ -753,6 +703,7 @@ async function upsertDatabase({
connectorId,
topLevelWorkflowId,
loggerArgs,
runTimestamp,
})
);

Expand Down
Loading