diff --git a/connectors/src/connectors/webcrawler/index.ts b/connectors/src/connectors/webcrawler/index.ts index a9d15406d3d3..8c39915d340e 100644 --- a/connectors/src/connectors/webcrawler/index.ts +++ b/connectors/src/connectors/webcrawler/index.ts @@ -1,6 +1,7 @@ import type { ConnectorResource, ModelId } from "@dust-tt/types"; import { + getDisplayNameForPage, normalizeFolderUrl, stableIdForUrl, } from "@connectors/connectors/webcrawler/lib/utils"; @@ -152,7 +153,6 @@ export async function retrieveWebcrawlerConnectorPermissions({ }) .concat( pages.map((page): ConnectorResource => { - const parsedUrl = new URL(page.url); const isFileAndFolder = excludedFoldersSet.has( normalizeFolderUrl(page.url) ); @@ -170,11 +170,7 @@ export async function retrieveWebcrawlerConnectorPermissions({ ressourceType: "folder", }) : null, - title: - parsedUrl.pathname - .split("/") - .filter((x) => x) - .pop() || parsedUrl.origin, + title: getDisplayNameForPage(page.url), sourceUrl: page.url, expandable: isFileAndFolder ? 
// Normalizes a url into a canonical "folder-like" form:
//  - trailing slashes and empty path parts (eg: //) are removed,
//  - a non-empty query string is appended as one extra path part, with its
//    leading "?" dropped (eg: https://example.com/foo?a=b -> https://example.com/foo/a=b),
//  - the fragment (#...) is not part of the result.
// The result is always `origin + "/" + parts.join("/")`, so a url without any
// path or query normalizes to `origin + "/"`.
// Throws (from the URL constructor) if `url` is not a valid absolute url.
export function normalizeFolderUrl(url: string): string {
  const parsed = new URL(url);
  const parts = parsed.pathname.split("/").filter((part) => part);

  if (parsed.search.length > 0) {
    // Treat the query as one more path part instead of concatenating "/" + query
    // onto the string: when the path is "/", the string already ends with "/"
    // and concatenation would produce "origin//query", which is not stable
    // under a second normalization.
    parts.push(parsed.search.slice(1));
  }

  return parsed.origin + "/" + parts.join("/");
}

// Returns a short, human readable name for a crawled page:
//  - the last non-empty path part, followed by the query string (including
//    its leading "?") when there is one,
//  - or the url origin when the url has neither a path part nor a query.
// Throws (from the URL constructor) if `url` is not a valid absolute url.
export function getDisplayNameForPage(url: string): string {
  const parsed = new URL(url);
  const lastPart =
    parsed.pathname
      .split("/")
      .filter((part) => part)
      .pop() ?? "";

  // `parsed.search` is "" when there is no query, so plain concatenation is safe.
  const displayName = lastPart + parsed.search;

  // `||` (not `??`) is intentional: an empty string must fall back to the origin.
  return displayName || parsed.origin;
}