Skip to content

Commit

Permalink
Put identical page path with different query string parameters as sib… (
Browse files Browse the repository at this point in the history
#3285)

* Put identical page path with different query string parameters as siblings

* Replacing ? by /
  • Loading branch information
lasryaric authored Jan 18, 2024
1 parent 26bfd42 commit b458cd5
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 10 deletions.
8 changes: 2 additions & 6 deletions connectors/src/connectors/webcrawler/index.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import type { ConnectorResource, ModelId } from "@dust-tt/types";

import {
getDisplayNameForPage,
normalizeFolderUrl,
stableIdForUrl,
} from "@connectors/connectors/webcrawler/lib/utils";
Expand Down Expand Up @@ -152,7 +153,6 @@ export async function retrieveWebcrawlerConnectorPermissions({
})
.concat(
pages.map((page): ConnectorResource => {
const parsedUrl = new URL(page.url);
const isFileAndFolder = excludedFoldersSet.has(
normalizeFolderUrl(page.url)
);
Expand All @@ -170,11 +170,7 @@ export async function retrieveWebcrawlerConnectorPermissions({
ressourceType: "folder",
})
: null,
title:
parsedUrl.pathname
.split("/")
.filter((x) => x)
.pop() || parsedUrl.origin,
title: getDisplayNameForPage(page.url),
sourceUrl: page.url,
expandable: isFileAndFolder ? true : false,
permission: "read",
Expand Down
34 changes: 30 additions & 4 deletions connectors/src/connectors/webcrawler/lib/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@ export function getAllFoldersForUrl(url: string) {
// eg: https://example.com/foo/bar -> https://example.com/foo
// eg: https://example.com/foo -> https://example.com/
export function getFolderForUrl(url: string) {
const parsed = new URL(url);
const normalized = normalizeFolderUrl(url);
const parsed = new URL(normalized);
const urlParts = parsed.pathname.split("/").filter((part) => part.length > 0);
if (parsed.pathname === "/") {
return null;
Expand All @@ -66,12 +67,37 @@ export function isTopFolder(url: string) {
/**
 * Normalizes a url into a canonical, folder-like form.
 *
 * - Empty path parts (eg: `//`) and trailing slashes are removed.
 * - A query string, when present, is appended as one last path part with its
 *   leading `?` dropped, so that identical page paths with different query
 *   string parameters end up as siblings under that path.
 * - The fragment (`#...`) is not part of the result.
 *
 * eg: https://example.com/foo//bar/ -> https://example.com/foo/bar
 * eg: https://example.com/foo?page=2 -> https://example.com/foo/page=2
 * eg: https://example.com/?page=2 -> https://example.com/page=2
 *
 * @param url - Absolute url to normalize.
 * @returns The origin, a `/`, then the remaining parts joined with `/`.
 * @throws TypeError if `url` cannot be parsed by the `URL` constructor.
 */
export function normalizeFolderUrl(url: string): string {
  const parsed = new URL(url);
  const parts = parsed.pathname.split("/").filter((x) => x);

  // `search` is either "" or starts with "?". The query is pushed onto the
  // parts list (instead of being concatenated as "/" + query) so that a root
  // url with a query string does not produce an empty path part ("//").
  const query = parsed.search.slice(1);
  if (query.length > 0) {
    parts.push(query);
  }

  return parsed.origin + "/" + parts.join("/");
}

/**
 * Builds the display name of a page from its url.
 *
 * The name is the last non-empty part of the path followed by the query
 * string (including its leading `?`) when there is one. When both are empty
 * (eg: the root page without a query string) the url origin is used instead.
 *
 * eg: https://example.com/foo/bar -> bar
 * eg: https://example.com/foo?page=2 -> foo?page=2
 * eg: https://example.com/ -> https://example.com
 *
 * @param url - Absolute url of the page.
 * @returns The name to display for the page.
 * @throws TypeError if `url` cannot be parsed by the `URL` constructor.
 */
export function getDisplayNameForPage(url: string): string {
  const { origin, pathname, search } = new URL(url);

  // Empty parts are dropped first so a trailing slash or "//" never wins.
  const lastPathPart =
    pathname
      .split("/")
      .filter((part) => part)
      .pop() ?? "";

  // `search` is "" when the url has no query string, so plain concatenation
  // covers both cases.
  const displayName = lastPathPart + search;

  return displayName ? displayName : origin;
}

0 comments on commit b458cd5

Please sign in to comment.