Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[front] dev: dustapps sync script #6733

Merged
merged 16 commits into from
Aug 29, 2024
9 changes: 9 additions & 0 deletions .husky/post-merge
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/bin/sh
# Git post-merge hook: when the checked-out branch is `main`, run the
# dust-apps sync script.

# --quiet makes git exit silently when HEAD is detached (not a symbolic ref);
# current_branch is then empty and the comparison below simply fails.
current_branch=$(git symbolic-ref --quiet --short HEAD)
if [ "$current_branch" = "main" ]
then
    ./front/admin/copy_apps.sh
fi

# Exit 0 regardless of what the sync script returned.
exit 0
9 changes: 9 additions & 0 deletions .husky/pre-rebase
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/bin/bash
# Git pre-rebase hook: when the checked-out branch is `main`, run the
# dust-apps sync script.

# --quiet makes git exit silently when HEAD is detached (not a symbolic ref);
# current_branch is then empty and the comparison below simply fails.
current_branch=$(git symbolic-ref --quiet --short HEAD)
if [ "$current_branch" = "main" ]
then
    ./front/admin/copy_apps.sh
fi

# Exit 0 regardless of what the sync script returned, so the rebase is never
# blocked by this hook.
exit 0
17 changes: 17 additions & 0 deletions front/admin/cli.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import type { DustRegistryActionName } from "@dust-tt/types";
import {
CLAUDE_3_OPUS_DEFAULT_MODEL_CONFIG,
ConnectorsAPI,
DustProdActionRegistry,
removeNulls,
SUPPORTED_MODEL_CONFIGS,
} from "@dust-tt/types";
Expand Down Expand Up @@ -560,6 +562,19 @@ const transcripts = async (command: string, args: parseArgs.ParsedArgs) => {
}
};

/**
 * Handles the `registry` CLI object.
 *
 * `dump` prints `DustProdActionRegistry` as a single line of JSON on stdout;
 * any other command prints a short usage hint instead.
 *
 * @param command - Sub-command to run.
 */
const registry = async (command: string) => {
  if (command !== "dump") {
    console.log(`Unknown registry command: ${command}`);
    console.log("Possible values: `dump`");
    return;
  }

  console.log(JSON.stringify(DustProdActionRegistry));
};

const main = async () => {
const argv = parseArgs(process.argv.slice(2));

Expand Down Expand Up @@ -589,6 +604,8 @@ const main = async () => {
return conversation(command, argv);
case "transcripts":
return transcripts(command, argv);
case "registry":
return registry(command);
default:
console.log(
"Unknown object type, possible values: `workspace`, `user`, `data-source`, `event-schema`, `conversation`, `transcripts`"
Expand Down
147 changes: 147 additions & 0 deletions front/admin/copy_apps.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
#!/bin/bash
# Directory containing this script. "$0" is quoted so that a checkout path
# containing whitespace is passed to dirname as a single argument.
DIR=$(dirname "$0")

# Joins the given column names with commas and wraps each name in
# backslash-escaped double quotes, e.g. `id createdAt` -> `\"id\",\"createdAt\"`.
# A name is any run of ASCII letters and underscores.
escaped_columns_list() {
    # $* stays unquoted so the shell re-splits and single-spaces the arguments.
    echo $* | sed -E -e 's/ /,/g' -e 's/([a-zA-Z_]+)/\\"\1\\"/g'
}
# Joins the given words with commas and wraps each word in single quotes that
# are each preceded by two literal backslashes, e.g. `foo bar` ->
# `\\'foo\\',\\'bar\\'`. A word is any run of ASCII letters and underscores.
#
# The single quotes inside the sed script are written as '\'' so that they
# reach sed as literal characters. Written bare, they closed the shell string
# and sed received `1` instead of the `\1` backreference.
#
# NOTE(review): nothing in this script calls this helper; the escaping depth
# is taken verbatim from the sed script as it was typed - confirm it against
# the first real caller.
function escaped_values_list {
    echo $* | sed -E 's/ /,/g' | sed -E 's/([a-zA-Z_]+)/\\\\'\''\1\\\\'\''/g'
}
# Joins the given column names with commas and double-quotes each one,
# e.g. `id createdAt` -> `"id","createdAt"`.
# A name is any run of ASCII letters and underscores.
columns_list() {
    # $* stays unquoted so the shell re-splits and single-spaces the arguments.
    echo $* | sed -E -e 's/ /,/g' -e 's/([a-zA-Z_]+)/"\1"/g'
}
# Builds the SET list of an UPDATE that copies columns from the `__copy`
# table, e.g. `name hash` -> `"name"=__copy."name","hash"=__copy."hash"`.
# A name is any run of ASCII letters and underscores.
updates_clause() {
    # $* stays unquoted so the shell re-splits and single-spaces the arguments.
    echo $* | sed -E -e 's/ /,/g' -e 's/([a-zA-Z_]+)/"\1"=__copy."\1"/g'
}
# Builds a SELECT list that reads the given columns from the `__copy` table,
# e.g. `id hash` -> `__copy."id",__copy."hash"`.
# A name is any run of ASCII letters and underscores.
copy_clause() {
    # $* stays unquoted so the shell re-splits and single-spaces the arguments.
    echo $* | sed -E -e 's/ /,/g' -e 's/([a-zA-Z_]+)/__copy."\1"/g'
}
# Dumps selected columns of a table, read through the prodbox pod, into a
# local file.
#
# Arguments:
#   $1  database prefix; the remote shell expands `$<prefix>_DATABASE_URI`
#   $2  table to read
#   $3  space-separated column names
#   $4  SQL predicate for the WHERE clause, already escaped for the nested
#       double quotes of the remote `bash -c "psql -c \"...\""` command
#
# Reads PRODBOX_POD_NAME and writes the COPY output to
# /tmp/dust-apps/<prefix>_<table>.csv.
function fetch {
    # `local` keeps these names from leaking into the rest of the script.
    local database_uri=${1}
    local table_name=${2}
    local cols_to_fetch=${3}
    local where_clause=${4}

    echo "Fetching ${table_name} from ${PRODBOX_POD_NAME}..."
    # No -i/-t: the command needs no stdin, and allocating a TTY while stdout
    # is redirected to a data file turns line endings into CRLF and merges the
    # remote stderr into the dump.
    # ${cols_to_fetch} is intentionally unquoted so it splits into one argument per column.
    kubectl exec "${PRODBOX_POD_NAME}" -- bash -c "psql \$${database_uri}_DATABASE_URI -c \"COPY (SELECT $(escaped_columns_list ${cols_to_fetch}) FROM ${table_name} WHERE ${where_clause}) TO STDOUT;\"" > "/tmp/dust-apps/${database_uri}_${table_name}.csv"
}

# Loads a file produced by `fetch` into a local table as an upsert: rows whose
# id already exists are updated, the others are inserted.
#
# Arguments:
#   $1  database prefix; the URI is read from the local `<prefix>_DATABASE_URI`
#   $2  target table
#   $3  space-separated columns present in the file (and inserted)
#   $4  space-separated columns overwritten on rows that already exist
#   $5  optional clause appended to the INSERT (e.g. `on conflict ... do nothing`)
#   $6  optional extra predicate appended to the INSERT's WHERE clause
#
# Reads /tmp/dust-apps/<prefix>_<table>.csv and uses a scratch table named
# `__copy`, which is dropped before and after the import.
# Returns 1 without touching the database when the URI variable is empty.
function import {
    # `local` keeps these names from leaking into the rest of the script.
    local database_uri=${1}
    local table_name=${2}
    local cols_to_import=${3}
    local cols_to_update=${4}
    local on_conflict=${5}
    local additional_where=${6}

    # Indirect expansion replaces `eval` for reading <prefix>_DATABASE_URI.
    local uri_var="${database_uri}_DATABASE_URI"
    local uri=${!uri_var}
    if [ -z "${uri}" ]
    then
        echo "${uri_var} is not set, cannot import ${table_name}." >&2
        return 1
    fi

    # Progress is printed on one line per table: every psql status line has its
    # newline stripped by `tr`.
    echo -n "Preparing ${table_name}... "
    psql "${uri}" -c "drop table if exists __copy" > /dev/null 2>&1
    # `limit 0` clones the column layout of the target table without any rows.
    psql "${uri}" -c "create table __copy as (select * from ${table_name} limit 0)" | tr -d '\n'
    echo -n "... Importing ${table_name}... "
    psql "${uri}" -c "COPY __copy ($(columns_list ${cols_to_import})) from stdin;" < "/tmp/dust-apps/${database_uri}_${table_name}.csv" | tr -d '\n'
    echo -n "... Updating existing ${table_name}... "
    psql "${uri}" -c "update ${table_name} set $(updates_clause $cols_to_update) from __copy where ${table_name}.id = __copy.id;" | tr -d '\n'
    echo -n "... Inserting new ${table_name}..."
    # The left join keeps only __copy rows with no matching id in the target table.
    psql "${uri}" -c "insert into ${table_name} ($(columns_list ${cols_to_import})) (select $(copy_clause ${cols_to_import}) from __copy left join ${table_name} using(id) where ${table_name} is null ${additional_where}) ${on_conflict};" | tr -d '\n'
    echo -n "... Cleaning up ${table_name}... "
    psql "${uri}" -c "drop table if exists __copy;"
}

# The sync is opt-in: without a target workspace id, print a hint and exit
# with status 0.
if [ -z "$DUST_APPS_SYNC_WORKSPACE_ID" ]
then
    echo "Please set DUST_APPS_SYNC_WORKSPACE_ID if you want to synchronize dust-apps."
    exit 0
fi

# Scratch directory for the registry dump and the fetched table files.
mkdir -p /tmp/dust-apps

# Later relative paths (./admin/cli.sh) are resolved from the parent of the
# script directory. Stop here if the directory cannot be entered, rather than
# running the rest of the script from the wrong place.
cd "${DIR}/.." || exit 1

# --cleanup: delete rows above fixed id thresholds from the local databases.
# Join rows are deleted before the points and datasets tables.
# NOTE(review): the thresholds presumably separate imported rows from locally
# created ones - confirm.
if [ "$1" == "--cleanup" ]
then
    psql "${FRONT_DATABASE_URI}" -c "delete from apps where id>1000;"
    psql "${CORE_DATABASE_URI}" -c "delete from specifications where id>10000;"
    psql "${CORE_DATABASE_URI}" -c "delete from datasets_joins where id>10000; delete from datasets_points where id>10000; delete from datasets where id>10000;"
fi

# Unless --force is given, compare the action registry with the local
# databases first and stop early when every registry app is already present.
if [[ "$1" != "--force" ]]
then
    # Dump the registry as JSON; anything written to stderr is discarded.
    ./admin/cli.sh registry dump > /tmp/dust-apps/specs 2> /dev/null

    # App ids listed in the registry, sorted, one per line.
    REGISTRY_APP_IDS=$(jq -r '[.[].app.appId] | sort_by(.) | join("\n")' /tmp/dust-apps/specs)
    # Registry app hashes, each shell-quoted by jq's @sh and joined with commas
    # so the result can be dropped into a SQL IN (...) list.
    IN_CLAUSE=$(jq -r '[.[].app.appHash] | map("\(. | @sh)") | join(",")' /tmp/dust-apps/specs)
    # Core projects that own a specification with one of those hashes, as a
    # comma-separated list of single-quoted values.
    PROJECTS=$(psql $CORE_DATABASE_URI -c "copy (select distinct(project) from specifications where hash in (${IN_CLAUSE})) to stdout" | sed "s/.*/'&'/" | paste -sd, -)
    # sIds of non-deleted apps in the target workspace backed by those projects,
    # joined with single spaces.
    LOCAL_APP_IDS=$(psql $FRONT_DATABASE_URI -c "copy (select distinct(\"sId\") from apps where \"dustAPIProjectId\" in (${PROJECTS}) and visibility!='deleted' and \"workspaceId\"=${DUST_APPS_SYNC_WORKSPACE_ID} order by \"sId\") to stdout" | paste -sd\  -)

    # Report every registry app that is absent from the local list. Padding
    # both sides with spaces makes the match whole-word.
    MISSING=false
    for app_id in $REGISTRY_APP_IDS
    do
        case " ${LOCAL_APP_IDS} " in
            *" $app_id "*)
                ;;
            *)
                echo "Missing app $app_id"
                MISSING=true
                ;;
        esac
    done

    # Nothing missing: remove the scratch directory and stop.
    if [[ "$MISSING" == "false" ]]
    then
        echo "All apps available, skipping."
        rm -R /tmp/dust-apps
        exit 0
    fi
fi

echo "Will copy apps into workspace ${DUST_APPS_SYNC_WORKSPACE_ID}..."
echo "You'll have to manually update front/lib/api/config.ts to use localhost:3000 instead of dust.tt,"
echo "and front/lib/development.ts / types/src/front/lib/actions/registry.ts to set your workspace sId in PRODUCTION_DUST_APPS_WORKSPACE_ID"
echo "Ensure you have valid env variables for DUST_MANAGED_ANTHROPIC_API_KEY, DUST_MANAGED_SERP_API_KEY and DUST_MANAGED_BROWSERLESS_API_KEY."
# From here on, stop at the first failing command.
set -e

echo "Fetching prodbox pod..."
# Keep only the first matching pod: several matches would otherwise expand to
# several arguments in later kubectl calls.
PRODBOX_POD_NAME=$(kubectl get pods | grep prodbox | cut -d ' ' -f1 | head -n 1)
if [ -z "${PRODBOX_POD_NAME}" ]
then
    echo "No prodbox pod found, cannot fetch apps." >&2
    exit 1
fi

# Id of the workspace the apps are read from.
# NOTE(review): presumably the production dust-apps workspace - confirm.
source_workspace_id=5069

# ---- front

fetch FRONT apps "id createdAt updatedAt sId name description visibility savedSpecification savedConfig savedRun dustAPIProjectId workspaceId" "\\\"workspaceId\\\"=${source_workspace_id}"
# Field 11 of the fetched apps is dustAPIProjectId; the ids select the core rows below.
project_ids=$(cut -f 11 /tmp/dust-apps/FRONT_apps.csv |paste -sd "," -)
fetch FRONT datasets "id createdAt updatedAt name description schema appId workspaceId" "\\\"workspaceId\\\"=${source_workspace_id}"


# ---- apps
# Drop the fetched workspaceId (field 12) and append the local workspace id instead.
cat /tmp/dust-apps/FRONT_apps.csv | cut -f1-11 | sed -E "s/^(.*)$/\1\t${DUST_APPS_SYNC_WORKSPACE_ID}/g" > /tmp/dust-apps/FRONT_apps_transformed.csv
mv /tmp/dust-apps/FRONT_apps_transformed.csv /tmp/dust-apps/FRONT_apps.csv
import FRONT apps "id createdAt updatedAt sId name description visibility savedSpecification savedConfig savedRun dustAPIProjectId workspaceId" "updatedAt name description visibility savedSpecification savedConfig savedRun dustAPIProjectId"

# ---- datasets
# Same rewrite for datasets: workspaceId is field 8 there.
cat /tmp/dust-apps/FRONT_datasets.csv | cut -f1-7 | sed -E "s/^(.*)$/\1\t${DUST_APPS_SYNC_WORKSPACE_ID}/g" > /tmp/dust-apps/FRONT_datasets_transformed.csv
mv /tmp/dust-apps/FRONT_datasets_transformed.csv /tmp/dust-apps/FRONT_datasets.csv
import FRONT datasets "id createdAt updatedAt name description schema appId workspaceId" "updatedAt name description schema"

# ---- core

fetch CORE projects "id" "\\\"id\\\" in (${project_ids})"
fetch CORE specifications "id project created hash specification" "\\\"project\\\" in (${project_ids})"
fetch CORE datasets "id project created dataset_id hash" "\\\"project\\\" in (${project_ids})"
# Field 1 of the core datasets is their id.
dataset_ids=$(cut -f 1 /tmp/dust-apps/CORE_datasets.csv |paste -sd "," -)
fetch CORE datasets_joins "id dataset point point_idx" "\\\"dataset\\\" in (${dataset_ids})"
# Field 3 of the joins is the referenced point id.
dataset_points_ids=$(cut -f 3 /tmp/dust-apps/CORE_datasets_joins.csv |paste -sd "," -)
fetch CORE datasets_points "id hash json" "\\\"id\\\" in (${dataset_points_ids})"

# ---- projects
import CORE projects "id" "id"

# ---- specifications
import CORE specifications "id project created hash specification" "hash specification"

# ---- datasets
import CORE datasets "id project created dataset_id hash" "hash"
# Points go in before joins; the joins import only keeps rows whose point exists locally.
import CORE datasets_points "id hash json" "hash json" "on conflict(hash) do nothing"
import CORE datasets_joins "id dataset point point_idx" "point point_idx" "" "and __copy.point in (select id from datasets_points)"

rm -R /tmp/dust-apps
Loading