From 7a42081730ceac4c8dec7aa244648715feb182c8 Mon Sep 17 00:00:00 2001 From: Mark McDonald Date: Wed, 18 Oct 2023 05:49:53 +0800 Subject: [PATCH] Limit embedding requests to text samples under 10k (#155) The `embeddings-gecko-001` model does not seem to handle text inputs over 10k characters, so the tutorial now keeps only text samples shorter than 10,000 characters. --- site/en/examples/clustering_with_embeddings.ipynb | 2 ++ 1 file changed, 2 insertions(+) diff --git a/site/en/examples/clustering_with_embeddings.ipynb b/site/en/examples/clustering_with_embeddings.ipynb index 31f3e0f63..da3fac750 100644 --- a/site/en/examples/clustering_with_embeddings.ipynb +++ b/site/en/examples/clustering_with_embeddings.ipynb @@ -418,6 +418,8 @@ "df_train['Label'] = newsgroups_train.target\n", "# Match label to target name index\n", "df_train['Class Name'] = df_train['Label'].map(newsgroups_train.target_names.__getitem__)\n", + "# Retain text samples that can be used in the gecko model.\n", + "df_train = df_train[df_train['Text'].str.len() < 10000]\n", "\n", "df_train" ]