Add similarity computation and cleanup functions to utils.py
jsolly · Nov 11, 2023 · d5b9f0e · d5b9f0e
1 parent 4e53a4b
commit d5b9f0e
Showing 1 changed file with 52 additions and 1 deletion.
diff --git a/app/blog/utils.py b/app/blog/utils.py
@@ -2,6 +2,11 @@
 from openai.embeddings_utils import distances_from_embeddings
 import pickle
 from pathlib import Path
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+import numpy as np
+from .models import Post, Similarity
+from django.db import models
 from django.core.exceptions import ValidationError
 import re
 
@@ -54,7 +59,7 @@ def create_context(question, df, max_len=1800, size="ada"):
 
 
 def answer_question(
-    model="text-davinci-003",
+    model="gpt-3.5-turbo-instruct",
     question=None,
     max_len=1800,
     size="ada",
@@ -91,6 +96,52 @@ def answer_question(
     return response["choices"][0]["text"].strip()
 
 
+def cleanup_similarities(post: Post, keep: int = 3) -> None:
+    """Delete all but the highest-scoring Similarity rows involving ``post``.
+
+    A row involves ``post`` when it appears as either ``post1`` or ``post2``.
+
+    Args:
+        post: The post whose similarity rows should be pruned.
+        keep: How many of the highest-scoring rows to keep. Defaults to 3.
+    """
+    related = Similarity.objects.filter(
+        models.Q(post1=post) | models.Q(post2=post)
+    )
+
+    # Evaluate the ids up front. Passing the sliced queryset straight to
+    # ``id__in`` would produce a LIMIT inside an IN subquery, which some
+    # database backends (notably MySQL) refuse to run.
+    top_ids = list(
+        related.order_by("-score").values_list("id", flat=True)[:keep]
+    )
+
+    # Everything that is not among the kept ids is removed.
+    related.exclude(id__in=top_ids).delete()
+
+
+def compute_similarity(post_id: int) -> None:
+    """Score one post against every other post and store its closest matches.
+
+    Each post is represented by its content followed by its title. The texts
+    are vectorised with TF-IDF, the target post is compared with every other
+    post by cosine similarity, and up to three of the best matches are saved
+    as ``Similarity`` rows with ``post1`` set to the target post. Finally
+    ``cleanup_similarities`` is called for the target post.
+
+    Args:
+        post_id: Primary key of the post to compute similarities for.
+
+    Raises:
+        Post.DoesNotExist: If no post with ``post_id`` exists.
+    """
+    post = Post.objects.get(id=post_id)
+
+    # Fetch only the columns that are needed, and evaluate the queryset exactly
+    # once so the primary keys line up with the rows of the TF-IDF matrix.
+    candidates = list(
+        Post.objects.exclude(id=post_id)
+        .exclude(content="")
+        .values_list("pk", "content", "title")
+    )
+    if not candidates:
+        return  # No other posts to compare, exit the function.
+
+    texts = [f"{post.content} {post.title}"]
+    texts.extend(f"{content} {title}" for _, content, title in candidates)
+
+    try:
+        tfidf_matrix = TfidfVectorizer().fit_transform(texts)
+    except ValueError:
+        # scikit-learn raises ValueError when no document yields a single
+        # token (empty vocabulary); there is nothing to compare in that case.
+        return
+
+    # Row 0 is the target post; the remaining rows follow ``candidates`` order.
+    scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])[0]
+    num_similar_posts = min(len(candidates), 3)
+    top_indices = np.argsort(-scores)[:num_similar_posts]
+
+    for idx in top_indices:
+        # Reference the related post by id instead of fetching it again, and
+        # store a plain Python float rather than a NumPy scalar.
+        Similarity.objects.update_or_create(
+            post1=post,
+            post2_id=candidates[idx][0],
+            defaults={"score": float(scores[idx])},
+        )
+
+    cleanup_similarities(post)
+
+
 # Case-insensitive pattern matching, non-greedily, a span that starts at
 # "<a", "<img", "<video" or "<audio" and ends at the corresponding "/a>",
 # "/img>", "/video>" or "/audio>".
 # No DOTALL flag is set, so a match cannot extend across a newline.
 # NOTE(review): the "<img" alternative only matches when a literal "/img>"
 # follows; a self-closing "<img ... />" is not matched - confirm intended.
 link_media_pattern = re.compile(
     r"<a.*?/a>|<img.*?/img>|<video.*?/video>|<audio.*?/audio>", flags=re.IGNORECASE
 )