Skip to content

Commit

Permalink
Add similarity computation and cleanup functions
Browse files Browse the repository at this point in the history
to utils.py
  • Loading branch information
freedompraise committed Nov 11, 2023
1 parent 4e53a4b commit d5b9f0e
Showing 1 changed file with 52 additions and 1 deletion.
53 changes: 52 additions & 1 deletion app/blog/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@
from openai.embeddings_utils import distances_from_embeddings
import pickle
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from .models import Post, Similarity
from django.db import models
from django.core.exceptions import ValidationError
import re

Expand Down Expand Up @@ -54,7 +59,7 @@ def create_context(question, df, max_len=1800, size="ada"):


def answer_question(
model="text-davinci-003",
model="gpt-3.5-turbo-instruct",
question=None,
max_len=1800,
size="ada",
Expand Down Expand Up @@ -91,6 +96,52 @@ def answer_question(
return response["choices"][0]["text"].strip()


def cleanup_similarities(post: Post, keep: int = 3) -> None:
    """Prune the stored similarities of ``post`` down to the best-scoring few.

    Considers every ``Similarity`` row that references ``post`` as either
    ``post1`` or ``post2``, keeps the ``keep`` rows with the highest
    ``score`` and deletes all the others.

    Args:
        post: The post whose similarity rows are pruned.
        keep: Number of highest-scoring rows to retain. Defaults to 3;
            negative values are treated as 0 (delete everything).
    """
    related = Similarity.objects.filter(
        models.Q(post1=post) | models.Q(post2=post)
    )

    # Resolve the surviving ids in Python instead of nesting a sliced
    # queryset inside ``id__in``: that produces a LIMIT inside an IN
    # subquery, which some backends (notably MySQL/MariaDB) reject.
    # The secondary "-id" key makes the choice deterministic on score ties.
    keep_ids = list(
        related.order_by("-score", "-id").values_list("id", flat=True)[
            : max(keep, 0)
        ]
    )

    # Everything that references the post but is not among the survivors goes.
    related.exclude(id__in=keep_ids).delete()


def compute_similarity(post_id: int) -> None:
    """Recompute and store the posts most similar to the given post.

    Builds a TF-IDF matrix over the "content title" text of the target post
    and of every other post with non-empty content, scores the other posts
    by cosine similarity against the target, and upserts a ``Similarity``
    row (``post1`` = target, ``post2`` = other post) for up to three of the
    highest-scoring posts. Finally calls ``cleanup_similarities`` for the
    target post.

    Returns without writing anything when there is no other post to compare
    with, or when the texts contain no token the vectorizer can use.

    Args:
        post_id: Primary key of the post to compute similarities for.

    Raises:
        Post.DoesNotExist: If no post with ``post_id`` exists.
    """
    post = Post.objects.get(id=post_id)

    # Materialise once: the same instances feed the corpus below and are
    # reused as ``post2`` instead of being re-fetched one query at a time.
    other_posts = list(Post.objects.exclude(id=post_id).exclude(content=""))
    if not other_posts:
        return  # No other posts to compare, exit the function.

    # Row 0 is the target post; rows 1.. line up with ``other_posts``.
    combined_texts = [f"{post.content} {post.title}"]
    combined_texts.extend(f"{op.content} {op.title}" for op in other_posts)

    try:
        tfidf_matrix = TfidfVectorizer().fit_transform(combined_texts)
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when no
        # document yields a usable token, e.g. whitespace-only texts.
        # Treat that like the "nothing to compare" case above.
        return

    scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])[0]
    num_similar_posts = min(len(other_posts), 3)
    # A stable sort keeps the original post order among equal scores, so
    # repeated runs over the same data pick the same posts.
    top_indices = np.argsort(-scores, kind="stable")[:num_similar_posts]

    for idx in top_indices:
        position = int(idx)
        Similarity.objects.update_or_create(
            post1=post,
            post2=other_posts[position],
            # Store a plain Python float rather than a numpy scalar.
            defaults={"score": float(scores[position])},
        )

    cleanup_similarities(post)


# Case-insensitive matcher for inline markup spans: anything from an opening
# "<a", "<img", "<video" or "<audio" up to the nearest matching "/a>",
# "/img>", "/video>" or "/audio>" on the same line ("." does not cross
# newlines because DOTALL is not set).
# NOTE(review): HTML <img> normally has no "</img>" closing tag, so the img
# alternative will rarely match real markup - confirm whether that is intended.
link_media_pattern = re.compile(
    r"<a.*?/a>|<img.*?/img>|<video.*?/video>|<audio.*?/audio>",
    re.IGNORECASE,
)
Expand Down

0 comments on commit d5b9f0e

Please sign in to comment.