From ce2b1ad8d2176d70c4d44aac51ee7f0043be8d7b Mon Sep 17 00:00:00 2001 From: MahimaRamireddy Date: Mon, 13 May 2024 16:36:08 +0530 Subject: [PATCH] Added sentiment analysis function --- .../clustering movie review-checkpoint.ipynb | 107 ++++++++++++++++++ clustering movie review.ipynb | 96 ++++------------ 2 files changed, 131 insertions(+), 72 deletions(-) create mode 100644 .ipynb_checkpoints/clustering movie review-checkpoint.ipynb diff --git a/.ipynb_checkpoints/clustering movie review-checkpoint.ipynb b/.ipynb_checkpoints/clustering movie review-checkpoint.ipynb new file mode 100644 index 00000000..7d129ee4 --- /dev/null +++ b/.ipynb_checkpoints/clustering movie review-checkpoint.ipynb @@ -0,0 +1,107 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "import pandas as pd\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.cluster import KMeans\n", + "from sklearn.metrics import silhouette_score\n", + "from nltk.sentiment import SentimentIntensityAnalyzer\n", + "\n", + "# Load NLTK's sentiment analyzer\n", + "sid = SentimentIntensityAnalyzer()\n", + "\n", + "data = pd.read_csv('Product listing.csv')\n", + "\n", + "# Data preprocessing\n", + "def preprocess_text(text):\n", + " # Convert text to lowercase\n", + " text = text.lower()\n", + " # Tokenization can be done using regex or libraries like NLTK or spaCy\n", + " # Here, a simple split by space is used\n", + " tokens = text.split()\n", + " # Remove stopwords (you may need to download the stopwords list for your language)\n", + " stopwords = set(['the', 'and', 'is', 'in', 'to', 'it', 'this', 'of', 'for', 'with', 'as'])\n", + " tokens = [token for token in tokens if token not in stopwords]\n", + " return ' '.join(tokens)\n", + "\n", + "data['clean_text'] = data['product'].apply(preprocess_text)\n", + "\n", + "# TF-IDF vectorization\n", + "tfidf_vectorizer = TfidfVectorizer(max_features=1000) # You can adjust max_features as needed\n", + "tfidf_matrix = tfidf_vectorizer.fit_transform(data['clean_text'])\n", + "\n", + "# Clustering with K-means\n", + "k = 5 # Number of clusters (you can adjust this)\n", + "kmeans = KMeans(n_clusters=k, random_state=42)\n", + "kmeans.fit(tfidf_matrix)\n", + "\n", + "# Assign cluster labels to each review\n", + "data['cluster_label'] = kmeans.labels_\n", + "\n", + "# Sentiment Analysis\n", + "def get_sentiment(text):\n", + " # NLTK's sentiment analyzer\n", + " sentiment_scores = sid.polarity_scores(text)\n", + " # Classify sentiment based on compound score\n", + " if sentiment_scores['compound'] >= 0.05:\n", + " return 'Positive'\n", + " elif sentiment_scores['compound'] <= -0.05:\n", + " return 'Negative'\n", + " else:\n", + " return 'Neutral'\n", + " \n", + "data['sentiment'] = data['clean_text'].apply(get_sentiment)\n", + "\n", + "\n", + "# Evaluate clustering using silhouette score\n", + "silhouette_avg = silhouette_score(tfidf_matrix, kmeans.labels_)\n", + "print(f\"Silhouette Score: {silhouette_avg}\")\n", + "\n", + "# Display some reviews from each cluster\n", + "for cluster_id in range(k):\n", + " cluster_samples = data[data['cluster_label'] == cluster_id].sample(5) # Displaying 5 samples per cluster\n", + " print(f\"\\nCluster {cluster_id}:\")\n", + " for index, row in cluster_samples.iterrows():\n", + " print(row['product'])\n", + " print(\"Sentiment:\", row['sentiment'])\n", + " print('-' * 50)\n", + "\n", + "# You can further analyze the clusters and refine the process as needed\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/clustering movie review.ipynb b/clustering movie review.ipynb index c77a47af..80e50bd6 100644 --- a/clustering movie review.ipynb +++ b/clustering movie review.ipynb @@ -2,83 +2,19 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Silhouette Score: 0.057004055728191866\n", - "\n", - "Cluster 0:\n", - "Acer ED322QR 31.5 Inch (80.01 cm) Full HD Curved VA Backlit LED Monitor I 144Hz Refresh Rate I Zero Frame I AMD Free Sync I Eye Care Features I Stereo Speakers\n", - "--------------------------------------------------\n", - "HP 3.1 USB HP 32 GB Flash Drive\n", - "--------------------------------------------------\n", - "Logitech MX Anywhere 3 Compact Performance Mouse – Wireless, Magnetic Scrolling, Ergonomic, 4000DPI Sensor, Custom Buttons, USB-C, Bluetooth, Apple Mac, iPad, Windows PC, Linux, Chrome - Graphite\n", - "--------------------------------------------------\n", - "SanDisk Cruzer Blade 32GB USB Flash Drive\n", - "--------------------------------------------------\n", - "APLT-Portable Slim Wireless Mouse for Laptops 2.4Ghz Silent Wireless Optical Mouse for Laptop, Desktop ( White)\n", - "--------------------------------------------------\n", - "\n", - "Cluster 1:\n", - "Zebronics Zeb-Corolla In Ear Wired Earphone with Mic, 3.5mm Jack, 1.2 Meter Cable, Multi Function Button\n", - "--------------------------------------------------\n", - "MINISO We Bare Bears in-Ear Wired Headphones with Microphone, Comfortable Earbuds Cute Earphones for Mobile Smartphones Apple Xiaomi Realme Oppo Samsung and More - Brown\n", - "--------------------------------------------------\n", - "pTron Tangent Evo with 14Hrs Playback, Bluetooth 5.0 Wireless Headphones with Deep Bass, IPX4 Water Resistance, Ergonomic & Snug-fit, Voice Assistance, Magnetic Earbuds & Built-in HD Mic (Black)\n", - "--------------------------------------------------\n", - "Ambrane Dots 38 True Wireless Earbuds TWS with Pure HD Bass, 16H Playtime, IPX4 Waterproof, Responsive Touch Sensors for Multifunctions, Compact Type-C Charging Case (Green), Normal\n", - "--------------------------------------------------\n", - "Peripage A6 203dpi Thermal Label Printer Inkless Pocket Printer Bluetooth Connection Office Assistant/Life Helper DIY Printing Travel Recorder for iOS/Android/Windows\n", - "--------------------------------------------------\n", - "\n", - "Cluster 2:\n", - "AVITA LIBER V NS14A8INF542-CS Thin and Light 14 inch (35.56cm) Laptop( Intel Core i5-10210U/ 8GB/256GB SSD /Win 10 Home/ Backlit Keyboard/ Fingerprint Sensor/ MSO 365) 1.28kg, Cloud Silver\n", - "--------------------------------------------------\n", - "(Renewed) HP ProBook 7th Gen Core i5 Laptop, 16 GB RAM, 240GB NVME SSD, Intel HD Graphics, 15.6 inch (39.62 cms) FHD Screen, Win 10, MS Office, Backlit Keyboard, Fingerprint sensor, Black\n", - "--------------------------------------------------\n", - "CHIST Gaming Desktop Intel Core i5 8GB,GT 710 2GB Graphic Card, 19 Full HD Monitor, Keyboard Mouse, Wi-Fi Ready to Play (120GB SSD 1TB HDD)\n", - "--------------------------------------------------\n", - "(Renewed) Lenovo ThinkCenter M58 19-inch (48.26 cm) Desktop (Intel Core2 Duo 4 GB 500 GB HDD Windows 7 Professional MS Office), Black\n", - "--------------------------------------------------\n", - "Lenovo ThinkBook 15 Intel 11th Gen Core i5 15.6\" (39.62 cm) FHD IPS 300 nits Antiglare 100% sRGB Thin and Light Laptop (16GB/1TB HDD+128GB SSD/Windows 10/MS Office/Mineral Grey/1.7 Kg), 20VEA0HKIH\n", - "--------------------------------------------------\n", - "\n", - "Cluster 3:\n", - "Mi 80 cm (32 inches) Horizon Edition HD Ready Android Smart LED TV 4A|L32M6-EI (Grey)\n", - "--------------------------------------------------\n", - "Foxsky 127 cm (50 inches) 4K Ultra HD Smart LED TV 50FS-VS (Black) (2021 Model) | With Voice Assistant\n", - "--------------------------------------------------\n", - "Kevin 80 cm (32 Inches) HD Ready Smart LED TV KN32A (Black) (2021 Model) | With Alexa Built-in\n", - "--------------------------------------------------\n", - "Samsung 108 cm (43 inches) 4K Ultra HD Smart QLED TV QA43Q60AAKLXL (Black) (2021 Model)\n", - "--------------------------------------------------\n", - "eAirtec 60 cm (24 Inches) HD Ready Smart Android LED TV 24DJSmart (Black) (2021 Model)\n", - "--------------------------------------------------\n", - "\n", - "Cluster 4:\n", - "Ovista- 10000mAH Digital Display Power Bank with inbuilt 4 in 1 Cable USB Input Port with Fast Charging 10000mAh Slim Power Bank with 5V/2A Fast Charging (Model-PRB035)- Black\n", - "--------------------------------------------------\n", - "Ambrane 5000mAh Li-Polymer Powerbank with Fast Charging & Compact Size (PP-501, Pink)\n", - "--------------------------------------------------\n", - "URBN 20000mAh Li-Polymer Ultra Compact Type-C Power Bank with 12W Fast Charge, Type C & Micro Input (Black)\n", - "--------------------------------------------------\n", - "Conekt 10000mAh Li-Polymer Powerbank Zeal Proton Pro (White)\n", - "--------------------------------------------------\n", - "Zeal PL-10000 10400mAh Power Bank\n", - "--------------------------------------------------\n" - ] - } - ], + "outputs": [], "source": [ "\n", "import pandas as pd\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.cluster import KMeans\n", "from sklearn.metrics import silhouette_score\n", + "from nltk.sentiment import SentimentIntensityAnalyzer\n", + "\n", + "# Load NLTK's sentiment analyzer\n", + "sid = SentimentIntensityAnalyzer()\n", "\n", "data = pd.read_csv('Product listing.csv')\n", "\n", @@ -108,6 +44,21 @@ "# Assign cluster labels to each review\n", "data['cluster_label'] = kmeans.labels_\n", "\n", + "# Sentiment Analysis\n", + "def get_sentiment(text):\n", + " # NLTK's sentiment analyzer\n", + " sentiment_scores = sid.polarity_scores(text)\n", + " # Classify sentiment based on compound score\n", + " if sentiment_scores['compound'] >= 0.05:\n", + " return 'Positive'\n", + " elif sentiment_scores['compound'] <= -0.05:\n", + " return 'Negative'\n", + " else:\n", + " return 'Neutral'\n", + " \n", + "data['sentiment'] = data['clean_text'].apply(get_sentiment)\n", + "\n", + "\n", "# Evaluate clustering using silhouette score\n", "silhouette_avg = silhouette_score(tfidf_matrix, kmeans.labels_)\n", "print(f\"Silhouette Score: {silhouette_avg}\")\n", @@ -118,6 +69,7 @@ " print(f\"\\nCluster {cluster_id}:\")\n", " for index, row in cluster_samples.iterrows():\n", " print(row['product'])\n", + " print(\"Sentiment:\", row['sentiment'])\n", " print('-' * 50)\n", "\n", "# You can further analyze the clusters and refine the process as needed\n" @@ -126,7 +78,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -140,7 +92,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.11.4" } }, "nbformat": 4,