Skip to content

Commit

Permalink
Added sentiment analysis function
Browse files Browse the repository at this point in the history
  • Loading branch information
MahimaRamireddy committed May 13, 2024
1 parent 86b8bd4 commit ce2b1ad
Show file tree
Hide file tree
Showing 2 changed files with 131 additions and 72 deletions.
107 changes: 107 additions & 0 deletions .ipynb_checkpoints/clustering movie review-checkpoint.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"import pandas as pd\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.cluster import KMeans\n",
"from sklearn.metrics import silhouette_score\n",
"from nltk.sentiment import SentimentIntensityAnalyzer\n",
"\n",
"# Load NLTK's sentiment analyzer\n",
"sid = SentimentIntensityAnalyzer()\n",
"\n",
"data = pd.read_csv('Product listing.csv')\n",
"\n",
"# Data preprocessing\n",
"def preprocess_text(text):\n",
" # Convert text to lowercase\n",
" text = text.lower()\n",
" # Tokenization can be done using regex or libraries like NLTK or spaCy\n",
" # Here, a simple split by space is used\n",
" tokens = text.split()\n",
" # Remove stopwords (you may need to download the stopwords list for your language)\n",
" stopwords = set(['the', 'and', 'is', 'in', 'to', 'it', 'this', 'of', 'for', 'with', 'as'])\n",
" tokens = [token for token in tokens if token not in stopwords]\n",
" return ' '.join(tokens)\n",
"\n",
"data['clean_text'] = data['product'].apply(preprocess_text)\n",
"\n",
"# TF-IDF vectorization\n",
"tfidf_vectorizer = TfidfVectorizer(max_features=1000) # You can adjust max_features as needed\n",
"tfidf_matrix = tfidf_vectorizer.fit_transform(data['clean_text'])\n",
"\n",
"# Clustering with K-means\n",
"k = 5 # Number of clusters (you can adjust this)\n",
"kmeans = KMeans(n_clusters=k, random_state=42)\n",
"kmeans.fit(tfidf_matrix)\n",
"\n",
"# Assign cluster labels to each review\n",
"data['cluster_label'] = kmeans.labels_\n",
"\n",
"# Sentiment Analysis\n",
"def get_sentiment(text):\n",
" # NLTK's sentiment analyzer\n",
" sentiment_scores = sid.polarity_scores(text)\n",
" # Classify sentiment based on compound score\n",
" if sentiment_scores['compound'] >= 0.05:\n",
" return 'Positive'\n",
" elif sentiment_scores['compound'] <= -0.05:\n",
" return 'Negative'\n",
" else:\n",
" return 'Neutral'\n",
" \n",
"data['sentiment'] = data['clean_text'].apply(get_sentiment)\n",
"\n",
"\n",
"# Evaluate clustering using silhouette score\n",
"silhouette_avg = silhouette_score(tfidf_matrix, kmeans.labels_)\n",
"print(f\"Silhouette Score: {silhouette_avg}\")\n",
"\n",
"# Display some reviews from each cluster\n",
"for cluster_id in range(k):\n",
" cluster_samples = data[data['cluster_label'] == cluster_id].sample(5) # Displaying 5 samples per cluster\n",
" print(f\"\\nCluster {cluster_id}:\")\n",
" for index, row in cluster_samples.iterrows():\n",
" print(row['product'])\n",
" print(\"Sentiment:\", row['sentiment'])\n",
" print('-' * 50)\n",
"\n",
"# You can further analyze the clusters and refine the process as needed\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
96 changes: 24 additions & 72 deletions clustering movie review.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,83 +2,19 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Silhouette Score: 0.057004055728191866\n",
"\n",
"Cluster 0:\n",
"Acer ED322QR 31.5 Inch (80.01 cm) Full HD Curved VA Backlit LED Monitor I 144Hz Refresh Rate I Zero Frame I AMD Free Sync I Eye Care Features I Stereo Speakers\n",
"--------------------------------------------------\n",
"HP 3.1 USB HP 32 GB Flash Drive\n",
"--------------------------------------------------\n",
"Logitech MX Anywhere 3 Compact Performance Mouse – Wireless, Magnetic Scrolling, Ergonomic, 4000DPI Sensor, Custom Buttons, USB-C, Bluetooth, Apple Mac, iPad, Windows PC, Linux, Chrome - Graphite\n",
"--------------------------------------------------\n",
"SanDisk Cruzer Blade 32GB USB Flash Drive\n",
"--------------------------------------------------\n",
"APLT-Portable Slim Wireless Mouse for Laptops 2.4Ghz Silent Wireless Optical Mouse for Laptop, Desktop ( White)\n",
"--------------------------------------------------\n",
"\n",
"Cluster 1:\n",
"Zebronics Zeb-Corolla In Ear Wired Earphone with Mic, 3.5mm Jack, 1.2 Meter Cable, Multi Function Button\n",
"--------------------------------------------------\n",
"MINISO We Bare Bears in-Ear Wired Headphones with Microphone, Comfortable Earbuds Cute Earphones for Mobile Smartphones Apple Xiaomi Realme Oppo Samsung and More - Brown\n",
"--------------------------------------------------\n",
"pTron Tangent Evo with 14Hrs Playback, Bluetooth 5.0 Wireless Headphones with Deep Bass, IPX4 Water Resistance, Ergonomic & Snug-fit, Voice Assistance, Magnetic Earbuds & Built-in HD Mic (Black)\n",
"--------------------------------------------------\n",
"Ambrane Dots 38 True Wireless Earbuds TWS with Pure HD Bass, 16H Playtime, IPX4 Waterproof, Responsive Touch Sensors for Multifunctions, Compact Type-C Charging Case (Green), Normal\n",
"--------------------------------------------------\n",
"Peripage A6 203dpi Thermal Label Printer Inkless Pocket Printer Bluetooth Connection Office Assistant/Life Helper DIY Printing Travel Recorder for iOS/Android/Windows\n",
"--------------------------------------------------\n",
"\n",
"Cluster 2:\n",
"AVITA LIBER V NS14A8INF542-CS Thin and Light 14 inch (35.56cm) Laptop( Intel Core i5-10210U/ 8GB/256GB SSD /Win 10 Home/ Backlit Keyboard/ Fingerprint Sensor/ MSO 365) 1.28kg, Cloud Silver\n",
"--------------------------------------------------\n",
"(Renewed) HP ProBook 7th Gen Core i5 Laptop, 16 GB RAM, 240GB NVME SSD, Intel HD Graphics, 15.6 inch (39.62 cms) FHD Screen, Win 10, MS Office, Backlit Keyboard, Fingerprint sensor, Black\n",
"--------------------------------------------------\n",
"CHIST Gaming Desktop Intel Core i5 8GB,GT 710 2GB Graphic Card, 19 Full HD Monitor, Keyboard Mouse, Wi-Fi Ready to Play (120GB SSD 1TB HDD)\n",
"--------------------------------------------------\n",
"(Renewed) Lenovo ThinkCenter M58 19-inch (48.26 cm) Desktop (Intel Core2 Duo 4 GB 500 GB HDD Windows 7 Professional MS Office), Black\n",
"--------------------------------------------------\n",
"Lenovo ThinkBook 15 Intel 11th Gen Core i5 15.6\" (39.62 cm) FHD IPS 300 nits Antiglare 100% sRGB Thin and Light Laptop (16GB/1TB HDD+128GB SSD/Windows 10/MS Office/Mineral Grey/1.7 Kg), 20VEA0HKIH\n",
"--------------------------------------------------\n",
"\n",
"Cluster 3:\n",
"Mi 80 cm (32 inches) Horizon Edition HD Ready Android Smart LED TV 4A|L32M6-EI (Grey)\n",
"--------------------------------------------------\n",
"Foxsky 127 cm (50 inches) 4K Ultra HD Smart LED TV 50FS-VS (Black) (2021 Model) | With Voice Assistant\n",
"--------------------------------------------------\n",
"Kevin 80 cm (32 Inches) HD Ready Smart LED TV KN32A (Black) (2021 Model) | With Alexa Built-in\n",
"--------------------------------------------------\n",
"Samsung 108 cm (43 inches) 4K Ultra HD Smart QLED TV QA43Q60AAKLXL (Black) (2021 Model)\n",
"--------------------------------------------------\n",
"eAirtec 60 cm (24 Inches) HD Ready Smart Android LED TV 24DJSmart (Black) (2021 Model)\n",
"--------------------------------------------------\n",
"\n",
"Cluster 4:\n",
"Ovista- 10000mAH Digital Display Power Bank with inbuilt 4 in 1 Cable USB Input Port with Fast Charging 10000mAh Slim Power Bank with 5V/2A Fast Charging (Model-PRB035)- Black\n",
"--------------------------------------------------\n",
"Ambrane 5000mAh Li-Polymer Powerbank with Fast Charging & Compact Size (PP-501, Pink)\n",
"--------------------------------------------------\n",
"URBN 20000mAh Li-Polymer Ultra Compact Type-C Power Bank with 12W Fast Charge, Type C & Micro Input (Black)\n",
"--------------------------------------------------\n",
"Conekt 10000mAh Li-Polymer Powerbank Zeal Proton Pro (White)\n",
"--------------------------------------------------\n",
"Zeal PL-10000 10400mAh Power Bank\n",
"--------------------------------------------------\n"
]
}
],
"outputs": [],
"source": [
"\n",
"import pandas as pd\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.cluster import KMeans\n",
"from sklearn.metrics import silhouette_score\n",
"from nltk.sentiment import SentimentIntensityAnalyzer\n",
"\n",
"# Load NLTK's sentiment analyzer\n",
"sid = SentimentIntensityAnalyzer()\n",
"\n",
"data = pd.read_csv('Product listing.csv')\n",
"\n",
Expand Down Expand Up @@ -108,6 +44,21 @@
"# Assign cluster labels to each review\n",
"data['cluster_label'] = kmeans.labels_\n",
"\n",
"# Sentiment Analysis\n",
"def get_sentiment(text):\n",
" # NLTK's sentiment analyzer\n",
" sentiment_scores = sid.polarity_scores(text)\n",
" # Classify sentiment based on compound score\n",
" if sentiment_scores['compound'] >= 0.05:\n",
" return 'Positive'\n",
" elif sentiment_scores['compound'] <= -0.05:\n",
" return 'Negative'\n",
" else:\n",
" return 'Neutral'\n",
" \n",
"data['sentiment'] = data['clean_text'].apply(get_sentiment)\n",
"\n",
"\n",
"# Evaluate clustering using silhouette score\n",
"silhouette_avg = silhouette_score(tfidf_matrix, kmeans.labels_)\n",
"print(f\"Silhouette Score: {silhouette_avg}\")\n",
Expand All @@ -118,6 +69,7 @@
" print(f\"\\nCluster {cluster_id}:\")\n",
" for index, row in cluster_samples.iterrows():\n",
" print(row['product'])\n",
" print(\"Sentiment:\", row['sentiment'])\n",
" print('-' * 50)\n",
"\n",
"# You can further analyze the clusters and refine the process as needed\n"
Expand All @@ -126,7 +78,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
Expand All @@ -140,7 +92,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
"version": "3.11.4"
}
},
"nbformat": 4,
Expand Down

0 comments on commit ce2b1ad

Please sign in to comment.