# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
/kaggle/input/threads-an-instagram-app-reviews/threads_reviews.csv
!pip install googletrans==4.0.0rc1
Successfully installed chardet-3.0.4 googletrans-4.0.0rc1 h11-0.9.0 h2-3.2.0 hpack-3.0.0 hstspreload-2024.8.1 httpcore-0.9.1 httpx-0.13.3 hyperframe-5.2.0 idna-2.10 rfc3986-1.5.0
!pip install emot
Collecting emot Downloading emot-3.1-py3-none-any.whl (61 kB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 61.5/61.5 kB 640.5 kB/s eta 0:00:00a 0:00:01 Installing collected packages: emot Successfully installed emot-3.1
!pip install emoji
Requirement already satisfied: emoji in /opt/conda/lib/python3.10/site-packages (2.6.0)
# # import emot
# # text = "hey🤣"
# # emot_object = emot.core.emot()
# # emot_object.emoji(text)
# # to_be_rep = emot_object.emoji(text)['value'][0]
# # rep_to = emot_object.emoji(text)['mean'][0]
# # re.sub(rf"{to_be_rep}", rep_to, text)
# import emoji
# text = 'I love Python ❤️, it is brilliant 👍'
# text2 = ":red_heartface_blowing_a_kissheart_with_ribbonsmiling_face_with_hearts:"
# print(emoji.demojize(text))
# print(emoji.emojize(text2))
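# A small runnable sketch of the emoji handling explored above (hedged: the exact ":alias:" names
# depend on the installed emoji version, so the aliases in the comments are illustrative).
import emoji
sample = 'I love Python ❤️, it is brilliant 👍'
demojized = emoji.demojize(sample)   # e.g. 'I love Python :red_heart:, it is brilliant :thumbs_up:'
print(demojized)
print(emoji.emojize(demojized))      # round-trips the aliases back to emoji characters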
import warnings
warnings.filterwarnings("ignore")
# -------------------------------------------------------
# For text analysis
# import spacy
# nlp = spacy.load("en_core_web_sm")
import re # for regular expression
import string
from googletrans import Translator, LANGUAGES
# Initialize the translator
translator = Translator()
puncs = string.punctuation
import nltk
nltk.download("stopwords")
nltk.download("punkt")  # needed by nltk.word_tokenize below
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
porter_stem = PorterStemmer()
stops = list(set(stopwords.words('english'))) # set of stopwords from the NLTK tool
defined_stop_words = ['app', 'instagram', 'twitter', 'facebook', 'account', 'profile', 'follower', 'followers',
                      'tweet', 'post', 'like', 'status', 'notification', 'comment', 'video', 'post', 'picture',
                      'tag', 'hashtag', 'story', 'status', 'notification', 'dm', 'message', 'mention', 'threads',
                      'application', 'insta', 'whatsapp', 'user', "n't", 'could', 'may', 'must', "i", "followe",
                      "people", "see", "posts"]
defined_stop_words += list(puncs)
stops += defined_stop_words
import emot
emot_object = emot.core.emot()
import emoji
# -------------------------------------------------------
import random
# -------------------------------------------------------
# for data visualization
import matplotlib.pyplot as plt
import seaborn as sns
# -------------------------------------------------------
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
[nltk_data] Downloading package stopwords to /usr/share/nltk_data... [nltk_data] Package stopwords is already up-to-date!
%time
data = pd.read_csv("/kaggle/input/threads-an-instagram-app-reviews/threads_reviews.csv")
display(data.head())
print(f"Number of data points= {data.shape[0]}\nNumber of features= {data.shape[1]}")
CPU times: user 2 µs, sys: 0 ns, total: 2 µs Wall time: 7.63 µs
 | source | review_description | rating | review_date
---|---|---|---|---
0 | Google Play | Meh. Not the greatest experience on a Chromebo... | 2 | 2023-07-08 14:18:24 |
1 | Google Play | Pretty good for a first launch!! Its easy to u... | 3 | 2023-07-19 20:52:48 |
2 | Google Play | For a brand new app, it's very well optimized.... | 3 | 2023-07-06 23:03:11 |
3 | Google Play | Great app with a lot of potential! However, th... | 3 | 2023-07-10 00:53:25 |
4 | Google Play | The app is good, but it needs a lot of functio... | 3 | 2023-07-06 16:57:43 |
Number of data points= 32910
Number of features= 4
Insights into our dataset
data.shape
(32910, 4)
# let's get insight into some data points.
indices = random.choices(range(data.shape[0]), k=10)  # sample 10 random row indices (with replacement)
for index in indices:
    print("Text:", data['review_description'][index])
    print("rating: ", data['rating'][index])
    print("-"*100)
Text: it has the potential to be way better than twitter (which to be fair, it’s not that hard with how it’s current state is under elon) but there are some features that twitter has that are useful that aren’t on threads, for example an indicator on one’s profile saying if they are following you or not, also dms, polls, and many more options that can come in future updates. i feel like it would’ve been smarter to get more features like those in before release date
rating: 4
----------------------------------------------------------------------------------------------------
Text: Nice 🙏🙏🙏
rating: 5
----------------------------------------------------------------------------------------------------
Text: Nicee
rating: 4
----------------------------------------------------------------------------------------------------
Text: So aku orang Malaysia pertama yang comment kat thread ni? 🇲🇾 ( I'm 1st Malaysian to comment and review here) ❗Subscribe my youtube channel (FakFitness) 😂
rating: 5
----------------------------------------------------------------------------------------------------
Text: Hashtags, please!!
rating: 4
----------------------------------------------------------------------------------------------------
Text: Fix the gallery selections
rating: 1
----------------------------------------------------------------------------------------------------
Text: ऐलन मस्क तो गया अब 😆😆😆😆
rating: 5
----------------------------------------------------------------------------------------------------
Text: Normal
rating: 4
----------------------------------------------------------------------------------------------------
Text: It's new give experience
rating: 5
----------------------------------------------------------------------------------------------------
Text: Best app
rating: 5
----------------------------------------------------------------------------------------------------
data.describe().T
 | count | mean | std | min | 25% | 50% | 75% | max
---|---|---|---|---|---|---|---|---
rating | 32910.0 | 3.398481 | 1.75148 | 1.0 | 1.0 | 4.0 | 5.0 | 5.0 |
Ratings histogram to get a feel for the rating distribution
sns.displot(data, x='rating', shrink=0.7, hue='source', multiple="dodge", discrete=True)
plt.title("Ratings distribution")
plt.show()
word_counts = data['review_description'].str.split().explode().value_counts()
word_count_df = pd.DataFrame(word_counts).reset_index()
word_count_df.columns = ['word', 'frequency']
word_count_df.head(15)
 | word | frequency
---|---|---
0 | to | 10563 |
1 | I | 9707 |
2 | the | 9190 |
3 | app | 8076 |
4 | and | 7565 |
5 | is | 6823 |
6 | a | 6740 |
7 | it | 5584 |
8 | of | 4734 |
9 | this | 4076 |
10 | for | 3067 |
11 | but | 2983 |
12 | you | 2954 |
13 | 2947 | |
14 | my | 2883 |
As we can see above, the most frequent tokens are stopwords that we have to remove to get better insight.
def translate_to_english(text):
    translated = translator.translate(text, dest='en')
    return translated.text

def text_process(text):
    text = text.lower() # case folding
    text_tokens = nltk.word_tokenize(text) # tokenizing our text
    text_no_stop = [token for token in text_tokens if token not in stops] # removing stop words
    text_stemmed = [porter_stem.stem(token) for token in text_no_stop] # stemming (computed but not used below; the unstemmed tokens are kept)
    text = " ".join(text_no_stop)
    text = re.sub("-", " ", text) # removing -
    #text = emoji.demojize(text) # give the emojis a textual meaning, COMMENT THIS LINE WHEN YOU WANT TO VISUALIZE WORDS IN THE SCATTER PLOT BELOW
    text = re.sub(":", "", text)
    return text
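Before applying the pipeline to the whole dataset, here is a small illustrative check of the two helpers above on a single made-up non-English review. Note that googletrans 4.0.0rc1 calls an unofficial web endpoint, so the translation step needs network access and can fail; the example review and its error handling are assumptions for demonstration only.
sample_review = "Aplikasi ini bagus sekali"   # hypothetical Indonesian review ("This app is very good")
try:
    sample_review_en = translate_to_english(sample_review)
except Exception as err:                      # googletrans can raise on network/API hiccups
    sample_review_en = sample_review
    print("translation skipped:", err)
print(sample_review_en)
print(text_process(sample_review_en))         # lower-cased, tokenized, stopword-stripped text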
data['processed_text'] = data['review_description'].apply(lambda text : text_process(text))
data
 | source | review_description | rating | review_date | processed_text
---|---|---|---|---|---
0 | Google Play | Meh. Not the greatest experience on a Chromebo... | 2 | 2023-07-08 14:18:24 | meh greatest experience chromebook seems custo... |
1 | Google Play | Pretty good for a first launch!! Its easy to u... | 3 | 2023-07-19 20:52:48 | pretty good first launch easy use self explana... |
2 | Google Play | For a brand new app, it's very well optimized.... | 3 | 2023-07-06 23:03:11 | brand new 's well optimized however 's missing... |
3 | Google Play | Great app with a lot of potential! However, th... | 3 | 2023-07-10 00:53:25 | great lot potential however lot needs fixed ex... |
4 | Google Play | The app is good, but it needs a lot of functio... | 3 | 2023-07-06 16:57:43 | good needs lot functionality example searching... |
... | ... | ... | ... | ... | ... |
32905 | App Store | This killed my dog. Mark zuckerburg strangled ... | 1 | 2023-07-06 01:23:55 | killed dog mark zuckerburg strangled dog gone |
32906 | App Store | Add Search and hashtag like Twitter ! | 1 | 2023-07-19 08:01:06 | add search |
32907 | App Store | bad twister | 1 | 2023-07-17 06:39:13 | bad twister |
32908 | App Store | Yet another trash from Meta. | 1 | 2023-07-07 17:47:16 | yet another trash meta |
32909 | App Store | Nothing special this app is just a copy of twi... | 1 | 2023-07-07 07:01:43 | nothing special copy |
32910 rows × 5 columns
word_counts = data['processed_text'].str.split().explode().value_counts()
word_count_df = pd.DataFrame(word_counts).reset_index()
word_count_df.columns = ['word', 'frequency']
word_count_df.head(10)
 | word | frequency
---|---|---
0 | good | 4343 |
1 | 's | 3737 |
2 | ’ | 2705 |
3 | nice | 2233 |
4 | better | 1761 |
5 | ... | 1689 |
6 | use | 1407 |
7 | follow | 1369 |
8 | ca | 1322 |
9 | great | 1229 |
# let's get insight into some data points.
indices = random.choices(range(data.shape[0]), k=10)
for index in indices:
    print("Text:", data['processed_text'][index])
    print("rating: ", data['rating'][index])
    print("-"*100)
Text: posted one day everything stopped working go someone else ’ also says “ content available ” ’ tried many things fix ’ fixed ’ point since ’ working even delete 👍🏻
rating: 1
----------------------------------------------------------------------------------------------------
Text: good
rating: 5
----------------------------------------------------------------------------------------------------
Text: good 👌👌👌👌😊😊
rating: 5
----------------------------------------------------------------------------------------------------
Text: totally insane freedom expression meta products also leaks data bad recommended
rating: 1
----------------------------------------------------------------------------------------------------
Text: great
rating: 4
----------------------------------------------------------------------------------------------------
Text:
rating: 4
----------------------------------------------------------------------------------------------------
Text: ca open others think still better ui perfect still better fix interface unblock andrew tate use
rating: 1
----------------------------------------------------------------------------------------------------
Text: 's great
rating: 5
----------------------------------------------------------------------------------------------------
Text: need
rating: 1
----------------------------------------------------------------------------------------------------
Text: rubbish properly optimized
rating: 1
----------------------------------------------------------------------------------------------------
Let's see the word distribution for each rating
data[['processed_text','rating']]
 | processed_text | rating
---|---|---
0 | meh greatest experience chromebook seems custo... | 2 |
1 | pretty good first launch easy use self explana... | 3 |
2 | brand new 's well optimized however 's missing... | 3 |
3 | great lot potential however lot needs fixed ex... | 3 |
4 | good needs lot functionality example searching... | 3 |
... | ... | ... |
32905 | killed dog mark zuckerburg strangled dog gone | 1 |
32906 | add search | 1 |
32907 | bad twister | 1 |
32908 | yet another trash meta | 1 |
32909 | nothing special copy | 1 |
32910 rows × 2 columns
def visualize(rating):
    rating_df = data[data['rating'] == rating]
    # Count word frequencies
    word_counts = rating_df['processed_text'].str.split().explode().value_counts()
    word_count_df = pd.DataFrame(word_counts).reset_index()
    word_count_df.columns = ['word', 'frequency']
    word_count_df_vis = word_count_df.head(30)
    plt.figure(figsize=(12, 6))
    sns.barplot(x="word", y="frequency", data=word_count_df_vis)
    plt.title(f"Top 30 Words in Reviews with Rating = {rating}")
    plt.xticks(rotation=45)
    plt.show()
visualize(1) # rating = 1
visualize(3) # rating = 3
visualize(5) # rating = 5
From the visualization, it is evident that there's a distinct distribution of words corresponding to the respective ratings. Words like 'good,' 'great,' and 'amazing' predominantly appear in reviews with a rating of 5. Conversely, terms such as 'worse,' 'better,' 'delete,' and 'need' are more prevalent in reviews with a rating of 1.
Dealing with the imbalanced class distribution
To address the uneven distribution across classes, we restructure the rating scale: ratings 3 and 4 are merged into a 'Good' category, rating 5 stands alone as the 'Excellent' category, and ratings 1 and 2 are combined into a 'Below Par' category.
def new_label(point):
    if point == 4 or point == 3:
        return "good"
    elif point == 1 or point == 2:
        return "below par"
    else:
        return "excellent"
data['label'] = data['rating'].apply(lambda r: new_label(r))
data
 | source | review_description | rating | review_date | processed_text | label
---|---|---|---|---|---|---
0 | Google Play | Meh. Not the greatest experience on a Chromebo... | 2 | 2023-07-08 14:18:24 | meh greatest experience chromebook seems custo... | below par |
1 | Google Play | Pretty good for a first launch!! Its easy to u... | 3 | 2023-07-19 20:52:48 | pretty good first launch easy use self explana... | good |
2 | Google Play | For a brand new app, it's very well optimized.... | 3 | 2023-07-06 23:03:11 | brand new 's well optimized however 's missing... | good |
3 | Google Play | Great app with a lot of potential! However, th... | 3 | 2023-07-10 00:53:25 | great lot potential however lot needs fixed ex... | good |
4 | Google Play | The app is good, but it needs a lot of functio... | 3 | 2023-07-06 16:57:43 | good needs lot functionality example searching... | good |
... | ... | ... | ... | ... | ... | ... |
32905 | App Store | This killed my dog. Mark zuckerburg strangled ... | 1 | 2023-07-06 01:23:55 | killed dog mark zuckerburg strangled dog gone | below par |
32906 | App Store | Add Search and hashtag like Twitter ! | 1 | 2023-07-19 08:01:06 | add search | below par |
32907 | App Store | bad twister | 1 | 2023-07-17 06:39:13 | bad twister | below par |
32908 | App Store | Yet another trash from Meta. | 1 | 2023-07-07 17:47:16 | yet another trash meta | below par |
32909 | App Store | Nothing special this app is just a copy of twi... | 1 | 2023-07-07 07:01:43 | nothing special copy | below par |
32910 rows × 6 columns
sns.displot(data, x='label', shrink=0.7, hue='source', multiple="dodge", discrete=True)
plt.title("label distribution")
plt.show()
Although the balance is still not perfect, we will settle for this result for now.
Word Visualization Across the 'Excellent' and 'Below Par' Categories
To understand the linguistic distinctions between top-tier reviews and less favorable ones, we visualize the words frequently used in each category. Using TF-IDF vectors projected down to two dimensions with t-SNE, we map the words onto a 2D plane, offering a snapshot of the linguistic landscape of each category.
The visualization provides a bird's-eye view of word clusters, demonstrating the unique lexical tendencies of each review classification. Words located closely together in the visual space suggest thematic similarity, revealing patterns of praise or criticism that may be indicative of product strengths or areas needing improvement.
Not only does this visualization serve as a testament to the power of data visualization in Natural Language Processing, but it also gives us actionable insights. By understanding the words and themes that resonate most strongly with each label, we're better positioned to gauge customer sentiment and improve product experiences based on this feedback.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import seaborn as sns
num_samples = 50
excellent_samples = data[data['label'] == 'excellent'].sample(num_samples)
below_par_samples = data[data['label'] == 'below par'].sample(num_samples)
combined_samples = pd.concat([excellent_samples, below_par_samples])
# Split the samples into individual words
all_words = []
all_labels = []
for idx, row in combined_samples.iterrows():
    for word in row['processed_text'].split():
        all_words.append(word)
        all_labels.append(row['label'])
# Step 2: Vectorization
vectorizer = TfidfVectorizer(max_features=5000) # limiting to 5000 words for performance reasons
word_vectors = vectorizer.fit_transform(all_words)
# Step 3: Dimensionality Reduction using t-SNE
tsne = TSNE(n_components=2, random_state=0)
word_vectors_2d = tsne.fit_transform(word_vectors.toarray())
# Step 4: Visualization
df_plot = pd.DataFrame(word_vectors_2d, columns=['x', 'y'])
df_plot['label'] = all_labels
df_plot['word'] = all_words
plt.figure(figsize=(20,14))
sns.scatterplot(data=df_plot, x='x', y='y', hue='label', palette="deep", alpha=0.7)
# Annotate points with words
for i, word in enumerate(df_plot['word']):
    plt.annotate(word, (df_plot['x'].iloc[i], df_plot['y'].iloc[i]), fontsize=8, alpha=0.5)
plt.title('Visualization of Word Embeddings using t-SNE')
plt.show()
Let's build our model
data
 | source | review_description | rating | review_date | processed_text | label
---|---|---|---|---|---|---
0 | Google Play | Meh. Not the greatest experience on a Chromebo... | 2 | 2023-07-08 14:18:24 | meh greatest experience chromebook seems custo... | below par |
1 | Google Play | Pretty good for a first launch!! Its easy to u... | 3 | 2023-07-19 20:52:48 | pretty good first launch easy use self explana... | good |
2 | Google Play | For a brand new app, it's very well optimized.... | 3 | 2023-07-06 23:03:11 | brand new 's well optimized however 's missing... | good |
3 | Google Play | Great app with a lot of potential! However, th... | 3 | 2023-07-10 00:53:25 | great lot potential however lot needs fixed ex... | good |
4 | Google Play | The app is good, but it needs a lot of functio... | 3 | 2023-07-06 16:57:43 | good needs lot functionality example searching... | good |
... | ... | ... | ... | ... | ... | ... |
32905 | App Store | This killed my dog. Mark zuckerburg strangled ... | 1 | 2023-07-06 01:23:55 | killed dog mark zuckerburg strangled dog gone | below par |
32906 | App Store | Add Search and hashtag like Twitter ! | 1 | 2023-07-19 08:01:06 | add search | below par |
32907 | App Store | bad twister | 1 | 2023-07-17 06:39:13 | bad twister | below par |
32908 | App Store | Yet another trash from Meta. | 1 | 2023-07-07 17:47:16 | yet another trash meta | below par |
32909 | App Store | Nothing special this app is just a copy of twi... | 1 | 2023-07-07 07:01:43 | nothing special copy | below par |
32910 rows × 6 columns
data_for_model = data[['processed_text', 'label']]
data_for_model
 | processed_text | label
---|---|---
0 | meh greatest experience chromebook seems custo... | below par |
1 | pretty good first launch easy use self explana... | good |
2 | brand new 's well optimized however 's missing... | good |
3 | great lot potential however lot needs fixed ex... | good |
4 | good needs lot functionality example searching... | good |
... | ... | ... |
32905 | killed dog mark zuckerburg strangled dog gone | below par |
32906 | add search | below par |
32907 | bad twister | below par |
32908 | yet another trash meta | below par |
32909 | nothing special copy | below par |
32910 rows × 2 columns
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(list(data_for_model['processed_text']))
feature_values = X.toarray()
feature_names = vectorizer.get_feature_names_out()
X_data = pd.DataFrame(feature_values, columns=feature_names)
print(X_data.sum(axis=0))
000      1.275025
06       1.690821
07       5.078054
10      43.037831
100     23.944039
          ...
ㅎㅎ      0.738098
𝒂𝒏𝒅      0.968459
𝒂𝒑𝒑      1.155050
𝒕𝒐       0.812637
𝚝𝚘       1.000000
Length: 5000, dtype: float64
X_data['label'] = data_for_model['label']
X = X_data.drop('label', axis=1)
y = X_data['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
Logistic Regression
class_weights = {'below par':2, "excellent":1, "good":4}
log_reg = LogisticRegression(max_iter=10000, class_weight=class_weights) # Increasing max_iter for convergence with high dimensional data
# Fit the model
log_reg.fit(X_train, y_train)
# Predicting on the test set
y_pred = log_reg.predict(X_test)
# Printing out the classification report
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

   below par       0.77      0.72      0.74      2310
   excellent       0.79      0.73      0.76      3137
        good       0.37      0.49      0.42      1135

    accuracy                           0.68      6582
   macro avg       0.64      0.64      0.64      6582
weighted avg       0.71      0.68      0.69      6582
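The class weights above are hand-picked. As a sketch (not the notebook's original approach), the same idea can be derived from the training-label frequencies with scikit-learn's "balanced" heuristic, which weights each class by n_samples / (n_classes * class_count):
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

classes = np.unique(y_train)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
balanced_weights = dict(zip(classes, weights))   # weights are inversely proportional to class frequency
print(balanced_weights)

# Passing class_weight='balanced' to LogisticRegression applies the same heuristic internally.
log_reg_balanced = LogisticRegression(max_iter=10000, class_weight=balanced_weights)
log_reg_balanced.fit(X_train, y_train)
print(classification_report(y_test, log_reg_balanced.predict(X_test)))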
Naive Bayes
params = {
    'alpha': [1e-9, 1e-5, 1e-3]
}
NB = MultinomialNB()
grid_search = GridSearchCV(NB, params, cv=5, scoring='f1_micro')
grid_search.fit(X_train, y_train)
# Predicting on the test set
y_pred = grid_search.predict(X_test)
# Printing out the classification report
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

   below par       0.74      0.75      0.75      2310
   excellent       0.75      0.85      0.80      3137
        good       0.47      0.28      0.35      1135

    accuracy                           0.72      6582
   macro avg       0.65      0.63      0.63      6582
weighted avg       0.70      0.72      0.70      6582
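As a quick, illustrative follow-up, it is worth inspecting which alpha the grid search selected; the grid above only contains very small values, so a wider (hypothetical) range with stronger smoothing may also be worth trying:
print("best params:", grid_search.best_params_)
print(f"best CV f1_micro: {grid_search.best_score_:.3f}")

wider_search = GridSearchCV(MultinomialNB(), {'alpha': [1e-3, 1e-2, 1e-1, 1.0]},
                            cv=5, scoring='f1_micro')
# wider_search.fit(X_train, y_train)   # uncomment to compare against the grid above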
A simple (admittedly weak) dense NN for the initial phase; later we will use an LSTM
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_train_onehot = to_categorical(y_train_encoded)
# Encode the Labels for y_test
y_test_encoded = le.transform(y_test)
y_test_onehot = to_categorical(y_test_encoded)
# Convert X_test, X_train to numpy array
X_test_np = X_test.values
X_train_np = X_train.values
# 3. Adjust the Neural Network Architecture
model = Sequential([
    Dense(1000, activation='relu', input_shape=(5000,)),  # Input layer
    Dense(500, activation='relu'),                        # Hidden layer 1
    Dropout(0.5),
    Dense(250, activation='relu'),                        # Hidden layer 2
    Dropout(0.5),
    Dense(3, activation='softmax')                        # Output layer
])
# Compile with Categorical Crossentropy
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()
# Fit the model
history = model.fit(X_train_np, y_train_onehot, epochs=10, batch_size=32, validation_split=0.2)
# Evaluate the model
loss, accuracy = model.evaluate(X_test_np, y_test_onehot)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #
=================================================================
 dense (Dense)               (None, 1000)              5001000
 dense_1 (Dense)             (None, 500)               500500
 dropout (Dropout)           (None, 500)               0
 dense_2 (Dense)             (None, 250)               125250
 dropout_1 (Dropout)         (None, 250)               0
 dense_3 (Dense)             (None, 3)                 753
=================================================================
Total params: 5,627,503
Trainable params: 5,627,503
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
659/659 [==============================] - 9s 5ms/step - loss: 0.7419 - accuracy: 0.7061 - val_loss: 0.6641 - val_accuracy: 0.7398
Epoch 2/10
659/659 [==============================] - 3s 5ms/step - loss: 0.5643 - accuracy: 0.7937 - val_loss: 0.7029 - val_accuracy: 0.7374
Epoch 3/10
659/659 [==============================] - 3s 4ms/step - loss: 0.4310 - accuracy: 0.8523 - val_loss: 0.7831 - val_accuracy: 0.7271
Epoch 4/10
659/659 [==============================] - 3s 4ms/step - loss: 0.3320 - accuracy: 0.8889 - val_loss: 1.0579 - val_accuracy: 0.7112
Epoch 5/10
659/659 [==============================] - 3s 4ms/step - loss: 0.2839 - accuracy: 0.9054 - val_loss: 1.1493 - val_accuracy: 0.7224
Epoch 6/10
659/659 [==============================] - 3s 4ms/step - loss: 0.2640 - accuracy: 0.9112 - val_loss: 1.2902 - val_accuracy: 0.7224
Epoch 7/10
659/659 [==============================] - 3s 4ms/step - loss: 0.2549 - accuracy: 0.9126 - val_loss: 1.4380 - val_accuracy: 0.7252
Epoch 8/10
659/659 [==============================] - 3s 4ms/step - loss: 0.2482 - accuracy: 0.9148 - val_loss: 1.4782 - val_accuracy: 0.7220
Epoch 9/10
659/659 [==============================] - 3s 4ms/step - loss: 0.2427 - accuracy: 0.9168 - val_loss: 1.5080 - val_accuracy: 0.7216
Epoch 10/10
659/659 [==============================] - 3s 4ms/step - loss: 0.2386 - accuracy: 0.9179 - val_loss: 1.5869 - val_accuracy: 0.7235
206/206 [==============================] - 1s 2ms/step - loss: 1.6188 - accuracy: 0.7136
Test Loss: 1.618788242340088
Test Accuracy: 0.7136128544807434
train_accuracy = history.history['accuracy']
val_accuracy = history.history['val_accuracy']
train_loss = history.history['loss']
val_loss = history.history['val_loss']
# Plotting the training and validation accuracy
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(train_accuracy, label='Training Accuracy')
plt.plot(val_accuracy, label='Validation Accuracy')
plt.title('Training and Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
# Plotting the training and validation loss
plt.subplot(1, 2, 2)
plt.plot(train_loss, label='Training Loss')
plt.plot(val_loss, label='Validation Loss')
plt.title('Training and Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.tight_layout()
plt.show()
The neural network was trained for 10 epochs, and the progress across these epochs provides key insights into the model's performance and potential areas of improvement.
Initially, in the first epoch, the model reported a training accuracy of approximately 70.6% and a loss of 0.7419. On the validation split, the accuracy reached 73.98% with a validation loss of 0.6641. This gap between training and validation metrics suggests that the model started out generalizing slightly better on the validation set than it fit the training set.
As training progressed, the training accuracy steadily improved, reaching 91.79% by the 10th epoch, while the corresponding loss decreased to 0.2386. This indicates that the model kept fitting the training data more closely over successive epochs.
In contrast, the validation accuracy peaked at 73.98% in the first epoch and only fluctuated slightly afterwards, settling at 72.35% by the end of the 10th epoch. Meanwhile, the validation loss, initially 0.6641, increased steadily and reached 1.5869 in the last epoch. Rising validation loss combined with stagnant validation accuracy suggests that the model is overfitting to the training data: while it keeps getting better at predicting the training data, it does not generalize any better to new, unseen data.
The overfitting is further evidenced by the gap between the training and validation metrics, especially in the later epochs. As the epochs progressed, the training accuracy continued to improve, while the validation accuracy did not show a similar trend and remained relatively static.
In conclusion, while the neural network performs well on the training data, there are clear signs of overfitting. To improve generalization to unseen data, strategies such as early stopping, stronger regularization, a different architecture, or data augmentation are worth exploring.
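As a minimal sketch of two of those remedies, the snippet below combines early stopping with the LSTM direction promised earlier. It assumes the data DataFrame from this notebook (columns processed_text and label); the vocabulary size, sequence length, and layer sizes are illustrative choices, not tuned values.
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

texts = data['processed_text'].astype(str).tolist()
labels = LabelEncoder().fit_transform(data['label'])          # 'below par' / 'excellent' / 'good' -> 0..2

X_tr, X_te, y_tr, y_te = train_test_split(texts, labels, test_size=0.2,
                                          random_state=42, stratify=labels)

max_tokens, seq_len = 5000, 60                                # vocabulary and sequence-length caps (assumed)
vectorize = tf.keras.layers.TextVectorization(max_tokens=max_tokens,
                                              output_sequence_length=seq_len)
vectorize.adapt(tf.constant(X_tr))                            # learn the vocabulary from training text only
X_tr_ids = vectorize(tf.constant(X_tr)).numpy()               # strings -> padded integer sequences
X_te_ids = vectorize(tf.constant(X_te)).numpy()

lstm_model = tf.keras.Sequential([
    tf.keras.Input(shape=(seq_len,)),
    tf.keras.layers.Embedding(input_dim=max_tokens, output_dim=64, mask_zero=True),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(3, activation='softmax'),
])
lstm_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2,
                                              restore_best_weights=True)
lstm_model.fit(X_tr_ids, y_tr, validation_split=0.2, epochs=10,
               batch_size=64, callbacks=[early_stop])
print(lstm_model.evaluate(X_te_ids, y_te))                    # [test loss, test accuracy]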