
How Much Do We Love Threads?

An initial analysis with baseline models; we will dive deeper soon

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
/kaggle/input/threads-an-instagram-app-reviews/threads_reviews.csv
In [2]:
!pip install googletrans==4.0.0rc1
Collecting googletrans==4.0.0rc1
  Downloading googletrans-4.0.0rc1.tar.gz (20 kB)
  Preparing metadata (setup.py) ... done
Collecting httpx==0.13.3 (from googletrans==4.0.0rc1)
  Downloading httpx-0.13.3-py3-none-any.whl (55 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 55.1/55.1 kB 988.0 kB/s eta 0:00:00
Requirement already satisfied: certifi in /opt/conda/lib/python3.10/site-packages (from httpx==0.13.3->googletrans==4.0.0rc1) (2023.5.7)
Collecting hstspreload (from httpx==0.13.3->googletrans==4.0.0rc1)
  Downloading hstspreload-2024.8.1-py3-none-any.whl (1.2 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.2/1.2 MB 7.9 MB/s eta 0:00:00
Requirement already satisfied: sniffio in /opt/conda/lib/python3.10/site-packages (from httpx==0.13.3->googletrans==4.0.0rc1) (1.3.0)
Collecting chardet==3.* (from httpx==0.13.3->googletrans==4.0.0rc1)
  Downloading chardet-3.0.4-py2.py3-none-any.whl (133 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 133.4/133.4 kB 12.2 MB/s eta 0:00:00
Collecting idna==2.* (from httpx==0.13.3->googletrans==4.0.0rc1)
  Downloading idna-2.10-py2.py3-none-any.whl (58 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 58.8/58.8 kB 4.8 MB/s eta 0:00:00
Collecting rfc3986<2,>=1.3 (from httpx==0.13.3->googletrans==4.0.0rc1)
  Downloading rfc3986-1.5.0-py2.py3-none-any.whl (31 kB)
Collecting httpcore==0.9.* (from httpx==0.13.3->googletrans==4.0.0rc1)
  Downloading httpcore-0.9.1-py3-none-any.whl (42 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 42.6/42.6 kB 3.3 MB/s eta 0:00:00
Collecting h11<0.10,>=0.8 (from httpcore==0.9.*->httpx==0.13.3->googletrans==4.0.0rc1)
  Downloading h11-0.9.0-py2.py3-none-any.whl (53 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 53.6/53.6 kB 4.9 MB/s eta 0:00:00
Collecting h2==3.* (from httpcore==0.9.*->httpx==0.13.3->googletrans==4.0.0rc1)
  Downloading h2-3.2.0-py2.py3-none-any.whl (65 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 65.0/65.0 kB 6.2 MB/s eta 0:00:00
Collecting hyperframe<6,>=5.2.0 (from h2==3.*->httpcore==0.9.*->httpx==0.13.3->googletrans==4.0.0rc1)
  Downloading hyperframe-5.2.0-py2.py3-none-any.whl (12 kB)
Collecting hpack<4,>=3.0 (from h2==3.*->httpcore==0.9.*->httpx==0.13.3->googletrans==4.0.0rc1)
  Downloading hpack-3.0.0-py2.py3-none-any.whl (38 kB)
Building wheels for collected packages: googletrans
  Building wheel for googletrans (setup.py) ... done
  Created wheel for googletrans: filename=googletrans-4.0.0rc1-py3-none-any.whl size=17414 sha256=384316c1ffdc4d18fd046961935b5dc8e79c46922d8dca6a319aae272414d669
  Stored in directory: /root/.cache/pip/wheels/c0/59/9f/7372f0cf70160fe61b528532e1a7c8498c4becd6bcffb022de
Successfully built googletrans
Installing collected packages: rfc3986, hyperframe, hpack, h11, chardet, idna, hstspreload, h2, httpcore, httpx, googletrans
  Attempting uninstall: h11
    Found existing installation: h11 0.14.0
    Uninstalling h11-0.14.0:
      Successfully uninstalled h11-0.14.0
  Attempting uninstall: idna
    Found existing installation: idna 3.4
    Uninstalling idna-3.4:
      Successfully uninstalled idna-3.4
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
jupyterlab-lsp 4.2.0 requires jupyter-lsp>=2.0.0, but you have jupyter-lsp 1.5.1 which is incompatible.
ydata-profiling 4.3.1 requires scipy<1.11,>=1.4.1, but you have scipy 1.11.1 which is incompatible.
Successfully installed chardet-3.0.4 googletrans-4.0.0rc1 h11-0.9.0 h2-3.2.0 hpack-3.0.0 hstspreload-2024.8.1 httpcore-0.9.1 httpx-0.13.3 hyperframe-5.2.0 idna-2.10 rfc3986-1.5.0
In [3]:
!pip install emot
Collecting emot
  Downloading emot-3.1-py3-none-any.whl (61 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 61.5/61.5 kB 640.5 kB/s eta 0:00:00
Installing collected packages: emot
Successfully installed emot-3.1
In [4]:
!pip install emoji
Requirement already satisfied: emoji in /opt/conda/lib/python3.10/site-packages (2.6.0)
In [5]:
# # import emot
# # text = "hey🤣"

# # emot_object = emot.core.emot()
# # emot_object.emoji(text)
# # to_be_rep = emot_object.emoji(text)['value'][0]
# # rep_to = emot_object.emoji(text)['mean'][0]
# # re.sub(rf"{to_be_rep}", rep_to, text)


# import emoji

# text = 'I love Python ❤️, it is brilliant 👍'
# text2 = ":red_heartface_blowing_a_kissheart_with_ribbonsmiling_face_with_hearts:"
# print(emoji.demojize(text))
# print(emoji.emojize(text2))
In [6]:
import warnings
warnings.filterwarnings("ignore")

# -------------------------------------------------------
# For text analysis
# import spacy 
# nlp = spacy.load("en_core_web_sm")

import re # for regular expression
import string
from googletrans import Translator, LANGUAGES

# Initialize the translator
translator = Translator()

puncs = string.punctuation
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
porter_stem = PorterStemmer()
stops = list(set(stopwords.words('english'))) # set of stopwords from the NLTK tool
defined_stop_words = ['app', 'instagram', 'twitter', 'facebook', 'account', 'profile', 'follower', 'followers',
                      'tweet', 'post', 'like', 'status', 'notification', 'comment', 'video', 'picture',
                      'tag', 'hashtag', 'story', 'dm', 'message', 'mention', 'threads',
                      'application', 'insta', 'whatsapp', 'user', "n't", 'could', 'may', 'must', "i", "followe",
                      "people", "see", "posts"]

defined_stop_words+= list(puncs)

stops += defined_stop_words

import emot
emot_object = emot.core.emot()
import emoji
# -------------------------------------------------------

import random
# -------------------------------------------------------

# for data visualization
import matplotlib.pyplot as plt 
import seaborn as sns
# -------------------------------------------------------
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC


import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
In [7]:
%time
data = pd.read_csv("/kaggle/input/threads-an-instagram-app-reviews/threads_reviews.csv")
display(data.head())
print(f"Number of data points= {data.shape[0]}\nNumber of features= {data.shape[1]}")
CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 7.63 µs
source review_description rating review_date
0 Google Play Meh. Not the greatest experience on a Chromebo... 2 2023-07-08 14:18:24
1 Google Play Pretty good for a first launch!! Its easy to u... 3 2023-07-19 20:52:48
2 Google Play For a brand new app, it's very well optimized.... 3 2023-07-06 23:03:11
3 Google Play Great app with a lot of potential! However, th... 3 2023-07-10 00:53:25
4 Google Play The app is good, but it needs a lot of functio... 3 2023-07-06 16:57:43
Number of data points= 32910
Number of features= 4

Insights into our dataset

In [8]:
data.shape
Out[8]:
(32910, 4)
In [9]:
# let's get insight into some randomly sampled data points
indices = random.sample(range(data.shape[0]), k=10)

for index in indices:
    print("Text:", data['review_description'][index])
    print("rating: ", data['rating'][index])
    print("-"*100)
Text: it has the potential to be way better than twitter (which to be fair, it’s not that hard with how it’s current state is under elon) but there are some features that twitter has that are useful that aren’t on threads, for example an indicator on one’s profile saying if they are following you or not, also dms, polls, and many more options that can come in future updates. i feel like it would’ve been smarter to get more features like those in before release date
rating:  4
----------------------------------------------------------------------------------------------------
Text: Nice 🙏🙏🙏
rating:  5
----------------------------------------------------------------------------------------------------
Text: Nicee
rating:  4
----------------------------------------------------------------------------------------------------
Text: So aku orang Malaysia pertama yang comment kat thread ni? 🇲🇾 ( I'm 1st Malaysian to comment and review here) ❗Subscribe my youtube channel (FakFitness) 😂
rating:  5
----------------------------------------------------------------------------------------------------
Text: Hashtags, please!!
rating:  4
----------------------------------------------------------------------------------------------------
Text: Fix the gallery selections
rating:  1
----------------------------------------------------------------------------------------------------
Text: ऐलन मस्क तो गया अब 😆😆😆😆
rating:  5
----------------------------------------------------------------------------------------------------
Text: Normal
rating:  4
----------------------------------------------------------------------------------------------------
Text: It's new give experience
rating:  5
----------------------------------------------------------------------------------------------------
Text: Best app
rating:  5
----------------------------------------------------------------------------------------------------
In [10]:
data.describe().T
Out[10]:
count mean std min 25% 50% 75% max
rating 32910.0 3.398481 1.75148 1.0 1.0 4.0 5.0 5.0

Ratings histogram to get a feel for the distribution

In [11]:
sns.displot(data, x='rating', shrink=0.7, hue='source', multiple="dodge", discrete=True)
plt.title("Ratings distribution")
plt.show()
[Figure: ratings distribution histogram, grouped by source]
In [12]:
word_counts = data['review_description'].str.split().explode().value_counts()

word_count_df = pd.DataFrame(word_counts).reset_index()
word_count_df.columns = ['word', 'frequency']
word_count_df.head(15)
Out[12]:
word frequency
0 to 10563
1 I 9707
2 the 9190
3 app 8076
4 and 7565
5 is 6823
6 a 6740
7 it 5584
8 of 4734
9 this 4076
10 for 3067
11 but 2983
12 you 2954
13 Twitter 2947
14 my 2883
As we can see above, the most frequent words are stopwords, which we have to remove to get better insight.
In [13]:
def translate_to_english(text):
    # helper for translating non-English reviews to English (defined here but not applied below)
    translated = translator.translate(text, dest='en')
    return translated.text


def text_process(text):
    text = text.lower() # case folding
    text_tokens = nltk.word_tokenize(text) # tokenization
    text_no_stop = [token for token in text_tokens if token not in stops] # stop-word removal
    text_stemmed = [porter_stem.stem(token) for token in text_no_stop] # stemming (note: the stemmed tokens are not used below, so the output keeps the unstemmed forms)
    text = " ".join(text_no_stop)
    text = re.sub("-", " ", text) # replace hyphens with spaces
    #text = emoji.demojize(text) # give the emojis a textual meaning; comment this line out when visualizing individual words in the t-SNE scatter plot
    text = re.sub(":", "", text) # drop colons left over from demojized emoji names

    return text


data['processed_text'] = data['review_description'].apply(text_process)
In [14]:
data
Out[14]:
source review_description rating review_date processed_text
0 Google Play Meh. Not the greatest experience on a Chromebo... 2 2023-07-08 14:18:24 meh greatest experience chromebook seems custo...
1 Google Play Pretty good for a first launch!! Its easy to u... 3 2023-07-19 20:52:48 pretty good first launch easy use self explana...
2 Google Play For a brand new app, it's very well optimized.... 3 2023-07-06 23:03:11 brand new 's well optimized however 's missing...
3 Google Play Great app with a lot of potential! However, th... 3 2023-07-10 00:53:25 great lot potential however lot needs fixed ex...
4 Google Play The app is good, but it needs a lot of functio... 3 2023-07-06 16:57:43 good needs lot functionality example searching...
... ... ... ... ... ...
32905 App Store This killed my dog. Mark zuckerburg strangled ... 1 2023-07-06 01:23:55 killed dog mark zuckerburg strangled dog gone
32906 App Store Add Search and hashtag like Twitter ! 1 2023-07-19 08:01:06 add search
32907 App Store bad twister 1 2023-07-17 06:39:13 bad twister
32908 App Store Yet another trash from Meta. 1 2023-07-07 17:47:16 yet another trash meta
32909 App Store Nothing special this app is just a copy of twi... 1 2023-07-07 07:01:43 nothing special copy

32910 rows × 5 columns

In [15]:
word_counts = data['processed_text'].str.split().explode().value_counts()

word_count_df = pd.DataFrame(word_counts).reset_index()
word_count_df.columns = ['word', 'frequency']
word_count_df.head(10)
Out[15]:
word frequency
0 good 4343
1 's 3737
2 ’ 2705
3 nice 2233
4 better 1761
5 ... 1689
6 use 1407
7 follow 1369
8 ca 1322
9 great 1229
In [16]:
# let's get insight into some randomly sampled processed data points
indices = random.sample(range(data.shape[0]), k=10)

for index in indices:
    print("Text:", data['processed_text'][index])
    print("rating: ", data['rating'][index])
    print("-"*100)
Text: posted one day everything stopped working go someone else ’ also says “ content available ” ’ tried many things fix ’ fixed ’ point since ’ working even delete 👍🏻
rating:  1
----------------------------------------------------------------------------------------------------
Text: good
rating:  5
----------------------------------------------------------------------------------------------------
Text: good 👌👌👌👌😊😊
rating:  5
----------------------------------------------------------------------------------------------------
Text: totally insane freedom expression meta products also leaks data bad recommended
rating:  1
----------------------------------------------------------------------------------------------------
Text: great
rating:  4
----------------------------------------------------------------------------------------------------
Text: 
rating:  4
----------------------------------------------------------------------------------------------------
Text: ca open others think still better ui perfect still better fix interface unblock andrew tate use
rating:  1
----------------------------------------------------------------------------------------------------
Text: 's great
rating:  5
----------------------------------------------------------------------------------------------------
Text: need
rating:  1
----------------------------------------------------------------------------------------------------
Text: rubbish properly optimized
rating:  1
----------------------------------------------------------------------------------------------------

Let's see the word distribution for each rating

In [17]:
data[['processed_text','rating']]
Out[17]:
processed_text rating
0 meh greatest experience chromebook seems custo... 2
1 pretty good first launch easy use self explana... 3
2 brand new 's well optimized however 's missing... 3
3 great lot potential however lot needs fixed ex... 3
4 good needs lot functionality example searching... 3
... ... ...
32905 killed dog mark zuckerburg strangled dog gone 1
32906 add search 1
32907 bad twister 1
32908 yet another trash meta 1
32909 nothing special copy 1

32910 rows × 2 columns

In [18]:
def visualize(rating):
    rating_df = data[data['rating'] == rating]
    
    # Count word frequencies
    word_counts = rating_df['processed_text'].str.split().explode().value_counts()

    word_count_df = pd.DataFrame(word_counts).reset_index()
    word_count_df.columns = ['word', 'frequency']
    word_count_df_vis = word_count_df.head(30)

    plt.figure(figsize=(12,6))
    sns.barplot(x="word", y="frequency", data=word_count_df_vis)
    plt.title(f"Top 10 Words in Reviews with Rating = {rating}")
    plt.xticks(rotation=45)
    plt.show()

visualize(1) # rating = 1
visualize(3) # rating = 3
visualize(5) # rating = 5
[Figures: top-30 word frequency bar charts for reviews with rating 1, 3, and 5]

The visualization shows a distinct word distribution for each rating. Words like 'good', 'great', and 'amazing' predominantly appear in reviews with a rating of 5, while terms such as 'worse', 'better', 'delete', and 'need' are more prevalent in reviews with a rating of 1.

Dealing with the imbalanced class distribution

To rectify the uneven distribution across the categories, we restructure the rating scale: ratings 3 and 4 are merged into the 'good' category, rating 5 stands alone as the 'excellent' category, and ratings 1 and 2 are combined into the 'below par' category.

In [19]:
def new_label(point):
    if point == 4 or point == 3:
        return "good"
    elif point == 1 or point == 2:
        return "below par"
    else:
        return "excellent"

data['label'] = data['rating'].apply(lambda r: new_label(r))
data
Out[19]:
source review_description rating review_date processed_text label
0 Google Play Meh. Not the greatest experience on a Chromebo... 2 2023-07-08 14:18:24 meh greatest experience chromebook seems custo... below par
1 Google Play Pretty good for a first launch!! Its easy to u... 3 2023-07-19 20:52:48 pretty good first launch easy use self explana... good
2 Google Play For a brand new app, it's very well optimized.... 3 2023-07-06 23:03:11 brand new 's well optimized however 's missing... good
3 Google Play Great app with a lot of potential! However, th... 3 2023-07-10 00:53:25 great lot potential however lot needs fixed ex... good
4 Google Play The app is good, but it needs a lot of functio... 3 2023-07-06 16:57:43 good needs lot functionality example searching... good
... ... ... ... ... ... ...
32905 App Store This killed my dog. Mark zuckerburg strangled ... 1 2023-07-06 01:23:55 killed dog mark zuckerburg strangled dog gone below par
32906 App Store Add Search and hashtag like Twitter ! 1 2023-07-19 08:01:06 add search below par
32907 App Store bad twister 1 2023-07-17 06:39:13 bad twister below par
32908 App Store Yet another trash from Meta. 1 2023-07-07 17:47:16 yet another trash meta below par
32909 App Store Nothing special this app is just a copy of twi... 1 2023-07-07 07:01:43 nothing special copy below par

32910 rows × 6 columns

In [20]:
sns.displot(data, x='label', shrink=0.7, hue='source', multiple="dodge", discrete=True)
plt.title("Label distribution")
plt.show()
[Figure: label distribution histogram, grouped by source]

Although the balance is still not perfect, we will settle for this result for now.
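
To quantify the remaining imbalance, a quick check of the label shares; this is a minimal snippet that only uses the data DataFrame built above:

# Share of each label overall, and split by store, to quantify the remaining imbalance
print(data['label'].value_counts(normalize=True).round(3))
print(data.groupby('source')['label'].value_counts(normalize=True).round(3))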

Word Visualization Across the 'Excellent' and 'Below Par' Categories

To understand the linguistic differences between top-tier reviews and less favorable ones, we visualize the words frequently used in each category. We map high-dimensional word vectors onto a 2D plane with t-SNE, offering a snapshot of the vocabulary characteristic of each category.

The visualization gives a bird's-eye view of word clusters and the lexical tendencies of each review class. Words located close together in the plot suggest thematic similarity, revealing patterns of praise or criticism that may point to product strengths or areas needing improvement.

Beyond illustrating the value of visualization in Natural Language Processing, this view yields actionable insight: by understanding which words and themes dominate each label, we are better positioned to gauge customer sentiment and improve the product based on this feedback.

In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE
import seaborn as sns


num_samples = 50

excellent_samples = data[data['label'] == 'excellent'].sample(num_samples)
below_par_samples = data[data['label'] == 'below par'].sample(num_samples)

combined_samples = pd.concat([excellent_samples, below_par_samples])

# Split the samples into individual words
all_words = []
all_labels = []
for idx, row in combined_samples.iterrows():
    for word in row['processed_text'].split():
        all_words.append(word)
        all_labels.append(row['label'])

# Step 2: Vectorization
vectorizer = TfidfVectorizer(max_features=5000)  # limiting to 5000 words for performance reasons
word_vectors = vectorizer.fit_transform(all_words)

# Step 3: Dimensionality Reduction using t-SNE
tsne = TSNE(n_components=2, random_state=0)
word_vectors_2d = tsne.fit_transform(word_vectors.toarray())

# Step 4: Visualization
df_plot = pd.DataFrame(word_vectors_2d, columns=['x', 'y'])
df_plot['label'] = all_labels
df_plot['word'] = all_words

plt.figure(figsize=(20,14))
sns.scatterplot(data=df_plot, x='x', y='y', hue='label', palette="deep", alpha=0.7)

# Annotate points with words
for i, word in enumerate(df_plot['word']):
    plt.annotate(word, (df_plot['x'].iloc[i], df_plot['y'].iloc[i]), fontsize=8, alpha=0.5)

plt.title('Visualization of Word Embeddings using t-SNE')
plt.show()
[Figure: t-SNE scatter plot of word vectors, colored by 'excellent' vs. 'below par' label]

Let's build our models

In [22]:
data
Out[22]:
source review_description rating review_date processed_text label
0 Google Play Meh. Not the greatest experience on a Chromebo... 2 2023-07-08 14:18:24 meh greatest experience chromebook seems custo... below par
1 Google Play Pretty good for a first launch!! Its easy to u... 3 2023-07-19 20:52:48 pretty good first launch easy use self explana... good
2 Google Play For a brand new app, it's very well optimized.... 3 2023-07-06 23:03:11 brand new 's well optimized however 's missing... good
3 Google Play Great app with a lot of potential! However, th... 3 2023-07-10 00:53:25 great lot potential however lot needs fixed ex... good
4 Google Play The app is good, but it needs a lot of functio... 3 2023-07-06 16:57:43 good needs lot functionality example searching... good
... ... ... ... ... ... ...
32905 App Store This killed my dog. Mark zuckerburg strangled ... 1 2023-07-06 01:23:55 killed dog mark zuckerburg strangled dog gone below par
32906 App Store Add Search and hashtag like Twitter ! 1 2023-07-19 08:01:06 add search below par
32907 App Store bad twister 1 2023-07-17 06:39:13 bad twister below par
32908 App Store Yet another trash from Meta. 1 2023-07-07 17:47:16 yet another trash meta below par
32909 App Store Nothing special this app is just a copy of twi... 1 2023-07-07 07:01:43 nothing special copy below par

32910 rows × 6 columns

In [23]:
data_for_model = data[['processed_text', 'label']]
data_for_model
Out[23]:
processed_text label
0 meh greatest experience chromebook seems custo... below par
1 pretty good first launch easy use self explana... good
2 brand new 's well optimized however 's missing... good
3 great lot potential however lot needs fixed ex... good
4 good needs lot functionality example searching... good
... ... ...
32905 killed dog mark zuckerburg strangled dog gone below par
32906 add search below par
32907 bad twister below par
32908 yet another trash meta below par
32909 nothing special copy below par

32910 rows × 2 columns

In [24]:
vectorizer = TfidfVectorizer(max_features=5000)
# Note: the vectorizer is fitted on the full corpus here, before the train/test split,
# so document-frequency statistics from the test reviews leak into the features.
X = vectorizer.fit_transform(data_for_model['processed_text'])

feature_values = X.toarray()
feature_names = vectorizer.get_feature_names_out()

X_data = pd.DataFrame(feature_values, columns=feature_names)
print(X_data.sum(axis=0))
000     1.275025
06      1.690821
07      5.078054
10     43.037831
100    23.944039
         ...    
ㅎㅎ      0.738098
𝒂𝒏𝒅     0.968459
𝒂𝒑𝒑     1.155050
𝒕𝒐      0.812637
𝚝𝚘      1.000000
Length: 5000, dtype: float64
In [25]:
X_data['label'] = data_for_model['label']

X = X_data.drop('label', axis=1)
y = X_data['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Logistic Regression

In [26]:
class_weights = {'below par':2, "excellent":1, "good":4}
log_reg = LogisticRegression(max_iter=10000, class_weight=class_weights)  # Increasing max_iter for convergence with high dimensional data

# Fit the model
log_reg.fit(X_train, y_train)

# Predicting on the test set
y_pred = log_reg.predict(X_test)

# Printing out the classification report
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

   below par       0.77      0.72      0.74      2310
   excellent       0.79      0.73      0.76      3137
        good       0.37      0.49      0.42      1135

    accuracy                           0.68      6582
   macro avg       0.64      0.64      0.64      6582
weighted avg       0.71      0.68      0.69      6582
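
The class weights above were picked by hand. As a side note, and assuming the same y_train as above, scikit-learn can instead derive weights directly from the label frequencies; a minimal sketch:

import numpy as np
from sklearn.utils.class_weight import compute_class_weight

# 'balanced' weights are inversely proportional to each label's frequency in y_train
classes = np.unique(y_train)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
balanced_weights = dict(zip(classes, weights))
print(balanced_weights)

# These could be passed straight to the classifier:
# LogisticRegression(max_iter=10000, class_weight=balanced_weights)

Passing class_weight='balanced' to LogisticRegression directly achieves the same effect internally.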

Naive Bayes

In [27]:
params = {
    'alpha' : [1e-9, 1e-5, 1e-3]
}

NB = MultinomialNB()
grid_search = GridSearchCV(NB, params, cv=5, scoring='f1_micro')

grid_search.fit(X_train, y_train)

# Predicting on the test set
y_pred = grid_search.predict(X_test)

# Printing out the classification report
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

   below par       0.74      0.75      0.75      2310
   excellent       0.75      0.85      0.80      3137
        good       0.47      0.28      0.35      1135

    accuracy                           0.72      6582
   macro avg       0.65      0.63      0.63      6582
weighted avg       0.70      0.72      0.70      6582

A simple (admittedly weak) dense NN for the initial phase; later we will use an LSTM

In [28]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout


# Encode the string labels as integers, then as one-hot vectors
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_train_onehot = to_categorical(y_train_encoded)

# Encode the labels for y_test
y_test_encoded = le.transform(y_test)
y_test_onehot = to_categorical(y_test_encoded)

# Convert X_train and X_test to numpy arrays
X_test_np = X_test.values
X_train_np = X_train.values

# Define the neural network architecture
model = Sequential([
    Dense(1000, activation='relu', input_shape=(5000,)),   # input layer
    Dense(500, activation='relu'),                          # hidden layer 1
    Dropout(0.5),
    Dense(250, activation='relu'),                          # hidden layer 2
    Dropout(0.5),
    Dense(3, activation='softmax')                          # output layer
])

# Compile with categorical cross-entropy
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

model.summary()

# Fit the model
history = model.fit(X_train_np, y_train_onehot, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate the model
loss, accuracy = model.evaluate(X_test_np, y_test_onehot)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 dense (Dense)               (None, 1000)              5001000   
                                                                 
 dense_1 (Dense)             (None, 500)               500500    
                                                                 
 dropout (Dropout)           (None, 500)               0         
                                                                 
 dense_2 (Dense)             (None, 250)               125250    
                                                                 
 dropout_1 (Dropout)         (None, 250)               0         
                                                                 
 dense_3 (Dense)             (None, 3)                 753       
                                                                 
=================================================================
Total params: 5,627,503
Trainable params: 5,627,503
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
659/659 [==============================] - 9s 5ms/step - loss: 0.7419 - accuracy: 0.7061 - val_loss: 0.6641 - val_accuracy: 0.7398
Epoch 2/10
659/659 [==============================] - 3s 5ms/step - loss: 0.5643 - accuracy: 0.7937 - val_loss: 0.7029 - val_accuracy: 0.7374
Epoch 3/10
659/659 [==============================] - 3s 4ms/step - loss: 0.4310 - accuracy: 0.8523 - val_loss: 0.7831 - val_accuracy: 0.7271
Epoch 4/10
659/659 [==============================] - 3s 4ms/step - loss: 0.3320 - accuracy: 0.8889 - val_loss: 1.0579 - val_accuracy: 0.7112
Epoch 5/10
659/659 [==============================] - 3s 4ms/step - loss: 0.2839 - accuracy: 0.9054 - val_loss: 1.1493 - val_accuracy: 0.7224
Epoch 6/10
659/659 [==============================] - 3s 4ms/step - loss: 0.2640 - accuracy: 0.9112 - val_loss: 1.2902 - val_accuracy: 0.7224
Epoch 7/10
659/659 [==============================] - 3s 4ms/step - loss: 0.2549 - accuracy: 0.9126 - val_loss: 1.4380 - val_accuracy: 0.7252
Epoch 8/10
659/659 [==============================] - 3s 4ms/step - loss: 0.2482 - accuracy: 0.9148 - val_loss: 1.4782 - val_accuracy: 0.7220
Epoch 9/10
659/659 [==============================] - 3s 4ms/step - loss: 0.2427 - accuracy: 0.9168 - val_loss: 1.5080 - val_accuracy: 0.7216
Epoch 10/10
659/659 [==============================] - 3s 4ms/step - loss: 0.2386 - accuracy: 0.9179 - val_loss: 1.5869 - val_accuracy: 0.7235
206/206 [==============================] - 1s 2ms/step - loss: 1.6188 - accuracy: 0.7136
Test Loss: 1.618788242340088
Test Accuracy: 0.7136128544807434
In [29]:
train_accuracy = history.history['accuracy']
val_accuracy = history.history['val_accuracy']

train_loss = history.history['loss']
val_loss = history.history['val_loss']

# Plotting the training and validation accuracy
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.plot(train_accuracy, label='Training Accuracy')
plt.plot(val_accuracy, label='Validation Accuracy')
plt.title('Training and Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

# Plotting the training and validation loss
plt.subplot(1, 2, 2)
plt.plot(train_loss, label='Training Loss')
plt.plot(val_loss, label='Validation Loss')
plt.title('Training and Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

plt.tight_layout()
plt.show()
[Figure: training vs. validation accuracy (left) and loss (right) across epochs]

The neural network was trained for 10 epochs, and its progress across those epochs gives a clear picture of the model's behaviour and of where it can be improved.

In the first epoch, the model reported a training accuracy of roughly 70.6% with a loss of 0.74, while on the validation set it reached 74.0% accuracy with a loss of 0.66. Validation metrics slightly ahead of the training metrics at this stage are not unusual: dropout is only active during training, and the training accuracy is averaged over an epoch in which the model is still improving.

As training progressed, training accuracy climbed steadily to 91.8% by the 10th epoch while the training loss fell to 0.24, showing that the model kept fitting the training data ever more closely.

In contrast, the validation accuracy peaked at about 74.0% in the first epoch, fluctuated slightly afterwards, and ended the 10th epoch at 72.4%, while the validation loss rose from 0.66 to 1.59. A rising validation loss combined with a stagnant validation accuracy is a classic sign of overfitting: the model keeps getting better at predicting the training data but does not generalize any better to unseen data, which the test accuracy of about 71.4% confirms.

The overfitting is further evidenced by the widening gap between training and validation metrics in the later epochs: training accuracy kept improving while validation accuracy stayed essentially flat.

In conclusion, while the network performs well on the training data, there are clear signs of overfitting. To improve generalization on unseen data, strategies such as stronger regularization, early stopping, a smaller or different architecture, or data augmentation could be explored.
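
As a rough sketch of two of these ideas (early stopping plus L2 regularization with a smaller network), assuming the same X_train_np / y_train_onehot arrays as above; the layer sizes and penalty strength are illustrative, not tuned values:

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping

# Smaller network with L2 weight penalties in addition to dropout
regularized_model = Sequential([
    Dense(256, activation='relu', input_shape=(5000,), kernel_regularizer=l2(1e-4)),
    Dropout(0.5),
    Dense(64, activation='relu', kernel_regularizer=l2(1e-4)),
    Dropout(0.5),
    Dense(3, activation='softmax')
])
regularized_model.compile(optimizer='adam',
                          loss='categorical_crossentropy',
                          metrics=['accuracy'])

# Stop training once the validation loss stops improving and keep the best weights
early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
# regularized_model.fit(X_train_np, y_train_onehot, epochs=20, batch_size=32,
#                       validation_split=0.2, callbacks=[early_stop])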

Soon we are going to explore changes and improvements, as well as more advanced network architectures.

We will also try other methods of text representation besides the TF-IDF approach.
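
As a preview of that direction, here is a minimal, untuned sketch: learned embeddings feeding an LSTM instead of TF-IDF vectors feeding dense layers. The vocabulary size, sequence length, and layer sizes are illustrative assumptions, and encoded_labels stands for a hypothetical integer-encoded version of the label column:

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Turn each review into a fixed-length sequence of word indices
tokenizer = Tokenizer(num_words=5000, oov_token="<unk>")
tokenizer.fit_on_texts(data['processed_text'])
sequences = pad_sequences(tokenizer.texts_to_sequences(data['processed_text']), maxlen=60)

lstm_model = Sequential([
    Embedding(input_dim=5000, output_dim=64),   # learned word embeddings
    LSTM(64),                                   # sequence model over the review
    Dense(3, activation='softmax')              # 3 sentiment classes
])
lstm_model.compile(optimizer='adam',
                   loss='sparse_categorical_crossentropy',
                   metrics=['accuracy'])
# lstm_model.fit(sequences, encoded_labels, epochs=5, validation_split=0.2)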

Stay tuned :)