Quora Insincere Questions Classification¶

Dataset Link: Quora Insincere Questions Classification¶

In [1]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import nltk
from sklearn.pipeline import Pipeline
from nltk.corpus import stopwords
from string import punctuation 
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from wordcloud import WordCloud, STOPWORDS
from nltk.corpus import stopwords 
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report
nltk.download('stopwords')
nltk.download('punkt')
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/z3534407/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/z3534407/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Out[1]:
True
In [3]:
df_train = pd.read_csv("train.csv")
df_test  = pd.read_csv("test.csv")
In [5]:
df_train.head()
Out[5]:
qid question_text target
0 00002165364db923c7e6 How did Quebec nationalists see their province... 0
1 000032939017120e6e44 Do you have an adopted dog, how would you enco... 0
2 0000412ca6e4628ce2cf Why does velocity affect time? Does velocity a... 0
3 000042bf85aa498cd78e How did Otto von Guericke used the Magdeburg h... 0
4 0000455dfa3e01eae3af Can I convert montra helicon D to a mountain b... 0
In [7]:
df_test.head()
Out[7]:
qid question_text
0 0000163e3ea7c7a74cd7 Why do so many women become so rude and arroga...
1 00002bd4fb5d505b9161 When should I apply for RV college of engineer...
2 00007756b4a147d2b0b3 What is it really like to be a nurse practitio...
3 000086e4b7e1c7146103 Who are entrepreneurs?
4 0000c4c3fbe8785a3090 Is education really making good people nowadays?
In [9]:
df_train.shape, df_test.shape
Out[9]:
((1306122, 3), (375806, 2))
In [11]:
df_train["target"].value_counts()
Out[11]:
target
0    1225312
1      80810
Name: count, dtype: int64
In [13]:
insincere = df_train[df_train["target"] == 1]
sincere   = df_train[df_train["target"] == 0]
In [15]:
question_class = df_train["target"].value_counts()
colors = ["#B92B27", "dodgerblue"]
question_class.plot(kind="bar", color=colors, edgecolor="black")
plt.xlabel("Question Class")
plt.ylabel("Number of Questions")
plt.title("Distribution of Question Classes in Training Data")
for i, count in enumerate(question_class):
    plt.text(i, count + 0.1, str(count), ha="center", va="bottom")
plt.show()
No description has been provided for this image
In [17]:
print(df_train["target"].value_counts())
print(sum(df_train["target"] == 1) / sum(df_train["target"] == 0) * 100, "percent of questions are insincere.")
print(100 - sum(df_train["target"] == 1) / sum(df_train["target"] == 0) * 100, "percent of questions are sincere")
target
0    1225312
1      80810
Name: count, dtype: int64
6.595054973753624 percent of questions are insincere.
93.40494502624638 percent of questions are sincere
In [19]:
print("Original Class Distribution:")
print(df_train["target"].value_counts())

rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(df_train.drop("target", axis=1), df_train["target"])

df_balanced = pd.concat([X_resampled, y_resampled], axis=1)

print("\nBalanced Class Distribution:")
print(df_balanced["target"].value_counts())

insincere_percent = sum(df_balanced["target"] == 1) / len(df_balanced) * 100
sincere_percent = 100 - insincere_percent

print(f"\n{insincere_percent:.2f}% of questions are insincere.")
print(f"{sincere_percent:.2f}% of questions are sincere.")
Original Class Distribution:
target
0    1225312
1      80810
Name: count, dtype: int64

Balanced Class Distribution:
target
0    80810
1    80810
Name: count, dtype: int64

50.00% of questions are insincere.
50.00% of questions are sincere.
In [21]:
df_train = df_balanced.copy()
df_train.shape
Out[21]:
(161620, 3)
In [23]:
stopwords = set(STOPWORDS)
sincere_wordcloud = WordCloud(width=600, height=400).generate(str(sincere["question_text"]))
plt.figure(figsize=(10,8), facecolor="black")
plt.imshow(sincere_wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show();
No description has been provided for this image
In [25]:
stopwords = set(STOPWORDS)
insincere_wordcloud = WordCloud(width=600, height=400).generate(str(insincere["question_text"]))
plt.figure(figsize=(10,8), facecolor="black")
plt.imshow(insincere_wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show();
No description has been provided for this image
In [27]:
df_train["number_words"] = df_train["question_text"].apply(lambda x: len(x.split()))
df_test["number_words"]  = df_test["question_text"].apply(lambda x: len(x.split()))
In [29]:
df_train["num_unique_words"] = df_train["question_text"].apply(lambda x: len(set(str(x).split())))
df_test["num_unique_words"]  = df_test["question_text"].apply(lambda x: len(set(str(x).split())))
In [31]:
df_train["num_chars"] = df_train["question_text"].apply(lambda x: len(str(x)))
df_test["num_chars"]  = df_test["question_text"].apply(lambda x: len(str(x)))
In [33]:
import nltk
from nltk.corpus import stopwords
import pandas as pd
stop_words = set(stopwords.words("english"))
df_train["num_stopwords"] = df_train["question_text"].apply(lambda x : len([nw for nw in str(x).split() if nw.lower() in stop_words]))
df_test["num_stopwords"]  = df_test["question_text"].apply(lambda x : len([nw for nw in str(x).split() if nw.lower() in stop_words]))
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/z3534407/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
In [35]:
df_train["num_punctuation"] = df_train["question_text"].apply(lambda x : len([np for np in str(x) if np in punctuation]))
df_test["num_punctuation"]  = df_test["question_text"].apply(lambda x : len([np for np in str(x) if np in punctuation]))
In [37]:
df_train["num_uppercase"] = df_train["question_text"].apply(lambda x : len([nu for nu in str(x).split() if nu.isupper()]))
df_test["num_uppercase"]  = df_test["question_text"].apply(lambda x : len([nu for nu in str(x).split() if nu.isupper()]))
In [39]:
df_train["num_lowercase"] = df_train["question_text"].apply(lambda x : len([nl for nl in str(x).split() if nl.islower()]))
df_test["num_lowercase"]  = df_test["question_text"].apply(lambda x : len([nl for nl in str(x).split() if nl.islower()]))
In [41]:
df_train["num_title"] = df_train["question_text"].apply(lambda x : len([nl for nl in str(x).split() if nl.istitle()]))
df_test["num_title"]  = df_test["question_text"].apply(lambda x : len([nl for nl in str(x).split() if nl.istitle()]))
In [43]:
df_train[df_train["target"] == 1].describe()
Out[43]:
target number_words num_unique_words num_chars num_stopwords num_punctuation num_uppercase num_lowercase num_title
count 80810.0 80810.000000 80810.000000 80810.000000 80810.000000 80810.000000 80810.000000 80810.000000 80810.000000
mean 1.0 17.277812 16.037594 98.064163 8.000829 2.369905 0.326284 13.919453 2.962826
std 0.0 9.568309 8.153619 55.186227 4.918845 2.906119 0.896822 8.661324 1.971440
min 1.0 1.000000 1.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 1.0 10.000000 10.000000 55.000000 4.000000 1.000000 0.000000 7.000000 2.000000
50% 1.0 15.000000 14.000000 86.000000 7.000000 2.000000 0.000000 12.000000 3.000000
75% 1.0 23.000000 21.000000 130.000000 11.000000 3.000000 0.000000 19.000000 4.000000
max 1.0 64.000000 48.000000 1017.000000 37.000000 411.000000 37.000000 56.000000 37.000000
In [45]:
df_train[df_train["target"] == 0].describe()
Out[45]:
target number_words num_unique_words num_chars num_stopwords num_punctuation num_uppercase num_lowercase num_title
count 80810.0 80810.000000 80810.000000 80810.000000 80810.000000 80810.000000 80810.000000 80810.000000 80810.000000
mean 0.0 12.501609 11.876104 68.821643 6.035132 1.705779 0.462542 10.017188 2.072380
std 0.0 6.727994 5.761206 36.553370 3.600207 1.564554 0.850761 6.145395 1.445841
min 0.0 2.000000 2.000000 11.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.0 8.000000 8.000000 44.000000 4.000000 1.000000 0.000000 6.000000 1.000000
50% 0.0 11.000000 10.000000 59.000000 5.000000 1.000000 0.000000 8.000000 2.000000
75% 0.0 15.000000 14.000000 82.000000 7.000000 2.000000 1.000000 12.000000 3.000000
max 0.0 56.000000 47.000000 263.000000 37.000000 96.000000 16.000000 51.000000 21.000000
In [47]:
fig, ax = plt.subplots(figsize=(12, 10))
colors = ["#B92B27", "dodgerblue"]
sns.set_palette(sns.color_palette(colors))
sns.boxplot(data=df_train, y="number_words", x="target", orient="v", ax=ax)
ax.set(xlabel="Target", ylabel="Number of Words", title="Box Plot of Number of Words According to the Target")
plt.show()
No description has been provided for this image
In [49]:
def text_process(text):
    text = text.lower()
    text = text.translate(str.maketrans("", "", string.punctuation))
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    processed_text = ' '.join(tokens)
    return processed_text
In [51]:
df_train['clean_train'] = df_train["question_text"].apply(text_process)
df_train.head()
Out[51]:
qid question_text target number_words num_unique_words num_chars num_stopwords num_punctuation num_uppercase num_lowercase num_title clean_train
1022714 c86ab618f85e9b7fc374 Is sadism a coping mechanism for people who ar... 0 16 16 94 8 1 0 15 1 sadism coping mechanism people extremely affec...
641364 7d9ea6d66b8866e69240 Is it possible for me as a soul to go outside ... 0 18 18 84 11 1 0 17 1 possible soul go outside body observe universe
1225111 f01982d0cd06aba308ed Do Pet Animal Rescue workers minimize shows of... 0 23 23 143 10 2 0 19 4 pet animal rescue workers minimize shows affec...
1130433 dd8a6b5452a407cea2ac How do you identify a sonnet and what can we i... 0 15 14 71 11 1 0 14 1 identify sonnet infer suggest
1220402 ef30e73bf0a81a06ccf6 Is there a special place in hell for the likes... 0 13 13 63 7 1 0 10 3 special place hell likes genghiz khan
In [53]:
df_test['clean_test'] = df_test["question_text"].apply(text_process)
df_test
Out[53]:
qid question_text number_words num_unique_words num_chars num_stopwords num_punctuation num_uppercase num_lowercase num_title clean_test
0 0000163e3ea7c7a74cd7 Why do so many women become so rude and arroga... 21 19 101 11 1 0 20 1 many women become rude arrogant get little bit...
1 00002bd4fb5d505b9161 When should I apply for RV college of engineer... 30 23 162 17 2 6 22 5 apply rv college engineering bms college engin...
2 00007756b4a147d2b0b3 What is it really like to be a nurse practitio... 10 10 50 6 1 0 9 1 really like nurse practitioner
3 000086e4b7e1c7146103 Who are entrepreneurs? 3 3 22 2 1 0 2 1 entrepreneurs
4 0000c4c3fbe8785a3090 Is education really making good people nowadays? 7 7 48 1 1 0 6 1 education really making good people nowadays
... ... ... ... ... ... ... ... ... ... ... ...
375801 ffff7fa746bd6d6197a9 How many countries listed in gold import in in... 9 8 50 3 1 0 8 1 many countries listed gold import indua
375802 ffffa1be31c43046ab6b Is there an alternative to dresses on formal p... 9 9 53 5 1 0 8 1 alternative dresses formal parties
375803 ffffae173b6ca6bfa563 Where I can find best friendship quotes in Tel... 9 9 50 4 1 1 6 3 find best friendship quotes telugu
375804 ffffb1f7f1a008620287 What are the causes of refraction of light? 8 7 43 5 1 0 7 1 causes refraction light
375805 fffff85473f4699474b0 Climate change is a worrying topic. How much t... 36 33 189 17 7 2 30 5 climate change worrying topic much time left f...

375806 rows × 11 columns

In [55]:
X_train,X_val,Y_train,Y_val = train_test_split(df_train['clean_train'],df_train['target'],test_size=0.2)
X_train.shape,X_val.shape,Y_train.shape,Y_val.shape
Out[55]:
((129296,), (32324,), (129296,), (32324,))
In [57]:
pipeline = Pipeline(
    [
        ("cv",CountVectorizer(analyzer="word",ngram_range=(1,4),max_df=0.9)),
        ("clf",LogisticRegression(solver="saga", class_weight="balanced", C=0.45, max_iter=250, verbose=1))
    ]
)
In [59]:
lr_model = pipeline.fit(X_train,Y_train)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
convergence after 164 epochs took 28 seconds
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   28.4s finished
In [61]:
Y_pred = lr_model.predict(X_val)
print(classification_report(Y_val,Y_pred))
cm     = confusion_matrix(Y_val,Y_pred)
sns.heatmap(cm, cmap="Blues", annot=True, square=True, fmt=".0f");
              precision    recall  f1-score   support

           0       0.86      0.90      0.88     16084
           1       0.90      0.86      0.88     16240

    accuracy                           0.88     32324
   macro avg       0.88      0.88      0.88     32324
weighted avg       0.88      0.88      0.88     32324

No description has been provided for this image
In [63]:
y_pred_final = pipeline.predict(df_test['clean_test'])
y_pred_final
Out[63]:
array([1, 0, 0, ..., 0, 0, 1])
In [65]:
df_sub = pd.DataFrame({"qid":df_test["qid"], "prediction":y_pred_final})
df_sub.to_csv('submission.csv', index=False)
df_sub.head()
Out[65]:
qid prediction
0 0000163e3ea7c7a74cd7 1
1 00002bd4fb5d505b9161 0
2 00007756b4a147d2b0b3 0
3 000086e4b7e1c7146103 0
4 0000c4c3fbe8785a3090 0
In [ ]: