decision trees¶

In [75]:
import pandas as pd
df = pd.read_csv('wine.csv')
df.info()
df.head(111)  # preview the first 111 rows
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Wine                  178 non-null    int64  
 1   Alcohol               178 non-null    float64
 2   Malic.acid            178 non-null    float64
 3   Ash                   178 non-null    float64
 4   Acl                   178 non-null    float64
 5   Mg                    178 non-null    int64  
 6   Phenols               178 non-null    float64
 7   Flavanoids            178 non-null    float64
 8   Nonflavanoid.phenols  178 non-null    float64
 9   Proanth               178 non-null    float64
 10  Color.int             178 non-null    float64
 11  Hue                   178 non-null    float64
 12  OD                    178 non-null    float64
 13  Proline               178 non-null    int64  
dtypes: float64(11), int64(3)
memory usage: 19.6 KB
Out[75]:
     Wine  Alcohol  Malic.acid   Ash   Acl   Mg  Phenols  Flavanoids  Nonflavanoid.phenols  Proanth  Color.int   Hue    OD  Proline
0       1    14.23        1.71  2.43  15.6  127     2.80        3.06                  0.28     2.29       5.64  1.04  3.92     1065
1       1    13.20        1.78  2.14  11.2  100     2.65        2.76                  0.26     1.28       4.38  1.05  3.40     1050
2       1    13.16        2.36  2.67  18.6  101     2.80        3.24                  0.30     2.81       5.68  1.03  3.17     1185
3       1    14.37        1.95  2.50  16.8  113     3.85        3.49                  0.24     2.18       7.80  0.86  3.45     1480
4       1    13.24        2.59  2.87  21.0  118     2.80        2.69                  0.39     1.82       4.32  1.04  2.93      735
..    ...      ...         ...   ...   ...  ...      ...         ...                   ...      ...        ...   ...   ...      ...
106     2    12.25        1.73  2.12  19.0   80     1.65        2.03                  0.37     1.63       3.40  1.00  3.17      510
107     2    12.72        1.75  2.28  22.5   84     1.38        1.76                  0.48     1.63       3.30  0.88  2.42      488
108     2    12.22        1.29  1.94  19.0   92     2.36        2.04                  0.39     2.08       2.70  0.86  3.02      312
109     2    11.61        1.35  2.70  20.0   94     2.74        2.92                  0.29     2.49       2.65  0.96  3.26      680
110     2    11.46        3.74  1.82  19.5  107     3.18        2.58                  0.24     3.58       2.90  0.75  2.81      562

111 rows × 14 columns
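
The Wine column is the target (three cultivars, labels 1, 2 and 3). A quick sketch, not part of the original run, to check how balanced the classes are:

In [ ]:
# sketch: class counts of the target before splitting
df['Wine'].value_counts()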

In [44]:
# first column (Wine) is the class label; remaining columns are predictors
X = df.iloc[:, 1:]
y = df.iloc[:, 0]

import numpy as np
from sklearn.model_selection import train_test_split
#rs = np.random.RandomState(123)  # stateful, so refits would not be reproducible
rs = 123  # fixed integer seed
train_prop = 0.7  # 70/30 train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_prop, random_state=rs)
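
The three classes are not equally sized, so an optional variant (not used for the results below) is a stratified split that preserves the class proportions in both folds:

In [ ]:
# sketch: stratified variant of the same split
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X, y, train_size=train_prop, random_state=rs, stratify=y)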
In [20]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
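
The scaler is fitted on the training fold only, which avoids leaking test statistics; wrapping scaler and model in a Pipeline (a sketch, not used below) repeats that correctly inside any later cross-validation:

In [ ]:
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier

# sketch: scaler + classifier behave as one estimator, so CV rescales per fold
pipe = make_pipeline(StandardScaler(), DecisionTreeClassifier(random_state=rs))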
In [73]:
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

model = DecisionTreeClassifier(max_depth=3, criterion='gini', random_state=rs)
model.fit(X_train, y_train)
print(accuracy_score(y_train, model.predict(X_train)))  # accuracy_score(y_true, y_pred)
print(accuracy_score(y_test, model.predict(X_test)))
0.9838709677419355
0.9444444444444444
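
The tree module imported above is otherwise unused; export_text gives a quick text dump of the fitted splits. A sketch, with feature names taken from the unscaled frame:

In [ ]:
# sketch: print the fitted splits as indented text
print(tree.export_text(model, feature_names=list(X.columns)))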

ensembles¶

random forests (bagging)¶

In [81]:
from sklearn.ensemble import RandomForestClassifier

model_rf = RandomForestClassifier(n_estimators=100, random_state=rs).fit(X_train, y_train)

accuracy_score(y_test, model_rf.predict(X_test))
Out[81]:
0.9814814814814815
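
The forest averages impurity-based importances over its 100 trees; a sketch for ranking the predictors (column names come from the original frame, since the scaled arrays carry none):

In [ ]:
# sketch: impurity-based feature importances, largest first
pd.Series(model_rf.feature_importances_, index=X.columns).sort_values(ascending=False)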

boosting (ada)¶

In [90]:
from sklearn.ensemble import AdaBoostClassifier
model_b = AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=1, criterion='entropy'),
                             n_estimators=1000, random_state=rs)
model_b.fit(X_train, y_train)
y_pred = model_b.predict(X_test)
accuracy_score(y_test, y_pred)
Out[90]:
0.9629629629629629
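
With 1000 stumps it is worth checking whether the ensemble needs them all; staged_predict replays the test prediction after each boosting round. A sketch, not part of the original run:

In [ ]:
# sketch: test accuracy after each boosting round
staged_acc = [accuracy_score(y_test, yp) for yp in model_b.staged_predict(X_test)]
print(max(staged_acc), staged_acc.index(max(staged_acc)) + 1)  # best accuracy, rounds needed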

metrics¶

In [91]:
from sklearn import metrics
print(metrics.classification_report(y_test, y_pred))  # order is (y_true, y_pred)
              precision    recall  f1-score   support

           1       1.00      0.86      0.92        14
           2       0.90      1.00      0.95        18
           3       1.00      1.00      1.00        22

    accuracy                           0.96        54
   macro avg       0.97      0.95      0.96        54
weighted avg       0.97      0.96      0.96        54

In [95]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

mat = confusion_matrix(y_test, y_pred)
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False)
plt.xlabel('true label')
plt.ylabel('predicted label');
[heatmap of the confusion matrix, predicted label vs. true label]
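
scikit-learn (1.0+) also ships its own display helper; an alternative sketch for the same plot:

In [ ]:
from sklearn.metrics import ConfusionMatrixDisplay

# sketch: same confusion matrix via the built-in display (needs sklearn >= 1.0)
ConfusionMatrixDisplay.from_predictions(y_test, y_pred);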

log-reg¶

In [46]:
from sklearn.linear_model import LogisticRegression
logistic_mod = LogisticRegression(penalty='l2', solver='liblinear').fit(X_train, y_train)

print(f'Train Accuracy: {accuracy_score(y_train, logistic_mod.predict(X_train))}')
print(f'Test Accuracy: {accuracy_score(y_test, logistic_mod.predict(X_test))}')
Train Accuracy: 0.9758064516129032
Test Accuracy: 0.9444444444444444
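
LogisticRegression applies L2 shrinkage with strength 1/C (C=1.0 by default); a sketch of sweeping C to see how regularization moves the scores (the values here are illustrative):

In [ ]:
# sketch: larger C means weaker regularization
for C in [0.01, 0.1, 1.0, 10.0]:
    m = LogisticRegression(penalty='l2', solver='liblinear', C=C).fit(X_train, y_train)
    print(C, accuracy_score(y_test, m.predict(X_test)))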

svm¶

In [61]:
from sklearn import svm

svc_lin_mod = svm.SVC(kernel="linear").fit(X_train, y_train)
svc_rbf_mod = svm.SVC(kernel="rbf", gamma=0.7).fit(X_train, y_train)

print(f'Linear SVC Train Accuracy: {accuracy_score(y_train, svc_lin_mod.predict(X_train))}')
print(f'Linear SVC Test Accuracy: {accuracy_score(y_test, svc_lin_mod.predict(X_test))}')

print(f'Radial SVC Train Accuracy: {accuracy_score(y_train, svc_rbf_mod.predict(X_train))}')
print(f'Radial SVC Test Accuracy: {accuracy_score(y_test, svc_rbf_mod.predict(X_test))}')
Linear SVC Train Accuracy: 1.0
Linear SVC Test Accuracy: 0.9814814814814815
Radial SVC Train Accuracy: 1.0
Radial SVC Test Accuracy: 0.3333333333333333
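
The hand-picked gamma=0.7 memorizes the training set (train accuracy 1.0) yet scores near chance on the three test classes, a textbook overfit. A grid-search sketch over C and gamma (the grid values are illustrative):

In [ ]:
from sklearn.model_selection import GridSearchCV

# sketch: tune the RBF kernel instead of hand-picking gamma
grid = GridSearchCV(svm.SVC(kernel='rbf'),
                    {'C': [0.1, 1, 10], 'gamma': [0.001, 0.01, 0.1, 1]}, cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_, accuracy_score(y_test, grid.predict(X_test)))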
In [53]:
svc_poly_mod = svm.SVC(kernel="poly", degree=1).fit(X_train, y_train)

print(f'Polynomial SVC Train Accuracy: {accuracy_score(y_train, svc_poly_mod.predict(X_train))}')
print(f'Polynomial SVC Test Accuracy: {accuracy_score(y_test, svc_poly_mod.predict(X_test))}')
Polynomial SVC Train Accuracy: 0.7258064516129032
Polynomial SVC Test Accuracy: 0.5555555555555556
In [57]:
accuracy_score(y_train, svm.LinearSVC(max_iter=10000).fit(X_train, y_train).predict(X_train))
Out[57]:
0.9919354838709677
In [62]:
accuracy_score(y_test, svm.LinearSVC(max_iter=10000).fit(X_train, y_train).predict(X_test))
Out[62]:
0.9259259259259259
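
Single-split accuracies on 54 test rows are noisy; a cross-validation sketch over the full data (scaling redone inside each fold) gives a steadier comparison of the model families used above:

In [ ]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

# sketch: 5-fold CV with per-fold scaling
for name, est in [('tree', DecisionTreeClassifier(max_depth=3, random_state=rs)),
                  ('svc-linear', svm.SVC(kernel='linear')),
                  ('logreg', LogisticRegression(solver='liblinear'))]:
    scores = cross_val_score(make_pipeline(StandardScaler(), est), X, y, cv=5)
    print(name, round(scores.mean(), 3))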
In [ ]: