In [19]:
import numpy as np
import matplotlib.pyplot as plt

# XOR-style toy dataset: the label is 1 exactly when the two features differ
X = np.array([[0, 0],
              [0, 1],
              [1, 0],
              [1, 1]])
y = np.array([0, 1, 1, 0])
plt.scatter(X[:,0], X[:,1], color=list(map(lambda f: 'blue' if f == 1 else 'red', y)))
plt.show()
In [25]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(max_depth=2)
model.fit(X, y)
model.predict(X)
# Create a meshgrid for plotting decision boundaries
x_min, x_max = X[:,0].min() - 0.1, X[:,0].max() + 0.1
y_min, y_max = X[:,1].min() - 0.1, X[:,1].max() + 0.1
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 200),
                     np.linspace(y_min, y_max, 200))
# Predict on the grid
Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
# Plot decision boundary
plt.contourf(xx, yy, Z, alpha=0.3, levels=1, colors=['red', 'blue'])
plt.scatter(X[:,0], X[:,1], color=list(map(lambda f: 'blue' if f == 1 else 'red', y)))
plt.show()
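As a quick sanity check, added here as a sketch (not part of the original notebook), scoring the fitted depth-2 tree on the four training points shows whether it reproduces XOR exactly:
In [ ]:
# Training accuracy of the depth-2 tree above; 1.0 would mean XOR is fit exactly
print(model.score(X, y))
print(model.predict(X))   # compare with y = [0, 1, 1, 0]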
In [44]:
from sklearn.tree import DecisionTreeClassifier
X3 = np.array([[0, 0],
               [0, 1],
               [1, 0],
               [1, 1],
               [2, 0],
               [0, 2],
               [2, 1],
               [1, 2],
               [2, 2]])
y3 = np.array([0, 1, 1, 0, 0, 0, 1, 1, 0])
plt.scatter(X3[:,0], X3[:,1], color=list(map(lambda f: 'blue' if f == 1 else 'red', y3)))
model = DecisionTreeClassifier(max_depth=4, criterion='entropy')
model.fit(X3, y3)
model.predict(X3)
# Create a meshgrid for plotting decision boundaries
x_min, x_max = X3[:,0].min() - 0.1, X3[:,0].max() + 0.1
y_min, y_max = X3[:,1].min() - 0.1, X3[:,1].max() + 0.1
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 200),
                     np.linspace(y_min, y_max, 200))
# Predict on the grid
Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
# Plot decision boundary
plt.contourf(xx, yy, Z, alpha=0.3, levels=1, colors=['red', 'blue'])
plt.show()
In [45]:
from sklearn import tree
# plot_tree expects a list of feature-name strings, not the data array; names assumed here
fnames = ['x0', 'x1']
tree.plot_tree(model, feature_names=fnames, class_names=['red', 'blue'], filled=True)
plt.show()
Note the redundancy above: identical subtrees are constructed in separate branches. Decision trees are very expressive, but this duplication is inefficient (the text dump in the sketch below makes it easy to see).
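One way to see the duplication (a sketch added here, not part of the original notebook) is to dump the fitted tree as text, which makes repeated subtrees easy to spot:
In [ ]:
from sklearn.tree import export_text
# Text dump of the tree fitted to X3/y3 above; 'x0'/'x1' are assumed feature labels
print(export_text(model, feature_names=['x0', 'x1']))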
Furthermore, note that decision trees can struggle when the k classes each contain the same number of training samples: the root then starts at maximum entropy, and if, as in the XOR-style data here, no single-feature split changes the class proportions, every candidate split has the same information gain, so there is no principled way to put the more informative decisions at the top of the tree (see the sketch below).
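To make this concrete, a small sketch (the helper function is assumed, not part of the notebook) computes the information gain of each candidate root split on the XOR data from the first cell; both gains come out to zero:
In [ ]:
def entropy_of(labels):
    # Shannon entropy (in bits) of a label array
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return -np.sum(p * np.log2(p))

parent = entropy_of(y)          # y = [0, 1, 1, 0] from the first cell
for feature in (0, 1):
    left, right = y[X[:, feature] == 0], y[X[:, feature] == 1]
    gain = parent - (len(left) * entropy_of(left) + len(right) * entropy_of(right)) / len(y)
    print(f"split on feature {feature}: information gain = {gain:.3f}")
# Both gains are 0.000: on XOR, no single-feature split reduces class uncertainty.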
Gini
Faster to compute: the formula contains only squares, no logarithms. Better suited to deep trees and data with many features (both formulas are sketched below).
Entropy
Requires log calculations, so it is slower to evaluate; it tends to work better on smaller datasets.
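For reference, a minimal sketch of the two impurity measures (these helper functions are illustrative, not sklearn's internal implementation):
In [ ]:
def gini(p):
    # Gini impurity: 1 - sum_i p_i^2  (only squares, no logarithms)
    p = np.asarray(p, dtype=float)
    return 1.0 - np.sum(p ** 2)

def entropy(p):
    # Entropy: -sum_i p_i * log2(p_i)  (one logarithm term per class)
    p = np.asarray(p, dtype=float)
    p = p[p > 0]
    return -np.sum(p * np.log2(p))

print(gini([0.5, 0.5]), entropy([0.5, 0.5]))   # balanced node: 0.5 and 1.0
print(gini([0.9, 0.1]), entropy([0.9, 0.1]))   # purer node: both measures drop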
In [ ]: