In [19]:
import numpy as np
import matplotlib.pyplot as plt

# XOR dataset: the label is 1 exactly when the two features differ
X = np.array([[0, 0],
              [0, 1],
              [1, 0],
              [1, 1]])
y = np.array([0, 1, 1, 0])

plt.scatter(X[:, 0], X[:, 1], color=['blue' if label == 1 else 'red' for label in y])
plt.show()
[Figure: scatter of the four XOR points, class 1 (blue) at (0,1) and (1,0), class 0 (red) at (0,0) and (1,1)]
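This is the XOR pattern: no single axis-aligned split separates blue from red. As a quick sanity check (a minimal sketch, reusing X and y from above), a depth-1 tree, i.e. a single split, cannot beat chance here:

In [ ]:
from sklearn.tree import DecisionTreeClassifier

# A stump can only cut once along one axis, so whichever way it splits,
# every leaf still contains one point of each class.
stump = DecisionTreeClassifier(max_depth=1)
stump.fit(X, y)
print(stump.score(X, y))  # 0.5, i.e. chance level on this data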
In [25]:
from sklearn.tree import DecisionTreeClassifier

# Depth 2 is enough for XOR: split on one feature, then on the other
model = DecisionTreeClassifier(max_depth=2)
model.fit(X, y)
print(model.predict(X))

# Create a meshgrid for plotting decision boundaries
x_min, x_max = X[:, 0].min() - 0.1, X[:, 0].max() + 0.1
y_min, y_max = X[:, 1].min() - 0.1, X[:, 1].max() + 0.1
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 200),
                     np.linspace(y_min, y_max, 200))

# Predict on the grid
Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

# Plot decision boundary: one colour band per class
plt.contourf(xx, yy, Z, alpha=0.3, levels=[-0.5, 0.5, 1.5], colors=['red', 'blue'])
plt.scatter(X[:, 0], X[:, 1], color=['blue' if label == 1 else 'red' for label in y])
plt.show()
[Figure: decision regions of the depth-2 tree on the XOR data]
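To see the splits behind those regions, the fitted rules can be dumped as text with sklearn's export_text (a small sketch; the names 'x0' and 'x1' are made up here, since the data only has column indices):

In [ ]:
from sklearn.tree import export_text

# Print the tree as nested if/else rules: a split on one feature,
# then a split on the other inside each branch, which is exactly XOR.
print(export_text(model, feature_names=['x0', 'x1']))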
In [44]:
# A 3x3 grid with a parity-style labelling: class 1 where x0 + x1 is odd
X3 = np.array([[0, 0],
               [0, 1],
               [1, 0],
               [1, 1],
               [2, 0],
               [0, 2],
               [2, 1],
               [1, 2],
               [2, 2]])
y3 = np.array([0, 1, 1, 0, 0, 0, 1, 1, 0])

model = DecisionTreeClassifier(max_depth=4, criterion='entropy')
model.fit(X3, y3)
print(model.predict(X3))

# Create a meshgrid for plotting decision boundaries
x_min, x_max = X3[:, 0].min() - 0.1, X3[:, 0].max() + 0.1
y_min, y_max = X3[:, 1].min() - 0.1, X3[:, 1].max() + 0.1
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 200),
                     np.linspace(y_min, y_max, 200))

# Predict on the grid
Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

# Plot decision boundary first, then the points on top so they stay visible
plt.contourf(xx, yy, Z, alpha=0.3, levels=[-0.5, 0.5, 1.5], colors=['red', 'blue'])
plt.scatter(X3[:, 0], X3[:, 1], color=['blue' if label == 1 else 'red' for label in y3])
plt.show()
[Figure: decision regions of the entropy tree on the 3x3 grid]
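Before drawing the tree itself, it is worth checking how much structure it actually built (a quick sketch using the fitted model's get_depth and get_n_leaves accessors):

In [ ]:
# How big a tree did 9 points produce?
print(model.get_depth())     # actual depth reached (at most the max_depth of 4)
print(model.get_n_leaves())  # one leaf per rectangular region in the plot above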
In [45]:
from sklearn import tree

# feature_names must be one string per column, not the data array itself
fnames = ['x0', 'x1']
tree.plot_tree(model, feature_names=fnames, class_names=['red', 'blue'], filled=True)
plt.show()
[Figure: the fitted tree rendered with plot_tree; note the repeated subtrees]

Note the redundancy above: identical subtrees are constructed in separate branches. Trees are expressive enough to fit this pattern exactly, but they cannot share structure, so the same sub-decision is re-learned several times. Great expressiveness, but inefficient.

Furthermore, realise that decision trees can struggle when there are k classes with the same number of training samples in each. With a perfectly balanced class distribution, the entropy at the root is maximal, and many candidate splits can offer identical information gain, so there is no principled way for the greedy algorithm to decide which decision belongs at the top.
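To make that concrete, here is a small sketch of the entropy of a class-count vector; note that the uniform (balanced) distribution is the one with maximal entropy, i.e. the most uncertainty:

In [ ]:
import numpy as np

def class_entropy(counts):
    # Shannon entropy (in bits) of a vector of class counts
    p = np.asarray(counts, dtype=float)
    p = p / p.sum()
    p = p[p > 0]  # treat 0 * log(0) as 0
    return -np.sum(p * np.log2(p))

print(class_entropy([5, 5, 5]))   # balanced 3 classes: log2(3) ≈ 1.585 bits, the maximum
print(class_entropy([13, 1, 1]))  # skewed: ≈ 0.70 bits, much less uncertain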

gini

Gini impurity is faster to compute: the formula contains only squares, no logarithms. It is a good default for deep trees and datasets with many features.

entropy

Entropy requires log calculations, so it is slower to evaluate. It can work slightly better on smaller datasets.
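For reference, here are the two impurity measures side by side (a minimal sketch of the standard definitions, applied to a vector of class probabilities):

In [ ]:
import numpy as np

def gini(p):
    # Gini impurity: 1 - sum(p_i^2). Only squares, cheap to evaluate.
    p = np.asarray(p, dtype=float)
    return 1.0 - np.sum(p ** 2)

def entropy(p):
    # Shannon entropy: -sum(p_i * log2(p_i)). Needs a log per class.
    p = np.asarray(p, dtype=float)
    p = p[p > 0]
    return -np.sum(p * np.log2(p))

# On a balanced two-class node both measures are at their maximum:
print(gini([0.5, 0.5]))     # 0.5
print(entropy([0.5, 0.5]))  # 1.0 bit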
