import pandas as pd
import numpy as np
# Load the historical hiring data and inspect it. Bare expressions such as
# `df` / `df.shape` are notebook display cells; they are no-ops in a script.
df = pd.read_csv('PastHires.csv')
df
df.shape
df.dtypes
# Encode every text column as integer category codes so sklearn can consume it.
# NOTE(review): cat.codes assigns codes in lexical order of the categories,
# not by meaning — confirm the resulting Y/N mapping is the intended one.
for col in df.select_dtypes('object').columns:
    df[col] = df[col].astype('category').cat.codes
df
len(df)
df.dtypes
from sklearn import tree
# Features are every column except the label we want to predict.
X = df.drop('Hired', axis=1)
y = df['Hired']
# Fit an (unconstrained, fully grown) decision tree on the whole data set.
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X, y)
# Sanity check: predict the class of a row the model was trained on.
print(df.loc[2])
clf.predict(X.loc[[2]])
from IPython.display import Image
# Fix: `sklearn.externals.six` was removed in scikit-learn 0.23; the
# standard-library io.StringIO is the drop-in replacement here.
from io import StringIO
import pydotplus
# Render the fitted tree: export graphviz dot source, then rasterize to PNG.
dot_data = StringIO()
tree.export_graphviz(clf, out_file=dot_data,
                     feature_names=X.columns)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())
# Manual Gini-impurity walkthrough for the split on 'Interned'.
TARGET_NAME = 'Hired'
true_rows = df[df['Interned'] == 1]
false_rows = df[df['Interned'] == 0]
print(len(true_rows), true_rows[TARGET_NAME].value_counts().to_dict())
print(len(false_rows), false_rows[TARGET_NAME].value_counts().to_dict())
print(len(df))
# Hand-computed impurities for the two partitions. The hard-coded 0/5, 5/5
# and 4/8 counts are presumably read off the value_counts printed above —
# TODO confirm they match the actual data.
print('gini true_rows:', 1 - (0/5)**2 - (5/5)**2)
print('gini false_rows:', 1 - (4/8)**2 - (4/8)**2)
def gini(X, target=None):
    """Return the Gini impurity of the class distribution in *X*.

    Parameters
    ----------
    X : pandas.DataFrame
        Rows whose class mix is measured. An empty frame yields 1.
    target : str, optional
        Name of the class-label column. Defaults to the module-level
        TARGET_NAME (backward compatible with the original 1-arg form).

    Returns
    -------
    float
        1 - sum(p_c ** 2) over class proportions p_c; 0 for a pure node.
    """
    col = TARGET_NAME if target is None else target
    res = 1
    n = len(X)
    # Fix: Series.iteritems() was removed in pandas 2.0; .items() is the
    # supported spelling and behaves identically on pandas 1.x.
    for _, count in X[col].value_counts().items():
        res -= (count / n) ** 2
    return res
# Cross-check the gini() helper against the hand computation above.
print('gini true_rows:', gini(true_rows))
print('gini false_rows:', gini(false_rows))
def gini_index(true_rows, false_rows):
    """Return the size-weighted average Gini impurity of a two-way split."""
    weights = (len(true_rows), len(false_rows))
    impurities = (gini(true_rows), gini(false_rows))
    weighted = sum(w * g for w, g in zip(weights, impurities))
    return weighted / sum(weights)
# Weighted impurity of the 'Interned' split computed above.
gini_index(true_rows, false_rows)
def partition(df, b):
    """Split *df* by boolean mask *b* into (rows where b, rows where not b)."""
    matched = df[b]
    rest = df[~b]
    return matched, rest
# Rebuild the same split through the partition() helper and show class counts.
true_rows, false_rows = partition(df, df['Interned'] == 1)
print(true_rows[TARGET_NAME].value_counts().to_dict())
print(false_rows[TARGET_NAME].value_counts().to_dict())
def find_best_split(rows, verbose=False):
    """Find the (column, value) split of *rows* minimizing weighted Gini impurity.

    Parameters
    ----------
    rows : pandas.DataFrame
        Candidate rows; must contain the TARGET_NAME column.
    verbose : bool, optional
        When True, print the gain of every candidate split.

    Returns
    -------
    (float, dict)
        Best gain (0 for a pure leaf) and a node-description dict: always
        'samples' and 'classes'; internal nodes also get 'column', 'value'
        and 'is_numeric'.
    """
    best_gain = 0
    data = {
        'samples': len(rows),
        'classes': rows[TARGET_NAME].value_counts().to_dict()
    }
    # We reached a leaf: only one class remains, nothing to split.
    if len(data['classes']) < 2:
        return 0, data
    # Try every distinct value of every column except the target itself.
    for col in rows.columns:
        if col == TARGET_NAME:
            continue
        values = rows[col].unique()
        is_numeric = np.issubdtype(rows[col].dtype, np.number)
        # Loop values of column
        for val in values:
            # Partition in two groups: numeric columns split on <= val,
            # categorical columns on == val.
            true_rows, false_rows = partition(
                rows,
                rows[col] <= val if is_numeric
                else rows[col] == val)
            # Fix: reject degenerate (one-sided) splits *before* computing
            # impurity; the original computed the gain first and threw it away.
            if len(true_rows) == 0 or len(false_rows) == 0:
                continue
            # NOTE(review): this is 1 - weighted impurity, not the textbook
            # "parent impurity - weighted child impurity"; the argmax is the
            # same because the parent impurity is constant within one call.
            gain = 1 - gini_index(true_rows, false_rows)
            if verbose:
                print('{col:s}:{val:s} {pad:s} {gain:.3f}'.format(
                    col=col,
                    val=str(val),
                    pad=' '*(20-len(col)-len(str(val))),
                    gain=gain
                ))
            # Keep track of the best split ('>=' lets later ties win).
            if gain >= best_gain:
                best_gain = gain
                data['column'] = col
                data['value'] = val
                data['is_numeric'] = is_numeric
    return best_gain, data
# Best split over the full data set, printing every candidate's gain.
gain, data = find_best_split(df, verbose=True)
print('\nBest split:')
pd.Series(data)
# Best split restricted to the rows where Interned == 1.
gain, data = find_best_split(df[df['Interned'] > 0])
print('\nBest split:')
pd.Series(data)
def build_tree(rows):
    """Recursively grow a decision tree from *rows*.

    Returns the node data dict itself for a leaf (no gainful split),
    otherwise a list [split_data, true_subtree, false_subtree].
    """
    gain, node = find_best_split(rows)
    if gain == 0:
        # Pure or unsplittable node: the description dict is the leaf.
        return node
    col, val = node['column'], node['value']
    if node['is_numeric']:
        mask = rows[col] <= val
    else:
        mask = rows[col] == val
    matched, unmatched = partition(rows, mask)
    return [node, build_tree(matched), build_tree(unmatched)]
# Grow the hand-rolled tree over the whole data set.
# NOTE(review): this rebinds the name `tree`, shadowing the sklearn `tree`
# module imported earlier — any later call through that module name breaks.
tree = build_tree(df)
tree
# Sorted class labels; print_tree() reads this module-level list.
classes = sorted(df[TARGET_NAME].unique())
classes
def print_tree(tree, indent=0):
    """Pretty-print the tree; leaves show per-class counts in `classes` order."""
    prefix = '-' * indent
    if isinstance(tree, list):
        node = tree[0]
        print('{:s}| {:s}:{:s}'.format(
            prefix,
            node['column'],
            str(node['value'])
        ))
        # True branch first, then false branch, each one level deeper.
        for branch in tree[1:]:
            print_tree(branch, indent + 4)
    else:
        counts = tree['classes']
        print('{:s}| {:s}'.format(
            prefix,
            str([counts.get(c, 0) for c in classes])
        ))
# Render the hand-rolled tree as indented text.
print_tree(tree)
def predict(tree, x):
    """Classify sample *x* (a mapping from column name to value).

    Walks internal nodes [split_data, true_branch, false_branch] until a
    leaf dict is reached, then returns the leaf's majority class.
    Raises KeyError if *x* lacks a column the walk needs.
    """
    node = tree
    while isinstance(node, list):
        split = node[0]
        observed = x[split['column']]
        threshold = split['value']
        if split['is_numeric']:
            take_true = observed <= threshold
        else:
            take_true = observed == threshold
        node = node[1] if take_true else node[2]
    counts = node['classes']
    return max(counts, key=counts.get)
# Predict from a partial feature dict — this only works while every split
# reached looks up a key present in the dict (here just 'Interned');
# a missing key raises KeyError inside predict().
predict(tree, {'Interned':1})
# Predict a full training row for comparison with its actual label.
print(df.loc[2])
predict(tree, df.loc[2])