import pandas as pd
import numpy as np
# Using the petrol consumption dataset: https://www.kaggle.com/arpikr/petrol-consumption
# Read the petrol consumption dataset from the current working directory.
df = pd.read_csv('petrol_consumption.csv')

# Quick inspection: first rows, (rows, columns) and per-column dtypes.
# These bare expressions only display something in an interactive session.
df.head()
df.shape
df.dtypes

# 'Petrol_Consumption' is the regression target; every other column is
# used as a feature.
y = df['Petrol_Consumption']
X = df.drop(columns='Petrol_Consumption')
print(X.shape, y.shape)
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.10,
    random_state=0,
)
print(X_train.shape, X_test.shape)
from sklearn.tree import DecisionTreeRegressor

# Fit a decision-tree regressor on the training split. The seed is pinned,
# matching the random_state=0 used for the train/test split, because the
# tree builder permutes the features at each split and equally good splits
# could otherwise be resolved differently from run to run.
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(X_train, y_train)

# Predict the held-out rows and line the predictions up against the true
# values; 'Diff' is the signed error (actual minus predicted).
y_pred = regressor.predict(X_test)
pd.DataFrame({
    'Actual': y_test,
    'Predicted': y_pred,
    'Diff': y_test - y_pred,
})
from sklearn.metrics import mean_squared_error

# Root-mean-squared error of the predictions on the held-out rows
# (square root of the mean squared error).
np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))
# StringIO comes from the standard library: the `sklearn.externals.six`
# shim that used to provide it has been removed from scikit-learn, so
# importing from there fails on current releases.
from io import StringIO

from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus

# Export the fitted tree as Graphviz DOT text into an in-memory buffer.
dot_data = StringIO()
export_graphviz(
    regressor,
    out_file=dot_data,
    # A plain list of column names is accepted by every scikit-learn
    # version; some releases reject a pandas Index here.
    feature_names=list(X.columns),
)

# Render the DOT text to a PNG and display it inline.
# NOTE(review): create_png() presumably needs the Graphviz binaries
# installed on the machine - confirm in the target environment.
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())