%load_ext autoreload
%autoreload 2
%matplotlib inline
from fastai.imports import *
from fastai.structured import *
import csv
from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display
from sklearn import metrics
# Load the training data; only the "saledate" column is parsed as a date at read time.
# NOTE(review): the file name says "Bulldozer" and the parsed column is "saledate",
# but every column used below (YearBuilt, YearRemodAdd, SalePrice) and the test file
# loaded later in this script ('Housing_test.csv') look like a housing dataset --
# confirm the path and the parse_dates column are the intended ones.
df_raw = pd.read_csv('Bulldozer_train.csv', low_memory=False, parse_dates=["saledate"])
# Replace the target with its natural log, so everything fitted on it works in log space.
df_raw.SalePrice = np.log(df_raw.SalePrice)
# Expand the two year columns into date-part features. The return values are discarded,
# so these calls presumably modify df_raw in place -- verify against fastai.structured.
# NOTE(review): YearBuilt / YearRemodAdd are NOT in parse_dates above, whereas the test
# load later in this script parses them at read time -- confirm that both frames end up
# with the same datetime values before add_datepart runs.
add_datepart(df_raw, 'YearBuilt')
add_datepart(df_raw, 'YearRemodAdd')
# Presumably converts string columns to pandas categoricals in place -- TODO confirm.
train_cats(df_raw)
# proc_df yields three values here: the processed feature frame, the target values for
# 'SalePrice', and a dict (nas) that is passed back in later through na_dict=.
df, y, nas = proc_df(df_raw, 'SalePrice')
def split_vals(a, n):
    """Split ``a`` at position ``n`` into two independent copies.

    Args:
        a: Any sliceable object whose slices provide ``.copy()``.
        n: Split position; the first piece is ``a[:n]``, the second ``a[n:]``.

    Returns:
        A ``(head, tail)`` tuple of copies, so changing either piece does
        not affect ``a``.
    """
    head = a[:n].copy()
    tail = a[n:].copy()
    return head, tail
# Hold out the LAST n_valid rows as the validation set; everything before is training.
# NOTE(review): confirm 140 really matches the size of the Kaggle test set for this data.
n_valid = 140 # same as Kaggle's test set size
n_trn = len(df)-n_valid
# Split the raw frame, the processed features and the target at the same row index
# so the three pairs stay aligned.
raw_train, raw_valid = split_vals(df_raw, n_trn)
X_train, X_valid = split_vals(df, n_trn)
y_train, y_valid = split_vals(y, n_trn)
# Bare expression: shows the shapes as cell output when run in a notebook.
X_train.shape, y_train.shape, X_valid.shape
# Re-run proc_df with subset=200, reusing the NA dict from the full run (nas is rebound
# to the dict returned by this call). Presumably subset= samples that many rows --
# verify against fastai.structured.
df_trn, y_trn, nas = proc_df(df_raw, 'SalePrice', subset=200, na_dict=nas)
# Keep only the first 150 rows of the subset for fitting. This overwrites the
# X_train / y_train produced by the full split above; X_valid / y_valid are untouched.
X_train, _ = split_vals(df_trn, 150)
y_train, _ = split_vals(y_trn, 150)
# 40 trees, at least 3 samples per leaf, half of the features considered per split,
# all CPU cores; oob_score=True makes the out-of-bag score available after fit.
m = RandomForestRegressor(n_estimators=40, min_samples_leaf=3, max_features=0.5, n_jobs=-1, oob_score=True)
m.fit(X_train, y_train)
# Score the test file with the fitted model `m`, applying the same preprocessing
# that the training frame received.
df_test = pd.read_csv('Housing_test.csv', low_memory=False, parse_dates=["YearBuilt", "YearRemodAdd"])
add_datepart(df_test, 'YearBuilt')
add_datepart(df_test, 'YearRemodAdd')
# Reuse the categories learned on the training frame so every category code means
# the same thing it meant at fit time. train_cats(df_test) would build a fresh
# encoding from the test values alone, which can differ from the training one.
apply_cats(df_test, df_raw)
# proc_df returns three values (features, target, NA dict), as the training calls
# above unpack them; only the feature frame may be handed to the model. Passing the
# training NA dict keeps missing-value handling consistent with training.
# NOTE(review): assumes the test file has the same feature columns as the training
# frame minus 'SalePrice' -- confirm df_1.columns matches X_train.columns.
df_1, _, _ = proc_df(df_test, na_dict=nas)
# SalePrice was log-transformed before fitting, so these predictions are log-prices.
y_test = m.predict(df_1)