%load_ext autoreload %autoreload 2 %matplotlib inline from fastai.imports import * from fastai.structured import * import csv from pandas_summary import DataFrameSummary from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier from IPython.display import display from sklearn import metrics df_raw = pd.read_csv('Bulldozer_train.csv', low_memory=False, parse_dates=["saledate"]) df_raw.SalePrice = np.log(df_raw.SalePrice) add_datepart(df_raw, 'YearBuilt') add_datepart(df_raw, 'YearRemodAdd') train_cats(df_raw) df, y, nas = proc_df(df_raw, 'SalePrice') def split_vals(a,n): return a[:n].copy(), a[n:].copy() n_valid = 140 # same as Kaggle's test set size n_trn = len(df)-n_valid raw_train, raw_valid = split_vals(df_raw, n_trn) X_train, X_valid = split_vals(df, n_trn) y_train, y_valid = split_vals(y, n_trn) X_train.shape, y_train.shape, X_valid.shape df_trn, y_trn, nas = proc_df(df_raw, 'SalePrice', subset=200, na_dict=nas) X_train, _ = split_vals(df_trn, 150) y_train, _ = split_vals(y_trn, 150) m = RandomForestRegressor(n_estimators=40, min_samples_leaf=3, max_features=0.5, n_jobs=-1, oob_score=True) m.fit(X_train, y_train) df_test = pd.read_csv('Housing_test.csv', low_memory=False, parse_dates=["YearBuilt", "YearRemodAdd"]) add_datepart(df_test, 'YearBuilt') add_datepart(df_test, 'YearRemodAdd') train_cats(df_test) df_1 = proc_df(df_test) y_test= m.predict(df_1)