"""Exploratory analysis and a simple linear regression on the Boston housing data.

The script walks through eight numbered tasks ("Zad 1" .. "Zad 8"): load the
dataset into a DataFrame, inspect it, plot the target distribution and the
correlation matrix, fit a ``LinearRegression`` on three features and report
RMSE / MAE for the training and test splits.
"""

import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from sklearn import metrics
# NOTE: ``load_boston`` was deprecated in scikit-learn 1.0 and removed in 1.2;
# this script needs an older scikit-learn release to run.
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split


def _report_errors(label, y_true, y_pred):
    """Print *label* followed by the RMSE and MAE of *y_pred* against *y_true*."""
    print(label)
    rmse = math.sqrt(metrics.mean_squared_error(y_true, y_pred))
    mae = metrics.mean_absolute_error(y_true, y_pred)
    print(f'RMSE: {rmse}')
    print(f'MAE: {mae}')


# Task 1: load the dataset into a DataFrame and attach the target as 'MEDV'.
print("Zad 1")
boston_dataset = load_boston()
data_frame = pd.DataFrame(
    data=boston_dataset['data'],
    columns=boston_dataset['feature_names'],
)
data_frame['MEDV'] = pd.Series(boston_dataset['target'])
print(data_frame.head(10))
print(data_frame.tail(10))

# Task 2: column types and non-null counts.
print("Zad 2")
data_frame.info()
# Answers recorded by the author (the questions themselves are not in this file):
# a - 506
# b - float64
# c - no

# Task 3: summary statistics.
print("Zad 3")
describe = data_frame.describe()
print(describe)
# Answers recorded by the author (the questions themselves are not in this file):
# a - mean=3.593761 std=8.596783
# b - max=50.000000 min=5.000000
# c - 12.653063

# Task 4: distribution of the target variable.
print("Zad 4")
# NOTE: ``distplot`` is deprecated since seaborn 0.11 (``histplot``/``displot``
# are the replacements); kept here so the script still runs on the older
# library versions it was written for.
sns.distplot(data_frame.MEDV)
plt.show()

# Task 5: correlation heatmap and regression plots against MEDV.
print("Zad 5")
corr_matrix = data_frame.corr().round(2)
sns.heatmap(corr_matrix, annot=True)
# Answers recorded by the author (the questions themselves are not in this file):
# a - RM, ZN, B
# b - LSTAT
# c - TAX with RAD, and every variable with itself - 1
# The regression plots must use the raw observations; passing the correlation
# matrix here would only plot correlation coefficients against each other.
# x/y are passed by keyword because newer seaborn rejects positional use.
sns.lmplot(x='MEDV', y='RM', data=data_frame)
sns.lmplot(x='MEDV', y='LSTAT', data=data_frame)
plt.show()

# Task 6: select features / target and split 80/20 into train and test sets.
print("Zad 6")
x = data_frame[['RM', 'B', 'ZN']]
y = data_frame[['MEDV']]
# No random_state is given, so the split (and every metric below) differs
# from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.2)

# Task 7: fit the model and plot actual vs. predicted values for both splits.
print("Zad 7")
lin = LinearRegression()
lin.fit(X_train, Y_train)
Y_pred = lin.predict(X_test)
Y_pred_ = lin.predict(X_train)
plt.scatter(Y_test, Y_pred)
plt.title('testowy')
plt.show()
plt.title('treningowy')
plt.scatter(Y_train, Y_pred_)
plt.show()

# Task 8: error metrics on the training and test splits.
print("Zad 8")
_report_errors('treningowy', Y_train, Y_pred_)
_report_errors('testowy', Y_test, Y_pred)