# UCI Adult (census income) lab script: download/load the dataset, explore it,
# build a scikit-learn preprocessing pipeline, train LogisticRegression, and
# pickle the model.
#
# NOTE: many imports below are unused in this chunk; they are kept because the
# lab's later tasks ("Zadanie ...") presumably need them.
import io
import os
import pickle

import numpy as np  # was "import numpy as py" — fixed the confusing alias
import pandas as pd
import requests
import seaborn as sns
from matplotlib import pyplot as plt
from pandas.api.types import CategoricalDtype

import sklearn
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix)
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

urls = ['http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',
        'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names',
        'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test']


def load_dataset(path, urls):
    """Download every URL in *urls* into directory *path* (created if absent),
    saving each file under its URL basename."""
    # os.makedirs with exist_ok=True also handles nested paths and re-runs
    # (the original os.mkdir raised if any parent was missing).
    os.makedirs(path, exist_ok=True)
    for url in urls:
        data = requests.get(url).content
        filename = os.path.join(path, os.path.basename(url))
        with open(filename, "wb") as file:
            file.write(data)


# load_dataset('dane', urls)  # uncomment on the first run to fetch the data

# --- Zadanie 1: load the train/test splits ----------------------------------
columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num',
           'marital-status', 'occupation', 'relationship', 'race', 'sex',
           'capital-gain', 'capital-loss', 'hours-per-week',
           'native-country', 'income']
adultData = pd.read_csv('dane/adult.data', skipinitialspace=True,
                        names=columns, na_values="?")
adultTest = pd.read_csv('dane/adult.test', skipinitialspace=True,
                        names=columns, skiprows=1, na_values="?")
DFdata = pd.DataFrame(adultData)
DFtest = pd.DataFrame(adultTest)
# Labels in adult.test carry a trailing '.' (e.g. "<=50K.") — strip it so they
# match the training labels.
DFtest.income = DFtest.income.map(lambda x: str(x)[:-1])
print('Pierwsze 5 wierszy adult.data:')
print(DFdata.head(5))
print('Ostatnie 5 wierszy adult.data:')
print(DFdata.tail(5))
print('Pierwsze 5 wierszy adult.test:')
print(DFtest.head(5))
print('Ostatnie 5 wierszy adult.test:')
print(DFtest.tail(5))

# --- Zadanie 2: dataset overview --------------------------------------------
DFdata.info()  # samples in adult.data = 32561
DFtest.info()  # samples in adult.test = 16281
# Numeric columns: fnlwgt, education-num, capital-gain, capital-loss,
# hours-per-week.
# Columns with missing values: workclass, occupation, native-country.

# --- Zadanie 3: numeric summaries and histograms ----------------------------
print(DFdata.select_dtypes(include=['int64']).describe())
print(DFtest.select_dtypes(include=['int64']).describe())
DFdata.hist()
# plt.show()
DFtest.hist()
# plt.show()

# --- Zadanie 4: categorical counts split by income --------------------------
for x in list(DFdata.select_dtypes(exclude=['int64'])):
    ax = sns.countplot(y=x, hue="income",
                       data=DFdata.select_dtypes(exclude=['int64']))
    # plt.show()
for x in list(DFtest.select_dtypes(exclude=['int64'])):
    ax = sns.countplot(y=x, hue="income",
                       data=DFtest.select_dtypes(exclude=['int64']))
    # plt.show()
# Missing values can be filled in with fillna().


# --- Zadanie 5: dtype-based column selector ---------------------------------
class ColumnsSelector(TransformerMixin, BaseEstimator):
    """Transformer that keeps only the columns of the requested dtype
    (e.g. 'int64' for numeric, 'object' for categorical columns)."""

    def __init__(self, datatype):
        # BUG FIX: the original assigned the hard-coded string 'int64' here,
        # ignoring the argument — so ColumnsSelector('object') in pipelineStr
        # actually selected the int columns.
        self.datatype = datatype

    def fit(self, *_):
        # Stateless transformer: nothing to learn.
        return self

    def transform(self, X):
        X_COPY = X.copy()
        return X_COPY.select_dtypes(include=[self.datatype])


columnsSelector = ColumnsSelector('int64')
print(columnsSelector.transform(DFdata).head())

# --- Zadanie 6: numeric pipeline (select ints, then standardize) ------------
scaler = StandardScaler()
pipelineInt = Pipeline([('cloumnsType', ColumnsSelector('int64')),
                        ('scaler', scaler)])


# --- Zadanie 7: categorical pipeline ----------------------------------------
class MostFrequentImputer(TransformerMixin, BaseEstimator):
    """Fill NaNs in the given columns (default: all columns seen in fit)
    with each column's most frequent value."""

    def __init__(self, columns=None):
        self.columns = columns

    def fit(self, X, y=None):
        # sklearn convention: do not overwrite constructor parameters in fit
        # (the original reassigned self.columns). The learned per-column fill
        # values live in self.fill instead.
        columns = X.columns if self.columns is None else self.columns
        self.fill = {column: X[column].value_counts().index[0]
                     for column in columns}
        return self

    def transform(self, X):
        X_copy = X.copy()
        for column, value in self.fill.items():
            X_copy[column] = X_copy[column].fillna(value)
        return X_copy


class PandasToDict(TransformerMixin, BaseEstimator):
    """Convert a DataFrame into a list of per-row dicts for DictVectorizer."""

    def fit(self, X, y=None):
        # BUG FIX: the original signature was fit(self, y=None) — missing the
        # mandatory X parameter — which broke Pipeline.fit(X, y) with a
        # non-None y (X and y were both passed, raising TypeError).
        return self

    def transform(self, X):
        return X.to_dict(orient='records')


pipelineStr = Pipeline([
    ("columnSelector", ColumnsSelector('object')),
    ("mostFrequentImporter", MostFrequentImputer()),
    ("pandasToDict", PandasToDict()),
    ("dictVectorizer", DictVectorizer())
])

# --- Zadanie 8: combine categorical + numeric features ----------------------
union = FeatureUnion([("pipelineStr", pipelineStr),
                      ("pipelineInt", pipelineInt)])

# --- Zadanie 9: train and persist the model ---------------------------------
# .bfill() is the modern spelling of fillna(method="backfill"), which is
# deprecated in pandas 2.x; behavior is identical.
adultData = adultData.bfill()
macierz = adultData.drop(["fnlwgt", "education", "income"], axis=1)
macierz = union.fit_transform(macierz)
model = LogisticRegression().fit(macierz, adultData.income)
filename = 'lr_model.pkl'
# Use a context manager so the file handle is closed (the original leaked it).
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

# LAB 4 #
# Zadanie 1: reload the pickled model #
with open(filename, 'rb') as model_file:
    saved_model = pickle.load(model_file)
print(saved_model)
# Zadanie 2 #