- import numpy as py
- import pandas as pd
- import io
- import os
- import requests
- import seaborn as sns
- from matplotlib import pyplot as plt
- import pickle
- from pandas.api.types import CategoricalDtype
- from sklearn.base import BaseEstimator, TransformerMixin
- from sklearn.pipeline import Pipeline
- import sklearn
- from sklearn.metrics import accuracy_score
- from sklearn.linear_model import LogisticRegression
- from sklearn.model_selection import GridSearchCV
- from sklearn.metrics import classification_report
- from sklearn.metrics import confusion_matrix
- from sklearn.feature_extraction import DictVectorizer
- from sklearn.preprocessing import StandardScaler
- from sklearn.preprocessing import OneHotEncoder
- from sklearn.pipeline import FeatureUnion
- from sklearn.model_selection import cross_val_score
# Source URLs for the UCI "Adult" census dataset:
# training split, column/schema description, and test split.
urls = [
    'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',
    'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names',
    'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test',
]
def load_dataset(path, urls):
    """Download every file in *urls* into directory *path*.

    Mirrors the original behavior: if *path* already exists, nothing is
    downloaded at all — files are only fetched on the first run.

    Parameters
    ----------
    path : str
        Target directory for the downloaded files.
    urls : iterable of str
        URLs to fetch; each is saved under its basename inside *path*.
    """
    if not os.path.exists(path):
        # makedirs (vs. the original mkdir) also creates missing parent
        # directories, so a nested path like 'data/raw' works too.
        os.makedirs(path)
        for url in urls:
            response = requests.get(url)
            # Fail loudly on HTTP errors instead of silently writing an
            # error page to disk (the original saved whatever came back).
            response.raise_for_status()
            filename = os.path.join(path, os.path.basename(url))
            with open(filename, "wb") as file:
                file.write(response.content)
## load_dataset('dane', urls)  # one-time download into ./dane ("dane" = "data" in Polish)
# Task 1 #
# Column names taken from adult.names; the raw files ship without a header row.
columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num',
           'marital-status', 'occupation', 'relationship', 'race',
           'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']
# skipinitialspace: values are comma-space separated; na_values: missing data is encoded as "?"
adultData = pd.read_csv('dane/adult.data', skipinitialspace=True, names=columns, na_values="?")
# adult.test has one extra banner line at the top, hence skiprows=1.
adultTest = pd.read_csv('dane/adult.test', skipinitialspace=True, names=columns, skiprows=1, na_values="?")
DFdata = pd.DataFrame(adultData)
DFtest = pd.DataFrame(adultTest)
# In adult.test the labels carry a trailing period ("<=50K." / ">50K.");
# strip the last character so they match the adult.data labels.
DFtest.income = DFtest.income.map(lambda x: str(x)[:-1])
print('Pierwsze 5 wierszy adult.data:')
print(DFdata.head(5))
print('Ostatnie 5 wierszy adult.data:')
print(DFdata.tail(5))
print('Pierwsze 5 wierszy adult.test:')
print(DFtest.head(5))
print('Ostatnie 5 wierszy adult.test:')
print(DFtest.tail(5))
# Task 2 #
DFdata.info()
## Samples in adult.data = 32561 ##
DFtest.info()
## Samples in adult.test = 16281 ##
## Columns with numeric data: fnlwgt, education-num, capital-gain, capital-loss, hours-per-week ##
## Columns with missing values: workclass, occupation, native-country ##
# Task 3 #
# Summary statistics and histograms for the integer-typed columns only.
print(DFdata.select_dtypes(include=['int64']).describe())
print(DFtest.select_dtypes(include=['int64']).describe())
DFdata.hist()
# plt.show()
DFtest.hist()
# plt.show()
# Task 4 #
# One count plot per categorical column, split by the income label.
for x in list(DFdata.select_dtypes(exclude=['int64'])):
    ax = sns.countplot(y=x, hue="income", data=DFdata.select_dtypes(exclude=['int64']))
    # plt.show()
for x in list(DFtest.select_dtypes(exclude=['int64'])):
    ax = sns.countplot(y=x, hue="income", data=DFtest.select_dtypes(exclude=['int64']))
    # plt.show()
# Missing values can be filled in with fillna() #
# Task 5 #
class ColumnsSelector(TransformerMixin, BaseEstimator):
    """Transformer that keeps only the columns of a single pandas dtype.

    Parameters
    ----------
    datatype : str
        pandas/numpy dtype name to select, e.g. 'int64' or 'object'.
    """

    def __init__(self, datatype):
        # BUG FIX: the original hard-coded self.datatype = 'int64' here,
        # silently ignoring the constructor argument — so
        # ColumnsSelector('object') (used by the categorical pipeline)
        # actually selected the integer columns.
        self.datatype = datatype

    def transform(self, X):
        # Work on a copy so the caller's DataFrame is never mutated.
        X_COPY = X.copy()
        return X_COPY.select_dtypes(include=[self.datatype])

    def fit(self, *_):
        # Stateless transformer: nothing to learn from the data.
        return self
# Smoke test: the selector should return only the integer columns.
columnsSelector = ColumnsSelector('int64')
print(columnsSelector.transform(DFdata).head())
# Task 6 #
# Numeric branch of the preprocessing: pick int64 columns, then standardize.
# NOTE(review): the step name 'cloumnsType' is misspelled; kept as-is because
# it is a runtime identifier (changing it would break set_params/get_params keys).
scaler= StandardScaler()
pipelineInt = Pipeline([('cloumnsType', ColumnsSelector('int64')), ('scaler', scaler)])
# Task 7 #
class MostFrequentImputer(TransformerMixin, BaseEstimator):
    """Impute missing values with each column's most frequent value (mode).

    Parameters
    ----------
    columns : list-like of column labels, optional
        Columns to impute. Defaults to every column of the frame given to fit().
    """

    def __init__(self, columns=None):
        self.columns = columns

    def fit(self, X, y=None):
        # FIX: the original overwrote self.columns inside fit(), mutating an
        # __init__ parameter — this breaks sklearn's clone() contract and makes
        # a columns=None instance remember the first frame's columns forever.
        columns = X.columns if self.columns is None else self.columns
        # value_counts() sorts by frequency (descending), so index[0] is the mode.
        self.fill = {column: X[column].value_counts().index[0]
                     for column in columns}
        return self

    def transform(self, X):
        # Fill on a copy; the fitted per-column modes drive the imputation.
        X_copy = X.copy()
        for column, value in self.fill.items():
            X_copy[column] = X_copy[column].fillna(value)
        return X_copy
class PandasToDict(TransformerMixin, BaseEstimator):
    """Convert a DataFrame into a list of per-row dicts (DictVectorizer input)."""

    def fit(self, X=None, y=None):
        # BUG FIX: the original signature was fit(self, y=None) — it lacked the
        # X parameter, so Pipeline.fit(X, y) with a non-None target raised a
        # TypeError (and with y=None the data was silently bound to y).
        return self

    def transform(self, X):
        # orient='records' yields one {column: value} dict per row.
        return X.to_dict(orient='records')
# Categorical branch: select object-dtype columns, fill NAs with each
# column's mode, convert rows to dicts, then one-hot encode via DictVectorizer.
pipelineStr = Pipeline([
    ("columnSelector", ColumnsSelector('object')),
    ("mostFrequentImporter", MostFrequentImputer()),  # NOTE(review): step name says "Importer" (typo); kept — it's a runtime key
    ("pandasToDict", PandasToDict()),
    ("dictVectorizer", DictVectorizer())
])
# Task 8 #
# Full preprocessing: concatenate categorical and numeric features side by side.
union = FeatureUnion([("pipelineStr", pipelineStr), ("pipelineInt", pipelineInt)])
# Task 9 #
# Fill remaining NAs by back-filling from the next row.
# NOTE(review): fillna(method="backfill") is deprecated in recent pandas;
# the modern spelling is adultData.bfill() — kept as-is here.
adultData = adultData.fillna(method="backfill")
# Drop fnlwgt (sampling weight), education (duplicated by education-num),
# and the target itself before building the feature matrix ("macierz").
macierz = adultData.drop(["fnlwgt", "education", "income"], axis=1)
macierz = union.fit_transform(macierz)
model = LogisticRegression().fit(macierz, adultData.income)
# Persist the trained model to disk.
filename = 'lr_model.pkl'
pickle.dump(model, open(filename, 'wb'))
# LAB 4 #
# Task 1 #
# SECURITY NOTE: pickle.load executes arbitrary code from the file —
# only ever load pickles you created yourself / fully trust.
saved_model = pickle.load(open(filename, 'rb'))
print(saved_model)
# Task 2 #