Facebook
From Sloppy Partdridge, 5 Years ago, written in Plain Text.
Embed
Download Paste or View Raw
Hits: 187
  1. import numpy as py
  2. import pandas as pd
  3. import io
  4. import os
  5. import requests
  6. import seaborn as sns
  7. from matplotlib import pyplot as plt
  8. import pickle
  9. from pandas.api.types import CategoricalDtype
  10. from sklearn.base import BaseEstimator, TransformerMixin
  11. from sklearn.pipeline import Pipeline
  12. import sklearn
  13. from sklearn.metrics import accuracy_score
  14. from sklearn.linear_model import LogisticRegression
  15. from sklearn.model_selection import GridSearchCV
  16. from sklearn.metrics import classification_report
  17. from sklearn.metrics import confusion_matrix
  18. from sklearn.feature_extraction import DictVectorizer
  19. from sklearn.preprocessing import StandardScaler
  20. from sklearn.preprocessing import OneHotEncoder
  21. from sklearn.pipeline import FeatureUnion
  22. from sklearn.model_selection import cross_val_score
  23.  
  24. urls = ['http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',
  25.         'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names',
  26.         'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test']
  27.  
  28.  
  29. def load_dataset(path, urls):
  30.     if not os.path.exists(path):
  31.         os.mkdir(path)
  32.     for url in urls:
  33.         data = requests.get(url).content
  34.         filename = os.path.join(path, os.path.basename(url))
  35.         with open(filename, "wb") as file:
  36.             file.write(data)
  37.  
  38.  
  39. ##load_dataset('dane', urls)##
  40.  
  41. # Zadanie 1 #0
  42. columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num',
  43.            'marital-status', 'occupation', 'relationship', 'race',
  44.            'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']
  45.  
  46. adultData = pd.read_csv('dane/adult.data', skipinitialspace=True, names=columns, na_values="?")
  47. adultTest = pd.read_csv('dane/adult.test', skipinitialspace=True, names=columns, skiprows=1, na_values="?")
  48.  
  49. DFdata = pd.DataFrame(adultData)
  50. DFtest = pd.DataFrame(adultTest)
  51.  
  52. DFtest.income = DFtest.income.map(lambda x: str(x)[:-1])
  53.  
  54. print('Pierwsze 5 wierszy adult.data:')
  55. print(DFdata.head(5))
  56. print('Ostatnie 5 wierszy adult.data:')
  57. print(DFdata.tail(5))
  58. print('Pierwsze 5 wierszy adult.test:')
  59. print(DFtest.head(5))
  60. print('Ostatnie 5 wierszy adult.test:')
  61. print(DFtest.tail(5))
  62.  
  63. # Zadanie 2 #
  64.  
# Dataset overviews: info() prints dtypes and non-null counts per column.
DFdata.info()
## Samples in adult.data = 32561 ##

DFtest.info()
## Samples in adult.test = 16281 ##

## Columns with numeric data = fnlwgt, education-num, capital-gain, capital-loss, hours-per-week ##
## Columns with missing values: workclass, occupation, native-country ##
  73.  
  74. # Zadanie 3 #
  75.  
  76. print(DFdata.select_dtypes(include=['int64']).describe())
  77. print(DFtest.select_dtypes(include=['int64']).describe())
  78.  
  79. DFdata.hist()
  80. # plt.show()
  81. DFtest.hist()
  82. # plt.show()
  83.  
  84. # Zadanie 4 #
  85.  
  86. for x in list(DFdata.select_dtypes(exclude=['int64'])):
  87.     ax = sns.countplot(y=x, hue="income", data=DFdata.select_dtypes(exclude=['int64']))
  88.     # plt.show()
  89.  
  90. for x in list(DFtest.select_dtypes(exclude=['int64'])):
  91.     ax = sns.countplot(y=x, hue="income", data=DFtest.select_dtypes(exclude=['int64']))
  92.  
  93.  
  94. #     plt.show()
  95.  
  96. # Do uzupełnienia brakujących danych można użyć funkcji fillna() #
  97.  
  98.  
  99. # Zadanie 5 #
  100.  
  101. class ColumnsSelector(TransformerMixin, BaseEstimator):
  102.     def __init__(self, datatype):
  103.         self.datatype = 'int64'
  104.  
  105.     def transform(self, X):
  106.         X_COPY = X.copy()
  107.         return X_COPY.select_dtypes(include=[self.datatype])
  108.  
  109.     def fit(self, *_):
  110.         return self
  111.  
  112.  
  113. columnsSelector = ColumnsSelector('int64')
  114. print(columnsSelector.transform(DFdata).head())
  115.  
  116. # Zadanie 6 #
  117. scaler= StandardScaler()
  118. pipelineInt = Pipeline([('cloumnsType', ColumnsSelector('int64')), ('scaler', scaler)])
  119. # Zadanie 7 #
  120.  
  121. class MostFrequentImputer(TransformerMixin, BaseEstimator):
  122.     def __init__(self, columns=None):
  123.         self.columns = columns
  124.  
  125.     def fit(self, X, y=None):
  126.         if self.columns is None:
  127.             self.columns = X.columns
  128.         self.fill = {column: X[column].value_counts().index[0]
  129.                      for column in self.columns}
  130.         return self
  131.  
  132.     def transform(self, X):
  133.         X_copy = X.copy()
  134.         for column in self.columns:
  135.             X_copy[column] = X_copy[column].fillna(self.fill[column])
  136.         return X_copy
  137.  
  138. class PandasToDict(TransformerMixin, BaseEstimator):
  139.         def fit (self,y=None):
  140.             return self
  141.         def transform(self,X):
  142.             return X.to_dict(orient='records')
  143.  
  144.  
  145. pipelineStr = Pipeline([
  146.     ("columnSelector", ColumnsSelector('object')),
  147.     ("mostFrequentImporter", MostFrequentImputer()),
  148.     ("pandasToDict", PandasToDict()),
  149.     ("dictVectorizer", DictVectorizer())
  150. ])
  151.  
  152.  
  153. # Zadanie 8 #
  154.  
  155. union = FeatureUnion([("pipelineStr", pipelineStr), ("pipelineInt", pipelineInt)])
  156.  
  157. # Zadanie 9 #
  158.  
  159.  
  160. adultData = adultData.fillna(method="backfill")
  161. macierz = adultData.drop(["fnlwgt", "education", "income"], axis=1)
  162.  
  163. macierz = union.fit_transform(macierz)
  164.  
  165.  
  166. model = LogisticRegression().fit(macierz, adultData.income)
  167. filename = 'lr_model.pkl'
  168. pickle.dump(model, open(filename, 'wb'))
  169.  
  170.  
  171.  
  172. # LAB 4 #
  173.  
  174. # Zadanie 1 #
  175.  
  176. saved_model = pickle.load(open(filename, 'rb'))
  177. print(saved_model)
  178.  
  179. # Zadanie 2 #
  180.  
  181.