Facebook
From Insensitive Owl, 3 Years ago, written in Plain Text.
Embed
Download Paste or View Raw
Hits: 53
  1. import pandas as pd
  2. import numpy as np
  3. from itertools import combinations
  4. from itertools import permutations
  5.  
  6. from prompt_toolkit.layout import Dimension
  7.  
  8. # Data
  9. # DODAC wpisywanie min_sup !!!!
  10. min_sup = 0.25
  11. custID = [1, 1, 2, 2, 2, 3, 4, 4, 4, 5]
  12. dateOrder = [1, 2, 1, 2, 3, 1, 1, 2, 3, 1]
  13. items = [['a'], ['a'], ['a'], ['b'], ['c', 'e'], ['a', 'e'], ['a'], ['c', 'd', 'e'], ['a'], ['a']]
  14. singleItems = ['a', 'b', 'c', 'd', 'e']
  15. d = {'custID': custID, 'dateOrder': dateOrder, 'items': items}
  16. dataDF = pd.DataFrame(d)
  17.  
  18. dataDF = dataDF.set_index(pd.Index(items))
  19. print(dataDF)
  20.  
  21. groupedData = dataDF.groupby('custID').groups
  22. print(groupedData)
  23.  
  24.  
  25. def isSubset(subset, set):
  26.     if len(subset) > len(set):
  27.         return False
  28.     counter = 0
  29.     for sub_item in subset:
  30.         if sub_item in set:
  31.             counter += 1
  32.  
  33.     return counter == len(subset)
  34.  
  35.  
  36. def SupportCounter(pattern):
  37.     support = 0
  38.     for key in groupedData:
  39.         shouldOccure = len(pattern)
  40.         occurance = 0
  41.         for item in pattern:
  42.             occurance += groupedData[key].count(item)
  43.         if (occurance == shouldOccure):
  44.             support = support + 1
  45.         occurance = 0
  46.     return support
  47.  
  48.  
  49. def FindFrequentSetInDataSet(itemToCheckIfFrequent):
  50.     inHowManyGroupsItemOccured = 0
  51.     itemCounter = 0
  52.     for group in groupedData:
  53.         itemCounterPerGroup = 0
  54.         for items in groupedData[group]:
  55.             if isSubset(itemToCheckIfFrequent, items):
  56.                 itemCounter += 1
  57.                 itemCounterPerGroup += 1
  58.         if (itemCounterPerGroup > 0):
  59.             inHowManyGroupsItemOccured += 1
  60.     return inHowManyGroupsItemOccured
  61.  
  62.  
  63. for group in groupedData:
  64.     groupedData[group] = pd.array(groupedData[group])
  65.  
  66. L1 = []
  67. itemTable = singleItems[:]
  68. all_frequent_sets = []
  69. DSPower = len(groupedData)
  70. for item in singleItems:
  71.     occurancesInAllEvents = FindFrequentSetInDataSet(item)
  72.     if (occurancesInAllEvents / DSPower >= min_sup):
  73.         L1.append(item)
  74.         all_frequent_sets.append([item])
  75.  
  76. print("L1:", all_frequent_sets)
  77. flag = True
  78. counter_length = 2
  79. frequent_set = []
  80. for i in range(0, len(items)):
  81.     dlugosc = len(items[i])
  82.     j = 0
  83.     while j < dlugosc:
  84.         if items[i][j] not in L1:
  85.             items[i].pop(j)
  86.             dlugosc -= 1
  87.         j += 1
  88.  
  89. d2 = {'custID': custID, 'items': items}
  90. dataDF2 = pd.DataFrame(d2)
  91. newDFArray = []
  92. for index, row in dataDF2.iterrows():
  93.     emptyArrayCheck = row["items"]
  94.     if len(emptyArrayCheck) > 0:
  95.         newDFArray.append(row)
  96. newDF = pd.DataFrame(newDFArray)
  97. print(newDF)
  98.  
  99. while flag:
  100.  
  101.     current_candidates = combinations(L1, counter_length)
  102.     current_candidates = np.array(list(current_candidates))
  103.     for item in current_candidates:
  104.         occurrences = FindFrequentSetInDataSet(item)
  105.         if occurrences / DSPower >= min_sup:
  106.             frequent_set.append(item)
  107.             all_frequent_sets.append(list(item))
  108.     print("L" + str(counter_length) + ":", np.array(list(frequent_set)))
  109.     if len(frequent_set) == 0 or len(L1) - 1 <= counter_length:
  110.         break
  111.     frequent_set = []
  112.     counter_length += 1
  113.  
  114. frequent_dict = dict()
  115. number = 1
  116. for i in all_frequent_sets:
  117.     d = {number: i}
  118.     frequent_dict.update(d)
  119.     number += 1
  120.  
  121. columns = {'sets': all_frequent_sets, 'values': range(1, len(all_frequent_sets) + 1)}
  122. df_frequent = pd.DataFrame(columns)
  123. print("Odwzorowanie:")
  124. print(frequent_dict)
  125.  
  126. # ODWZOROWANIE
  127. tmpTable = []
  128. # for item in newDF['items']:
  129.  
  130. new_list = [[x] for x in newDF['items']]
  131.  
  132. for item in new_list:
  133.     if len(item[0]) > 1:
  134.         tmp = item[:]
  135.         z = [[x] for x in tmp[0]]
  136.         y = []
  137.         print(len(z))
  138.         print(all_frequent_sets)
  139.         z.append(item[0])
  140.         for i in range(0, len(z)):
  141.             if (z[i] in all_frequent_sets):
  142.                 y.append(z[i])
  143.  
  144.         print(y)
  145.  
  146.         item[0] = y[:]
  147.         print(item[0])
  148.         # x = combinations(item[0])
  149. print(new_list)
  150. for i in range(0, len(new_list)):
  151.     if len(new_list[i][0]) > 1:
  152.         new_list[i] = new_list[i][0]
  153.  
  154. print(new_list)
  155. print(df_frequent)
  156.  
  157. # for item in new_list:
  158. for i in range(0, len(new_list)):
  159.     for j in range(0, len(new_list[i])):
  160.         iterator = 0
  161.         for mySet in df_frequent["sets"]:
  162.             tmpValue = df_frequent.at[iterator, "values"]
  163.             # print(tmpValue)
  164.             if new_list[i][j] == mySet:
  165.                 new_list[i][j] = tmpValue
  166.                 break
  167.             iterator += 1
  168. print(new_list)
  169. # df_frequent = df_frequent.set_index('sets')
  170. # print(list(frequent_dict.keys()[frequent_dict.values().index(['a'])]))
  171.  
  172. newDF["items"] = new_list
  173. print(newDF)
  174.  
  175. newDF = newDF.set_index(pd.Index(newDF["items"]))
  176. groupedData2 = newDF.groupby('custID').groups
  177. print(groupedData2)
  178.  
  179. allCand = []
  180. for i in range(1, len(df_frequent["values"]) + 1):
  181.     for j in range(1, len(df_frequent["values"]) + 1):
  182.         tmp = [i, j]
  183.         allCand.append(tmp)
  184. print(allCand)
  185.  
  186. # (1,1), konkretny klient transakcje (1: [[1], [1]],).
  187. # wez pierwsze elem z (1,1) spr czy zawiera se w pierwszej trazakji jak tak  occur +=1 CONTINUE , jak nie Continue
  188.  
  189. frequent_formatted = []
  190. for cand in allCand:
  191.     occurances = 0
  192.     for i in range(1, max(newDF["custID"]) + 1):
  193.         semi_occurance = 0
  194.         counter = 0
  195.         for tran in groupedData2[i]:
  196.             if cand[counter] in tran:
  197.                 semi_occurance += 1
  198.                 counter += 1
  199.                 if counter > 1:
  200.                     break
  201.         if semi_occurance > 1:
  202.             occurances += 1
  203.     support = occurances/max(newDF["custID"])
  204.     if support > min_sup:
  205.         frequent_set.append((cand , support))
  206.  
  207. for i in frequent_set:
  208.     print(i)