Facebook
From Scribby Tamarin, 4 Years ago, written in Plain Text.
Embed
Download Paste or View Raw
Hits: 52
  1. import pandas as pd
  2. import numpy as np
  3. from itertools import combinations
  4. from itertools import permutations
  5.  
  6. from prompt_toolkit.layout import Dimension
  7.  
# Data
# TODO: read min_sup from user input instead of hard-coding it
# Minimum support threshold; the script compares it against
# (number of matching customer groups) / (number of customer groups).
min_sup = 0.25
# custID, dateOrder and items are parallel lists: position k of each list
# becomes one row of the DataFrame built below.
custID = [1, 1, 2, 2, 2, 3, 4, 4, 4, 5]
dateOrder = [1, 2, 1, 2, 3, 1, 1, 2, 3, 1]
items = [['a'], ['a'], ['a'], ['b'], ['c', 'e'], ['a', 'e'], ['a'], ['c', 'd', 'e'], ['a'], ['a']]
# Individual items that are tested for frequency first.
singleItems = ['a', 'b', 'c', 'd', 'e']
d = {'custID': custID, 'dateOrder': dateOrder, 'items': items}
dataDF = pd.DataFrame(d)

print(dataDF)
# Use the item lists themselves as row labels, so that the group labels
# obtained below are the item lists of each customer.
dataDF = dataDF.set_index(pd.Index(items))

# Mapping custID -> row labels of that customer's rows.
groupedData = dataDF.groupby('custID').groups
  22.  
  23. def isSubset(subset, set):
  24.     if len(subset) > len(set):
  25.         return False
  26.     counter = 0
  27.     for sub_item in subset:
  28.         if sub_item in set:
  29.             counter += 1
  30.  
  31.     return counter == len(subset)
  32.  
  33.  
  34. def SupportCounter(pattern):
  35.     support = 0
  36.     for key in groupedData:
  37.         shouldOccure = len(pattern)
  38.         occurance = 0
  39.         for item in pattern:
  40.             occurance += groupedData[key].count(item)
  41.         if (occurance == shouldOccure):
  42.             support = support + 1
  43.         occurance = 0
  44.     return support
  45.  
  46.  
  47. def FindFrequentSetInDataSet(itemToCheckIfFrequent):
  48.     inHowManyGroupsItemOccured = 0
  49.     itemCounter = 0
  50.     for group in groupedData:
  51.         itemCounterPerGroup = 0
  52.         for items in groupedData[group]:
  53.             if isSubset(itemToCheckIfFrequent, items):
  54.                 itemCounter += 1
  55.                 itemCounterPerGroup += 1
  56.         if (itemCounterPerGroup > 0):
  57.             inHowManyGroupsItemOccured += 1
  58.     return inHowManyGroupsItemOccured
  59.  
  60.  
  61. for group in groupedData:
  62.     groupedData[group] = pd.array(groupedData[group])
  63.  
  64. L1 = []
  65. itemTable = singleItems[:]
  66. all_frequent_sets = []
  67. DSPower = len(groupedData)
  68. for item in singleItems:
  69.     occurancesInAllEvents = FindFrequentSetInDataSet(item)
  70.     if (occurancesInAllEvents / DSPower >= min_sup):
  71.         L1.append(item)
  72.         all_frequent_sets.append([item])
  73.  
  74. print("L1:", all_frequent_sets)
  75. flag = True
  76. counter_length = 2
  77. frequent_set = []
  78. for i in range(0, len(items)):
  79.     dlugosc = len(items[i])
  80.     j = 0
  81.     while j < dlugosc:
  82.         if items[i][j] not in L1:
  83.             items[i].pop(j)
  84.             dlugosc -= 1
  85.         j += 1
  86.  
  87. d2 = {'custID': custID, 'items': items}
  88. dataDF2 = pd.DataFrame(d2)
  89. newDFArray = []
  90. for index, row in dataDF2.iterrows():
  91.     emptyArrayCheck = row["items"]
  92.     if len(emptyArrayCheck) > 0:
  93.         newDFArray.append(row)
  94. newDF = pd.DataFrame(newDFArray)
  95.  
  96. while flag:
  97.  
  98.     current_candidates = combinations(L1, counter_length)
  99.     current_candidates = np.array(list(current_candidates))
  100.     for item in current_candidates:
  101.         occurrences = FindFrequentSetInDataSet(item)
  102.         if occurrences / DSPower >= min_sup:
  103.             frequent_set.append(item)
  104.             all_frequent_sets.append(list(item))
  105.     print("L" + str(counter_length) + ":", np.array(list(frequent_set)))
  106.     if len(frequent_set) == 0 or len(L1) - 1 <= counter_length:
  107.         break
  108.     frequent_set = []
  109.     counter_length += 1
  110.  
  111. frequent_dict = dict()
  112. number = 1
  113. for i in all_frequent_sets:
  114.     d = {number: i}
  115.     frequent_dict.update(d)
  116.     number += 1
  117.  
  118. columns = {'sets': all_frequent_sets, 'values': range(1, len(all_frequent_sets) + 1)}
  119. df_frequent = pd.DataFrame(columns)
  120. print("Odwzorowanie:")
  121. print(df_frequent)
  122.  
tmpTable = []
# Wrap every pruned transaction in a one-element list so that its content
# can be replaced in place below.
new_list = [[x] for x in newDF['items']]

# Transformation step: for every transaction with more than one item, keep
# those candidates that are known frequent sets.
for item in new_list:
    if len(item[0]) > 1:
        tmp = item[:]
        # Candidates are each single item (as a one-element list) ...
        z = [[x] for x in tmp[0]]
        y = []
        # ... plus the whole transaction itself.
        z.append(item[0])
        for i in range(0, len(z)):
            if (z[i] in all_frequent_sets):
                y.append(z[i])

        item[0] = y[:]

# Unwrap entries whose inner list has more than one element, so that the
# entry becomes the list of frequent sets collected above.
for i in range(0, len(new_list)):
    if len(new_list[i][0]) > 1:
        new_list[i] = new_list[i][0]


# Mapping step: replace every frequent set by its integer id from df_frequent.
for i in range(0, len(new_list)):
    for j in range(0, len(new_list[i])):
        # Row position in df_frequent that matches the current mySet.
        iterator = 0
        for mySet in df_frequent["sets"]:
            tmpValue = df_frequent.at[iterator, "values"]
            if new_list[i][j] == mySet:
                new_list[i][j] = tmpValue
                break
            iterator += 1

newDF["items"] = new_list
print("Po Transfromacji i odwzorowaniu:")
print(newDF)
  156.  
  157. newDF = newDF.set_index(pd.Index(newDF["items"]))
  158. groupedData2 = newDF.groupby('custID').groups
  159.  
  160. allCand = []
  161. for i in range(1, len(df_frequent["values"]) + 1):
  162.     for j in range(1, len(df_frequent["values"]) + 1):
  163.         tmp = [i, j]
  164.         allCand.append(tmp)
  165.  
  166.  
  167. frequent_formatted = []
  168. for cand in allCand:
  169.     occurances = 0
  170.     for i in range(1, max(newDF["custID"]) + 1):
  171.         semi_occurance = 0
  172.         counter = 0
  173.         for tran in groupedData2[i]:
  174.             if cand[counter] in tran:
  175.                 semi_occurance += 1
  176.                 counter += 1
  177.                 if counter > 1:
  178.                     break
  179.         if semi_occurance > 1:
  180.             occurances += 1
  181.     support = occurances/max(newDF["custID"])
  182.     if support > min_sup:
  183.         frequent_formatted.append((cand , support))
  184.  
  185. for i in frequent_formatted:
  186.     print(i)