- import pandas as pd
- import numpy as np
- from itertools import combinations
- from itertools import permutations
- from prompt_toolkit.layout import Dimension
# Data
# TODO: let the user enter min_sup instead of hard-coding it.
min_sup = 0.25
custID = [1, 1, 2, 2, 2, 3, 4, 4, 4, 5]
dateOrder = [1, 2, 1, 2, 3, 1, 1, 2, 3, 1]
items = [
    ['a'],
    ['a'],
    ['a'],
    ['b'],
    ['c', 'e'],
    ['a', 'e'],
    ['a'],
    ['c', 'd', 'e'],
    ['a'],
    ['a'],
]
singleItems = ['a', 'b', 'c', 'd', 'e']

# One row per transaction; the row label is the transaction's item list, so
# grouping by customer yields, for every customer id, the item lists of that
# customer's transactions.
d = dict(custID=custID, dateOrder=dateOrder, items=items)
dataDF = pd.DataFrame(d).set_index(pd.Index(items))
print(dataDF)

groupedData = dataDF.groupby('custID').groups
print(groupedData)
def isSubset(subset, set):
    """Report whether every element of ``subset`` is contained in ``set``.

    Both arguments only need to support ``len()``; ``subset`` is iterated and
    ``set`` is probed with ``in``. A ``subset`` longer than ``set`` is
    rejected without any membership test. Repeated elements of ``subset`` are
    each checked on their own, so ``['a', 'a']`` is accepted for
    ``['a', 'b']``.

    The second parameter hides the builtin ``set`` inside this function; the
    name is kept because callers may pass it by keyword.

    Returns True or False.
    """
    wanted = len(subset)
    if wanted > len(set):
        return False
    # Count the hits instead of stopping at the first miss, so every element
    # is tested exactly once.
    return sum(1 for element in subset if element in set) == wanted
def SupportCounter(pattern):
    """Count the groups of ``groupedData`` that match ``pattern``.

    For every group the ``.count()`` results of all elements of ``pattern``
    are added up; the group is counted when that total equals
    ``len(pattern)``.

    NOTE(review): this calls ``.count(item)`` on each group's container, so
    it presumably expects list-like group values -- confirm against how
    ``groupedData`` is built before relying on it.

    Returns the number of matching groups as an int.
    """
    matching_groups = 0
    for key in groupedData:
        entries = groupedData[key]
        expected_hits = len(pattern)
        hits = sum(entries.count(element) for element in pattern)
        if hits == expected_hits:
            matching_groups += 1
    return matching_groups
def FindFrequentSetInDataSet(itemToCheckIfFrequent):
    """Count the groups of ``groupedData`` in which the candidate occurs.

    A group is counted once when at least one of its entries contains every
    element of ``itemToCheckIfFrequent`` according to ``isSubset``; additional
    matching entries in the same group do not raise the result further.

    Returns the number of such groups as an int.
    """
    groups_with_match = 0
    for group in groupedData:
        matching_entries = [
            entry
            for entry in groupedData[group]
            if isSubset(itemToCheckIfFrequent, entry)
        ]
        if matching_entries:
            groups_with_match += 1
    return groups_with_match
# Replace the labels stored for each group with a pandas array of the same
# values; the keys of groupedData are left untouched.
for group, members in list(groupedData.items()):
    groupedData[group] = pd.array(members)

# L1 holds the single items whose share of supporting groups reaches min_sup;
# all_frequent_sets starts out with the same items wrapped as one-element sets.
itemTable = singleItems[:]
DSPower = len(groupedData)
L1 = [
    item
    for item in singleItems
    if FindFrequentSetInDataSet(item) / DSPower >= min_sup
]
all_frequent_sets = [[item] for item in L1]
print("L1:", all_frequent_sets)
# State for the level-wise itemset search that follows.
flag = True
counter_length = 2
frequent_set = []

# Remove from every transaction the items that are not in L1.
# Each list is rewritten through slice assignment, so the very same list
# objects are modified and anything else holding a reference to them sees the
# pruned contents. A filtered copy is used instead of popping during an index
# scan: pop() shifts the remaining elements left, and an index that keeps
# advancing then skips the element right after each removal, which lets the
# second of two adjacent infrequent items survive.
for transaction in items:
    transaction[:] = [element for element in transaction if element in L1]
# Rebuild the transaction table from the current item lists and keep only the
# rows whose item list is not empty.
d2 = dict(custID=custID, items=items)
dataDF2 = pd.DataFrame(d2)
newDFArray = [row for _, row in dataDF2.iterrows() if len(row["items"]) > 0]
newDF = pd.DataFrame(newDFArray)
print(newDF)
# Level-wise search for frequent itemsets of size 2, 3, ...
# At each level every combination of counter_length items from L1 is a
# candidate; a candidate is kept when the share of groups containing it
# reaches min_sup. Kept candidates are collected per level in frequent_set and
# cumulatively (as plain lists) in all_frequent_sets.
while flag:
    current_candidates = np.array(list(combinations(L1, counter_length)))
    for item in current_candidates:
        occurrences = FindFrequentSetInDataSet(item)
        if occurrences / DSPower >= min_sup:
            frequent_set.append(item)
            all_frequent_sets.append(list(item))
    print("L" + str(counter_length) + ":", np.array(list(frequent_set)))
    # Stop when this level produced nothing, or when the candidates already
    # used every item of L1 (no larger combination exists). Comparing against
    # len(L1) rather than len(L1) - 1 makes sure the single itemset made of
    # all L1 items is evaluated as well.
    if not frequent_set or len(L1) <= counter_length:
        break
    frequent_set = []
    counter_length += 1
# Number the frequent itemsets 1, 2, 3, ... in discovery order, once as a
# dict (id -> itemset) and once as a two-column table (itemset, id).
frequent_dict = dict(enumerate(all_frequent_sets, start=1))
columns = {
    'sets': all_frequent_sets,
    'values': range(1, len(all_frequent_sets) + 1),
}
df_frequent = pd.DataFrame(columns)
# The printed heading is Polish for "Mapping:".
print("Odwzorowanie:")
print(frequent_dict)
# MAPPING
# Turn every transaction into the list of frequent itemsets it contains.
# Only two kinds of candidates are looked at per transaction: each of its
# items on its own, and the transaction as a whole.
tmpTable = []
new_list = [[transaction] for transaction in newDF['items']]
for wrapped in new_list:
    transaction = wrapped[0]
    if len(transaction) <= 1:
        # Single-item transactions stay wrapped as they are.
        continue
    candidates = [[element] for element in transaction]
    print(len(candidates))
    print(all_frequent_sets)
    candidates.append(transaction)
    kept = [candidate for candidate in candidates if candidate in all_frequent_sets]
    print(kept)
    wrapped[0] = list(kept)
    print(wrapped[0])
print(new_list)

# Unwrap the entries that were expanded above so that every entry of new_list
# is a flat list of itemsets.
for position, wrapped in enumerate(new_list):
    if len(wrapped[0]) > 1:
        new_list[position] = wrapped[0]
print(new_list)
print(df_frequent)
# Replace every itemset in new_list by its id from df_frequent: the first row
# whose 'sets' entry equals the itemset supplies the 'values' entry. Itemsets
# without a matching row are left unchanged.
for encoded_row in new_list:
    for slot, itemset in enumerate(encoded_row):
        for position, mySet in enumerate(df_frequent["sets"]):
            if itemset == mySet:
                encoded_row[slot] = df_frequent.at[position, "values"]
                break
print(new_list)

# Store the encoded transactions, label each row with them and group the
# labels per customer.
newDF["items"] = new_list
print(newDF)
newDF = newDF.set_index(pd.Index(newDF["items"]))
groupedData2 = newDF.groupby('custID').groups
print(groupedData2)
# Every ordered pair [first, second] of itemset ids, repeats included, in
# row-major order: [1, 1], [1, 2], ..., [n, n].
set_ids = range(1, len(df_frequent["values"]) + 1)
allCand = [[first, second] for first in set_ids for second in set_ids]
print(allCand)
# Support counting for the candidate pairs.
# Example: candidate (1, 1) against one customer's transactions
# (1: [[1], [1]]). Take the first element of the candidate and check whether
# the current transaction contains it; if it does, move on to the second
# element, and in either case continue with the customer's next transaction.
# A customer supports the candidate when both elements were found this way,
# i.e. in two different transactions in this order.
#
# Customer ids are walked as 1..max(custID), and that maximum is also the
# denominator of the support -- presumably ids are contiguous and start at 1;
# verify against the input data.
frequent_formatted = []
last_customer_id = max(newDF["custID"])
for cand in allCand:
    occurances = 0
    for i in range(1, last_customer_id + 1):
        counter = 0
        # An id with no remaining transactions cannot support anything, so it
        # is treated as an empty sequence instead of raising KeyError.
        for tran in groupedData2.get(i, ()):
            if cand[counter] in tran:
                counter += 1
                if counter > 1:
                    break
        if counter > 1:
            occurances += 1
    support = occurances / last_customer_id
    # Same inclusive threshold as the itemset phases (support >= min_sup).
    if support >= min_sup:
        # Results go to their own list; frequent_set still holds the itemsets
        # of the last itemset level and must not be mixed with them.
        frequent_formatted.append((cand, support))
for i in frequent_formatted:
    print(i)