"""Mine frequent itemsets and frequent 2-sequences from a small order table.

The script runs four stages on the hard-coded sample data below:

1. Group the transactions (item lists) by customer.
2. Find the frequent itemsets level by level: an itemset is frequent when the
   fraction of customers having at least one transaction that contains it is
   at least ``min_sup``.
3. Re-encode every transaction as the list of numeric codes of the frequent
   itemsets it contains (single items plus the whole transaction).
4. Test every ordered pair of codes as a 2-sequence and report the pairs
   whose customer support is at least ``min_sup``.

Intermediate results are printed after each stage.
"""
from itertools import combinations
from itertools import permutations

import numpy as np
import pandas as pd

try:
    from prompt_toolkit.layout import Dimension
except ImportError:
    # Nothing below references Dimension, so a missing prompt_toolkit
    # installation must not prevent the script from running.
    Dimension = None

# Data
# TODO: let the user enter min_sup instead of hard-coding it.
min_sup = 0.25
custID = [1, 1, 2, 2, 2, 3, 4, 4, 4, 5]
# NOTE(review): dateOrder is stored in the frame but never used for sorting;
# the sequence stage relies on the rows already being listed in chronological
# order for each customer.
dateOrder = [1, 2, 1, 2, 3, 1, 1, 2, 3, 1]
items = [['a'], ['a'], ['a'], ['b'], ['c', 'e'], ['a', 'e'], ['a'],
         ['c', 'd', 'e'], ['a'], ['a']]
singleItems = ['a', 'b', 'c', 'd', 'e']

d = {'custID': custID, 'dateOrder': dateOrder, 'items': items}
dataDF = pd.DataFrame(d)
# The item lists are used as row labels so that ``groupby(...).groups`` (which
# maps each group key to the row labels of that group) hands back the
# transactions of every customer.
dataDF = dataDF.set_index(pd.Index(items))
print(dataDF)
groupedData = dataDF.groupby('custID').groups
print(groupedData)


def isSubset(subset, set):
    """Return True when every element of ``subset`` occurs in ``set``.

    A ``subset`` longer than ``set`` is rejected up front, so repeated
    elements cannot make a longer collection pass. Both arguments must be
    collections of items; a bare string is iterated character by character.
    """
    if len(subset) > len(set):
        return False
    return all(element in set for element in subset)


def SupportCounter(pattern):
    """Count the customers whose transactions match ``pattern`` exactly.

    For every customer the number of transactions equal to each element of
    ``pattern`` is summed; the customer is counted when that sum equals
    ``len(pattern)``. Not called anywhere in this script.
    """
    support = 0
    for key in groupedData:
        transactions = list(groupedData[key])
        occurance = sum(transactions.count(item) for item in pattern)
        if occurance == len(pattern):
            support += 1
    return support


def FindFrequentSetInDataSet(itemToCheckIfFrequent):
    """Return the number of customers that bought the given itemset.

    ``itemToCheckIfFrequent`` is a collection of items. A customer is counted
    once when at least one of their transactions contains all of the items.
    """
    return sum(
        1
        for transactions in groupedData.values()
        if any(isSubset(itemToCheckIfFrequent, transaction)
               for transaction in transactions)
    )


def _encode_transaction(transaction):
    """Return the codes of the frequent itemsets found in ``transaction``.

    The itemsets considered are each single item and, for a transaction with
    more than one item, the transaction as a whole. The code of an itemset is
    its 1-based position in ``all_frequent_sets``.
    """
    candidates = [[item] for item in transaction]
    if len(transaction) > 1:
        candidates.append(list(transaction))
    return [
        all_frequent_sets.index(candidate) + 1
        for candidate in candidates
        if candidate in all_frequent_sets
    ]


def _contains_sequence(transactions, cand):
    """Return True when the codes of ``cand`` occur in order in ``transactions``.

    Each code has to be found in a different transaction: once the first code
    is matched, the search for the next code continues with the following
    transaction.
    """
    position = 0
    for tran in transactions:
        if cand[position] in tran:
            position += 1
            if position == len(cand):
                return True
    return False


# Plain lists per customer: cheap to iterate and support ``list.count``.
for group in groupedData:
    groupedData[group] = list(groupedData[group])

# --- Frequent 1-itemsets ----------------------------------------------------
L1 = []
all_frequent_sets = []
DSPower = len(groupedData)  # number of customers
for item in singleItems:
    # Wrap the item in a list: a bare string would be compared character by
    # character and fail for item names longer than one character.
    occurancesInAllEvents = FindFrequentSetInDataSet([item])
    if occurancesInAllEvents / DSPower >= min_sup:
        L1.append(item)
        all_frequent_sets.append([item])
print("L1:", all_frequent_sets)

# Drop infrequent items from every transaction. Slice assignment keeps the
# list objects themselves, so every structure that references them sees the
# filtered content.
for transaction in items:
    transaction[:] = [item for item in transaction if item in L1]

d2 = {'custID': custID, 'items': items}
dataDF2 = pd.DataFrame(d2)
# Keep only the transactions that still contain at least one item.
newDF = dataDF2[dataDF2["items"].map(len) > 0].copy()
print(newDF)

# --- Frequent k-itemsets, k >= 2 --------------------------------------------
counter_length = 2
while True:
    frequent_set = []
    for candidate in combinations(L1, counter_length):
        occurrences = FindFrequentSetInDataSet(candidate)
        if occurrences / DSPower >= min_sup:
            frequent_set.append(candidate)
            all_frequent_sets.append(list(candidate))
    print("L" + str(counter_length) + ":", np.array(frequent_set))
    # Stop when a level is empty or the largest possible itemset was tested.
    if not frequent_set or counter_length >= len(L1):
        break
    counter_length += 1

# --- Mapping: frequent itemset <-> numeric code -----------------------------
frequent_dict = dict(enumerate(all_frequent_sets, start=1))
columns = {'sets': all_frequent_sets,
           'values': range(1, len(all_frequent_sets) + 1)}
df_frequent = pd.DataFrame(columns)
print("Odwzorowanie:")
print(frequent_dict)

new_list = [_encode_transaction(transaction) for transaction in newDF['items']]
print(new_list)
print(df_frequent)

newDF["items"] = new_list
print(newDF)
# Same labelling trick as above, now with the encoded transactions.
newDF = newDF.set_index(pd.Index(newDF["items"]))
groupedData2 = newDF.groupby('custID').groups
print(groupedData2)

# --- Frequent 2-sequences ---------------------------------------------------
code_count = len(df_frequent["values"])
allCand = [[i, j]
           for i in range(1, code_count + 1)
           for j in range(1, code_count + 1)]
print(allCand)

# Example: candidate (1, 1) against a customer whose transactions are
# [[1], [1]] - the first code is looked up in the first transaction; once it
# is found, the second code is looked up in the transactions that follow.
frequent_formatted = []
for cand in allCand:
    occurances = sum(
        1
        for transactions in groupedData2.values()
        if _contains_sequence(transactions, cand)
    )
    # Support is relative to all customers, including those whose
    # transactions were removed entirely by the filtering step.
    support = occurances / DSPower
    if support >= min_sup:
        frequent_formatted.append((cand, support))

for i in frequent_formatted:
    print(i)