# Notebook-scoped libraries to install through the Spark context before they
# are imported. Each entry is the positional-argument tuple that is passed to
# sc.install_pypi_package, in installation order.
_PYPI_PACKAGES = [
    ("pandas==0.25.1",),  # pinned pandas release
    ("matplotlib", "https://pypi.org/simple"),  # explicit PyPI repository URL
    ("seaborn",),
    ("scikit-learn",),
    ("distance",),
]
for _package_args in _PYPI_PACKAGES:
    sc.install_pypi_package(*_package_args)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import distance
# Load the pattern-step export from S3 and hand it to pandas.
# header=True makes Spark take the column names from the file's first line, so
# the header never appears as a data row and does not have to be promoted and
# dropped by hand afterwards. No schema is inferred, so every column is still
# read as a string.
df = spark.read.csv(
    "s3://work-connect-analyze-devel-iit/yamamoto/pattern_steps.01DP7Y3D235RHSXV1EVQHBERY0.202009.csv",
    header=True,
)
df = df.toPandas()
from sklearn.preprocessing import LabelEncoder
# Replace each pattern_id with the integer label assigned by LabelEncoder.
LE = LabelEncoder()
df['pattern_id'] = LE.fit_transform(df['pattern_id'])

# Map each encoded pattern_id to the list of its 'exe' values, ordered by the
# 'index' column within its (tenant_id, pattern_id) group.
# NOTE(review): if 'index' holds text rather than numbers, sort_values orders
# it lexicographically ("10" before "2") -- confirm the column dtype.
# NOTE(review): the dict key is the pattern_id alone; if the same pattern_id
# occurred under two tenant_ids, the later group would overwrite the earlier
# one -- confirm that cannot happen in this data.
data = {
    group_key[1]: steps.sort_values(['index'])['exe'].values.tolist()
    for group_key, steps in df.groupby(['tenant_id', 'pattern_id'])
}
def min_distance_by_sliding_window(a, b):
    """Return the smallest Hamming distance between the shorter sequence and
    any equally long contiguous window of the longer one.

    The shorter of ``a`` and ``b`` is slid across the longer one position by
    position; at each offset the Hamming distance between the shorter sequence
    and the aligned window is computed with ``distance.hamming``, and the
    minimum over all offsets is returned. When both sequences have the same
    length there is exactly one window, so the result is simply their Hamming
    distance.

    Args:
        a: First sequence (anything supporting ``len`` and slicing).
        b: Second sequence.

    Returns:
        The minimum Hamming distance found over all alignments.
    """
    shorter, longer = (a, b) if len(a) <= len(b) else (b, a)
    width = len(shorter)
    # A sequence of length n contains n - width + 1 windows of size `width`.
    # The "+ 1" matters: without it the final alignment (the shorter sequence
    # flush against the end of the longer one) is never evaluated.
    window_count = len(longer) - width + 1
    return min(
        distance.hamming(shorter, longer[start:start + width])
        for start in range(window_count)
    )
def stat_distance(a, b):
    """Return a signed combined distance between sequences ``a`` and ``b``.

    The magnitude is the mean of the squares of two distances: the
    sliding-window minimum Hamming distance and the Levenshtein distance.
    The sign is +1 when ``a >= b`` under Python's own comparison of the two
    arguments, and -1 otherwise.

    Args:
        a: First sequence.
        b: Second sequence.

    Returns:
        The signed mean of the two squared distances.
    """
    squared_distances = np.array([
        min_distance_by_sliding_window(a, b),
        distance.levenshtein(a, b),
    ]) ** 2
    sign = 1 if a >= b else -1
    return sign * np.mean(squared_distances)
# Pairwise signed distances between all sequences in `data`.
# Entry [i][j] is stat_distance(sequence j, sequence i), where i and j are
# positions in data's iteration order.
_sequences = list(data.values())
similarity_matrix = np.array([
    [stat_distance(column_seq, row_seq) for column_seq in _sequences]
    for row_seq in _sequences
])
print(similarity_matrix)
# Initial clustering: cut the (pattern_id, sequence) pairs of `data` into
# consecutive equally sized chunks; the first pair of each chunk is that
# chunk's center.
k = 25  # requested number of clusters

# Snapshot the pairs once instead of rebuilding list(data.items()) on every
# loop iteration, and derive all bounds from the real data size rather than
# from hard-coded counts.
_items = list(data.items())
total = len(_items)

# Chunk size is total // k, floored at 1 so that fewer than k items can never
# produce a zero stride (which would never advance).
_chunk = max(1, total // k)
_starts = range(0, total, _chunk)
# NOTE(review): when total is not a multiple of k this yields more than k
# chunks (e.g. 445 items with k = 25 -> chunk size 17 -> 27 chunks, the last
# one holding 3 items). That matches the previous behaviour; confirm whether
# exactly k clusters are wanted.

# One center per chunk: the chunk's first (pattern_id, sequence) pair.
centers = [_items[start] for start in _starts]

# Every (pattern_id, sequence) pair, in data's iteration order.
points = list(_items)

# One entry per chunk; each entry is a single-element list wrapping the
# chunk's pairs (this nesting is what `clusters` below unwraps with [0]).
pointsincluster = [[_items[start:start + _chunk]] for start in _starts]

# (center, members) for every chunk.
clusters = [
    (center, members[0])
    for center, members in zip(centers, pointsincluster)
]
# clusters_num[i] is the position (within the centers list) of the center
# nearest to the i-th point; filled in by assign_points().
clusters_num = []


def assign_points(center_items=None, point_items=None, matrix=None):
    """Assign every point to its nearest center and store it in clusters_num.

    For each point, the center with the smallest absolute matrix entry
    ``abs(matrix[center_key][point_key])`` is chosen; on ties the earliest
    center wins. ``clusters_num`` is overwritten in place on every call, so
    it always holds exactly one entry per point, and repeated calls do not
    accumulate stale assignments.

    Args:
        center_items: Sequence of ``(key, sequence)`` pairs used as centers.
            Defaults to the module-level ``centers``.
        point_items: Sequence of ``(key, sequence)`` pairs to assign.
            Defaults to the module-level ``points``.
        matrix: Two-level indexable of distances, indexed as
            ``matrix[center_key][point_key]``. Defaults to the module-level
            ``similarity_matrix``.

    Returns:
        None. The result is written to the module-level ``clusters_num``;
        a point gets -1 when there are no centers at all.
    """
    if center_items is None:
        center_items = centers
    if point_items is None:
        point_items = points
    if matrix is None:
        matrix = similarity_matrix

    # NOTE(review): the pair keys are used directly as row/column positions
    # of the matrix -- this assumes each key equals that sequence's position
    # in the matrix; confirm against how the matrix is built.
    center_keys = [key for key, _ in center_items]

    def nearest_center(point_key):
        # Position of the center with the smallest absolute distance.
        return min(
            range(len(center_keys)),
            key=lambda pos: abs(matrix[center_keys[pos]][point_key]),
            default=-1,
        )

    # Slice assignment mutates the existing list, so every reference to
    # clusters_num held elsewhere sees the fresh assignments.
    clusters_num[:] = [nearest_center(key) for key, _ in point_items]
{"html5":"htmlmixed","css":"css","javascript":"javascript","php":"php","python":"python","ruby":"ruby","lua":"text\/x-lua","bash":"text\/x-sh","go":"go","c":"text\/x-csrc","cpp":"text\/x-c++src","diff":"diff","latex":"stex","sql":"sql","xml":"xml","apl":"apl","asterisk":"asterisk","c_loadrunner":"text\/x-csrc","c_mac":"text\/x-csrc","coffeescript":"text\/x-coffeescript","csharp":"text\/x-csharp","d":"d","ecmascript":"javascript","erlang":"erlang","groovy":"text\/x-groovy","haskell":"text\/x-haskell","haxe":"text\/x-haxe","html4strict":"htmlmixed","java":"text\/x-java","java5":"text\/x-java","jquery":"javascript","mirc":"mirc","mysql":"sql","ocaml":"text\/x-ocaml","pascal":"text\/x-pascal","perl":"perl","perl6":"perl","plsql":"sql","properties":"text\/x-properties","q":"text\/x-q","scala":"scala","scheme":"text\/x-scheme","tcl":"text\/x-tcl","vb":"text\/x-vb","verilog":"text\/x-verilog","yaml":"text\/x-yaml","z80":"text\/x-z80"}