Facebook
From me, 2 Years ago, written in Python.
Embed
Download Paste or View Raw
Hits: 43
  1. sc.install_pypi_package("pandas==0.25.1") #Install pandas version 0.25.1
  2. sc.install_pypi_package("matplotlib", "https://pypi.org/simple") #Install matplotlib from given PyPI repository
  3. sc.install_pypi_package("seaborn")
  4. sc.install_pypi_package("scikit-learn")
  5. sc.install_pypi_package("distance")
  6.  
  7. import numpy as np
  8. import pandas as pd
  9. import matplotlib.pyplot as plt
  10. import seaborn as sns
  11. import distance
  12.  
  13. df = spark.read.csv("s3://work-connect-analyze-devel-iit/yamamoto/pattern_steps.01DP7Y3D235RHSXV1EVQHBERY0.202009.csv")
  14. df = df.toPandas()
  15. df = df.rename(columns=df.iloc[0])
  16. df = df[1:]
  17. #df
  18.  
  19.  
  20. from sklearn.preprocessing import LabelEncoder
  21. LE = LabelEncoder()
  22.  
  23. df['pattern_id'] = LE.fit_transform(df['pattern_id'])
  24.  
  25.  
  26. data = {}
  27. for g, d in df.groupby(['tenant_id', 'pattern_id']):
  28.     data[g[1]] = d.sort_values(['index'])['exe'].values.tolist()
  29. #data
  30.  
  31. def min_distance_by_sliding_window(a, b):
  32.     if len(a) == len(b):
  33.         return distance.hamming(a, b)
  34.     x = a if len(a) <= len(b) else b
  35.     y = b if len(a) <= len(b) else a
  36.     return min([distance.hamming(x, y[i: i + len(x)])
  37.                for i in range(len(y) - len(x))])
  38.  
  39. def stat_distance(a, b):
  40.     d_sliding_window = min_distance_by_sliding_window(a, b)
  41.     d_levenshtein = distance.levenshtein(a, b)
  42.    
  43.     factor = 0;
  44.     if a >= b :
  45.         factor = 1
  46.     else :
  47.         factor = -1
  48.    
  49.     return factor*np.mean(np.array([d_sliding_window, d_levenshtein])**2)
  50.  
  51. similarity_matrix = np.array([[stat_distance(x, y) for x in data.values()] for y in data.values()])
  52. print(similarity_matrix)  
  53.  
  54. total = 445
  55. k = 25
  56.  
  57. # clusters k
  58.  
  59. # centers         =  0           445/k + 1             2*445/k+1
  60. # pointsincluster = (0, 445/k)  (445/k + 1,  2*445/k)
  61.  
  62. centers = []
  63. idx = 0
  64. while True:
  65.     centers.append(list(data.items())[idx])
  66.     idx = idx + int(len(data)/k)
  67.     if idx > 444:
  68.         break
  69.  
  70. points = []
  71. idx = 0
  72. while True:
  73.     points.append(list(data.items())[idx])
  74.     idx = idx + 1
  75.     if idx > 444:
  76.         break
  77.        
  78.        
  79. pointsincluster = []
  80.  
  81. idx = 0
  82. while True:
  83.     temp = []
  84.     temp.append(list(data.items())[idx : min(idx + int(len(data)/k), len(data))])
  85.     pointsincluster.append(temp)    
  86.     idx = idx + int(len(data)/k)
  87.     if idx > 444:
  88.         break
  89.        
  90.        
  91. clusters = []
  92.  
  93. idx = 0
  94. while True:
  95.     clusters.append((centers[idx], pointsincluster[idx][0]))
  96.     idx = idx + 1
  97.     if idx == 27:
  98.         break
  99.        
  100.        
  101. clusters_num = []
  102.  
  103. def assign_points():
  104.    
  105.     point_idx = 0
  106.     while True:
  107.         center_idx = 0
  108.         min_val = 1e9
  109.         min_idx = -1
  110.         while True:            
  111.             if abs(similarity_matrix[center[center_idx][0]][points[point_idx][0]]) < min_val:
  112.                 min_val = abs(similarity_matrix[center[center_idx][0]][points[point_idx][0]])
  113.                 min_idx = center_idx
  114.             center_idx = center_idx + 1
  115.             if center_idx > 26 :
  116.                 break
  117.                
  118.         clusters_num.append(min_idx)    
  119.        
  120.         point_idx = point_idx + 1
  121.         if point_idx > 444 :
  122.             break;
  123.