By using this site, you agree to our updated Privacy Policy and our Terms of Use. Manage your Cookies Settings.
434,825 Members | 2,437 Online
Bytes IT Community
+ Ask a Question
Need help? Post your question and get tips & solutions from a community of 434,825 IT Pros & Developers. It's quick & easy.

How to use clustering evaluation elbow method in K-Medoids

P: 2
I am using this code as my reference:


Expand|Select|Wrap|Line Numbers
  1. import pylab as plt
  2. import numpy as np
  3. from scipy.spatial.distance import cdist, pdist
  4. from sklearn.cluster import KMeans
  5. from sklearn.datasets import load_iris
  6.  
  7. iris = load_iris()
  8.  
  9. k = range(1,11)
  10.  
  11. clusters = [KMeans(n_clusters = c,init = 'k-means++').fit(iris.data) for c in k]
  12. centr_lst = [cc.cluster_centers_ for cc in clusters]
  13.  
  14. k_distance = [cdist(iris.data, cent, 'euclidean') for cent in centr_lst]
  15. clust_indx = [np.argmin(kd,axis=1) for kd in k_distance]
  16. distances = [np.min(kd,axis=1) for kd in k_distance]
  17. avg_within = [np.sum(dist)/iris.data.shape[0] for dist in distances]
  18.  
  19. with_in_sum_square = [np.sum(dist ** 2) for dist in distances]
  20. to_sum_square = np.sum(pdist(iris.data) ** 2)/iris.data.shape[0]
  21. bet_sum_square = to_sum_square - with_in_sum_square
  22.  
  23. kidx = 2
  24.  
  25. fig = plt.figure()
  26. ax = fig.add_subplot(111)
  27. ax.plot(k, avg_within, 'g*-')
  28. ax.plot(k[kidx], avg_within[kidx], marker='o', markersize=12, \
  29. markeredgewidth=2, markeredgecolor='r', markerfacecolor='None')
  30. plt.grid(True)
  31. plt.xlabel('Number of clusters')
  32. plt.ylabel('Average within-cluster sum of squares')
  33. plt.title('Elbow for KMeans clustering (IRIS Data)')
  34.  
I want to replace K-Means with K-Medoids in the elbow-method code above.
This is my K-Medoids code:
Expand|Select|Wrap|Line Numbers
  1. import numpy as np
  2. import matplotlib.pyplot as plt
  3. import matplotlib.cm as cm
  4. from copy import deepcopy
  5. from IPython import embed
  6. import time
  7.  
  8. def _get_init_centers(n_clusters, n_samples):
  9.     '''return random points as initial centers'''
  10.     init_ids = []
  11.     while len(init_ids) < n_clusters:
  12.         _ = np.random.randint(0,n_samples)
  13.         if not _ in init_ids:
  14.             init_ids.append(_)
  15.     return init_ids
  16.  
  17. def _get_distance(data1, data2):
  18.     '''example distance function'''
  19.     return np.sqrt(np.sum((data1 - data2)**2))
  20.  
  21. def _get_cost(X, centers_id, dist_func):
  22.     '''return total cost and cost of each cluster'''
  23.     st = time.time()
  24.     dist_mat = np.zeros((len(X),len(centers_id)))
  25.     # compute distance matrix
  26.     for j in range(len(centers_id)):
  27.         center = X[centers_id[j],:]
  28.         for i in range(len(X)):
  29.             if i == centers_id[j]:
  30.                 dist_mat[i,j] = 0.
  31.             else:
  32.                 dist_mat[i,j] = dist_func(X[i,:], center)
  33.     #print 'cost ', -st+time.time()
  34.     mask = np.argmin(dist_mat,axis=1)
  35.     members = np.zeros(len(X))
  36.     costs = np.zeros(len(centers_id))
  37.     for i in range(len(centers_id)):
  38.         mem_id = np.where(mask==i)
  39.         members[mem_id] = i
  40.         costs[i] = np.sum(dist_mat[mem_id,i])
  41.     return members, costs, np.sum(costs), dist_mat
  42.  
  43. def _kmedoids_run(X, n_clusters, dist_func, max_iter=3, tol=0.000001, verbose=True):
  44.     '''run algorithm return centers, members, and etc.'''
  45.     # Get initial centers
  46.     n_samples, n_features = X.shape
  47.     init_ids = _get_init_centers(n_clusters,n_samples)
  48.     if verbose:
  49.         print 'Initial centers are ', init_ids
  50.     centers = init_ids
  51.     members, costs, tot_cost, dist_mat = _get_cost(X, init_ids,dist_func)
  52.     cc,SWAPED = 0, True
  53.     while True:
  54.         SWAPED = False
  55.         for i in range(n_samples):
  56.             if not i in centers:
  57.                 for j in range(len(centers)):
  58.                     centers_ = deepcopy(centers)
  59.                     centers_[j] = i
  60.                     members_, costs_, tot_cost_, dist_mat_ = _get_cost(X, centers_,dist_func)
  61.                     if tot_cost_-tot_cost < tol:
  62.                         members, costs, tot_cost, dist_mat = members_, costs_, tot_cost_, dist_mat_
  63.                         centers = centers_
  64.                         SWAPED = True
  65.                         if verbose:
  66.                             print 'Change centers to ', centers
  67.         if cc > max_iter:
  68.             if verbose:
  69.                 print 'End Searching by reaching maximum iteration', max_iter
  70.             break
  71.         if not SWAPED:
  72.             if verbose:
  73.                 print 'End Searching by no swaps'
  74.             break
  75.         cc += 1
  76.     return centers,members, costs, tot_cost, dist_mat
  77.  
  78. class KMedoids(object):
  79.     '''
  80.     Main API of KMedoids Clustering
  81.  
  82.     Parameters
  83.     --------
  84.         n_clusters: number of clusters
  85.         dist_func : distance function
  86.         max_iter: maximum number of iterations
  87.         tol: tolerance
  88.  
  89.     Attributes
  90.     --------
  91.         labels_    :  cluster labels for each data item
  92.         centers_   :  cluster centers id
  93.         costs_     :  array of costs for each cluster
  94.         n_iter_    :  number of iterations for the best trail
  95.  
  96.     Methods
  97.     -------
  98.         fit(X): fit the model
  99.             - X: 2-D numpy array, size = (n_sample, n_features)
  100.  
  101.         predict(X): predict cluster id given a test dataset.
  102.     '''
  103.     def __init__(self, n_clusters, dist_func=_get_distance, max_iter=3, tol=0.000001):
  104.         self.n_clusters = n_clusters
  105.         self.dist_func = dist_func
  106.         self.max_iter = max_iter
  107.         self.tol = tol
  108.  
  109.     def fit(self, X, plotit=True, verbose=True):
  110.         centers, members, costs, tot_cost, dist_mat = _kmedoids_run(
  111.             X, self.n_clusters, self.dist_func, max_iter=self.max_iter, tol=self.tol, verbose=verbose)
  112.         if plotit:
  113.             fig = plt.figure()
  114.             ax = fig.add_subplot(111)
  115.  
  116.  
  117.             for i in range(len(centers)):
  118.                 X_c = X[members == i, :]
  119.                 ax.scatter(X_c[:, 0], X_c[:, 1], label = i+1,alpha=0.5, s=30)
  120.                 ax.scatter(X[centers[i], 0], X[centers[i], 1],alpha=1., s=250, marker='*')
  121.             #ax.legend(bbox_to_anchor=(1, 1), fontsize="small", loc=2, borderaxespad=0.)
  122.             colormap = plt.cm.gist_ncar  # nipy_spectral, Set1,Paired
  123.             colorst = [colormap(i) for i in np.linspace(0, 0.9, len(ax.collections))]
  124.             for t, j1 in enumerate(ax.collections):
  125.                 j1.set_color(colorst[t])
  126.  
  127.         return
  128.  
  129.  
  130.     def predict(self,X):
  131.         raise NotImplementedError()
  132.  
  133.  
Could you help me figure out how to do it?
Thanks.
Oct 24 '16 #1
Share this question for a faster answer!
Share on Google+

Post your reply

Sign in to post your reply or Sign up for a free account.