Expand|Select|Wrap|Line Numbers
- import pylab as plt
- import numpy as np
- from scipy.spatial.distance import cdist, pdist
- from sklearn.cluster import KMeans
- from sklearn.datasets import load_iris
# Elbow-method demo: fit KMeans on the iris data for k = 1..10 and plot the
# mean distance of every sample to its nearest centroid against k.
iris = load_iris()
n_samples = iris.data.shape[0]
k = range(1, 11)

# One fitted model per candidate cluster count.
clusters = [KMeans(n_clusters=c, init='k-means++').fit(iris.data) for c in k]
centr_lst = [cc.cluster_centers_ for cc in clusters]

# k_distance[m] holds the Euclidean distance from every sample (rows) to every
# centroid (columns) of the m-th model.
k_distance = [cdist(iris.data, cent, 'euclidean') for cent in centr_lst]
clust_indx = [np.argmin(kd, axis=1) for kd in k_distance]
distances = [np.min(kd, axis=1) for kd in k_distance]

# Mean distance to the nearest centroid, and within-cluster sum of squares.
avg_within = [np.sum(dist) / n_samples for dist in distances]
with_in_sum_square = [np.sum(dist ** 2) for dist in distances]

# Total sum of squares, computed from pairwise distances:
# sum_{i<j} d_ij^2 / n equals the sum of squared distances to the global mean.
to_sum_square = np.sum(pdist(iris.data) ** 2) / n_samples
# Convert the list explicitly so the subtraction is element-wise by design
# rather than by implicit NumPy scalar coercion.
bet_sum_square = to_sum_square - np.asarray(with_in_sum_square)

kidx = 2  # position in k of the cluster count to highlight (k[2] == 3)
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(k, avg_within, 'g*-')
ax.plot(k[kidx], avg_within[kidx], marker='o', markersize=12,
        markeredgewidth=2, markeredgecolor='r', markerfacecolor='None')
plt.grid(True)
plt.xlabel('Number of clusters')
# avg_within is a mean distance, not a sum of squares, so label it as such.
plt.ylabel('Average distance to nearest centroid')
plt.title('Elbow for KMeans clustering (IRIS Data)')
# Without show() nothing is displayed when this runs as a plain script.
plt.show()
This is my k-medoids code:
Expand|Select|Wrap|Line Numbers
- import numpy as np
- import matplotlib.pyplot as plt
- import matplotlib.cm as cm
- from copy import deepcopy
- from IPython import embed
- import time
def _get_init_centers(n_clusters, n_samples):
    '''Return distinct random sample indices to use as initial centers.

    Parameters
    --------
    n_clusters: number of indices to draw
    n_samples : indices are drawn from range(n_samples)

    Returns
    --------
    list of n_clusters distinct Python ints (empty if n_clusters <= 0)

    Raises
    --------
    ValueError: if n_clusters > n_samples, since that many distinct indices
                do not exist (a rejection loop would never terminate).
    '''
    if n_clusters <= 0:
        return []
    if n_clusters > n_samples:
        raise ValueError(
            'n_clusters (%d) cannot exceed n_samples (%d)'
            % (n_clusters, n_samples))
    # Sampling without replacement guarantees uniqueness in a single call.
    return np.random.choice(n_samples, size=n_clusters, replace=False).tolist()
def _get_distance(data1, data2):
    '''Example distance function: Euclidean distance between two points.

    Parameters
    --------
    data1, data2: array-likes of the same (or broadcastable) shape

    Returns
    --------
    the square root of the summed squared differences, as a NumPy float
    '''
    # Cast to float first: this accepts plain lists/tuples and prevents
    # small integer dtypes (e.g. uint8) from overflowing when squared.
    diff = np.asarray(data1, dtype=float) - np.asarray(data2, dtype=float)
    return np.sqrt(np.sum(diff ** 2))
def _get_cost(X, centers_id, dist_func):
    '''Assign every sample to its nearest center and total up the distances.

    Parameters
    --------
    X         : 2-D numpy array, one sample per row
    centers_id: sequence of row indices of X acting as cluster centers
    dist_func : callable(sample, center) -> distance

    Returns
    --------
    members : float array, members[i] is the position in centers_id of the
              center nearest to sample i
    costs   : array, costs[j] is the summed distance of cluster j's members
              to its center
    tot_cost: sum of costs
    dist_mat: (len(X), len(centers_id)) array of sample-to-center distances
    '''
    n_samples = len(X)
    n_centers = len(centers_id)

    # compute distance matrix
    dist_mat = np.zeros((n_samples, n_centers))
    for j, center_id in enumerate(centers_id):
        center = X[center_id, :]
        for i in range(n_samples):
            # A center is at distance zero from itself; leave the
            # pre-filled 0 in place instead of calling dist_func.
            if i != center_id:
                dist_mat[i, j] = dist_func(X[i, :], center)

    nearest = np.argmin(dist_mat, axis=1)
    costs = np.array(
        [dist_mat[nearest == j, j].sum() for j in range(n_centers)])
    # Kept as floats for backward compatibility with existing callers.
    members = nearest.astype(float)
    return members, costs, np.sum(costs), dist_mat
def _kmedoids_run(X, n_clusters, dist_func, max_iter=3, tol=0.000001, verbose=True):
    '''Run the swap-based k-medoids search.

    Starting from random centers, each pass tries replacing every center
    with every non-center sample and keeps a swap only if it lowers the
    total cost by more than tol.

    Parameters
    --------
    X         : 2-D numpy array, size = (n_samples, n_features)
    n_clusters: number of clusters
    dist_func : callable(sample, center) -> distance
    max_iter  : maximum number of full passes over the samples
    tol       : minimum decrease in total cost for a swap to be accepted
    verbose   : print progress messages when True

    Returns
    --------
    centers  : list of row indices of X chosen as centers
    members  : per-sample cluster assignment, as returned by _get_cost
    costs    : per-cluster cost, as returned by _get_cost
    tot_cost : total cost of the final configuration
    dist_mat : sample-to-center distance matrix of the final configuration
    '''
    # Get initial centers (the unpacking also rejects non-2-D input)
    n_samples, _n_features = X.shape
    init_ids = _get_init_centers(n_clusters, n_samples)
    if verbose:
        print('Initial centers are %s' % (init_ids,))
    centers = init_ids
    members, costs, tot_cost, dist_mat = _get_cost(X, init_ids, dist_func)

    for _ in range(max_iter):
        swapped = False
        for i in range(n_samples):
            if i in centers:
                continue
            for j in range(len(centers)):
                # centers is a flat list of ints, so a shallow copy suffices.
                candidate = list(centers)
                candidate[j] = i
                members_, costs_, tot_cost_, dist_mat_ = _get_cost(
                    X, candidate, dist_func)
                # Accept only a genuine improvement; accepting ties or
                # tiny increases would report swaps forever.
                if tot_cost - tot_cost_ > tol:
                    members, costs, tot_cost, dist_mat = (
                        members_, costs_, tot_cost_, dist_mat_)
                    centers = candidate
                    swapped = True
                    if verbose:
                        print('Change centers to %s' % (centers,))
                    # Sample i is now a center; trying it in the other
                    # slots would only create duplicate centers.
                    break
        if not swapped:
            if verbose:
                print('End Searching by no swaps')
            break
    else:
        # The loop ran all max_iter passes without converging.
        if verbose:
            print('End Searching by reaching maximum iteration %s' % max_iter)
    return centers, members, costs, tot_cost, dist_mat
class KMedoids(object):
    '''
    Main API of KMedoids Clustering

    Parameters
    --------
    n_clusters: number of clusters
    dist_func : distance function, callable(sample, center) -> distance
    max_iter: maximum number of iterations
    tol: tolerance

    Attributes (available after fit)
    --------
    labels_   : cluster label for each data item (int array)
    centers_  : cluster centers id (row indices into the fitted X)
    medoids_  : rows of the fitted X selected as cluster centers
    costs_    : array of costs for each cluster
    tot_cost_ : total cost over all clusters
    dist_mat_ : distance from each data item to each center

    n_iter_ is not recorded: _kmedoids_run does not report an
    iteration count.

    Methods
    -------
    fit(X): fit the model
        - X: 2-D numpy array, size = (n_sample, n_features)
    predict(X): predict cluster id given a test dataset.
    '''

    def __init__(self, n_clusters, dist_func=_get_distance, max_iter=3, tol=0.000001):
        self.n_clusters = n_clusters
        self.dist_func = dist_func
        self.max_iter = max_iter
        self.tol = tol

    def fit(self, X, plotit=True, verbose=True):
        '''Cluster X and store the result on the instance.

        X      : 2-D numpy array, size = (n_sample, n_features)
        plotit : scatter-plot the first two columns of X coloured by
                 cluster when True (needs at least two features)
        verbose: passed through to the search routine

        Returns self so that calls can be chained.
        '''
        centers, members, costs, tot_cost, dist_mat = _kmedoids_run(
            X, self.n_clusters, self.dist_func,
            max_iter=self.max_iter, tol=self.tol, verbose=verbose)
        # Keep the results; previously they were discarded on return.
        self.centers_ = centers
        self.labels_ = members.astype(int)
        self.medoids_ = X[centers]
        self.costs_ = costs
        self.tot_cost_ = tot_cost
        self.dist_mat_ = dist_mat
        if plotit:
            self._plot_clusters(X, centers, members)
        return self

    def _plot_clusters(self, X, centers, members):
        '''Scatter the first two columns of X, one colour per collection.'''
        fig = plt.figure()
        ax = fig.add_subplot(111)
        for i in range(len(centers)):
            X_c = X[members == i, :]
            ax.scatter(X_c[:, 0], X_c[:, 1], label=i + 1, alpha=0.5, s=30)
            ax.scatter(X[centers[i], 0], X[centers[i], 1],
                       alpha=1., s=250, marker='*')
        # Two collections are added per cluster (members, then its center),
        # so colours are spread over all of them in insertion order.
        colormap = plt.cm.gist_ncar  # nipy_spectral, Set1,Paired
        colorst = [colormap(i)
                   for i in np.linspace(0, 0.9, len(ax.collections))]
        for t, j1 in enumerate(ax.collections):
            j1.set_color(colorst[t])

    def predict(self, X):
        '''Return, for each row of X, the index of the nearest fitted medoid.

        X: 2-D array-like, size = (n_sample, n_features)

        Raises RuntimeError if fit has not been called yet.
        '''
        if not hasattr(self, 'medoids_'):
            raise RuntimeError('fit must be called before predict')
        X = np.asarray(X)
        labels = np.empty(len(X), dtype=int)
        for i, row in enumerate(X):
            labels[i] = np.argmin(
                [self.dist_func(row, medoid) for medoid in self.medoids_])
        return labels
Thanks.