import numpy as np
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt
# Your Kmeans class goes here
class Kmeans:
    """K-means clustering via Lloyd's algorithm.

    Parameters
    ----------
    n_clusters : int
        Number of clusters (k).
    max_iter : int
        Maximum number of assign/update iterations.
    random_state : int
        Seed for the random centroid initialization.

    Attributes set by ``fit``
    -------------------------
    centroids : ndarray of shape (n_clusters, n_features)
    labels : ndarray of shape (n_samples,)
    error : float -- sum of squared distances to each point's centroid (SSE)
    """

    def __init__(self, n_clusters, max_iter=100, random_state=123):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.random_state = random_state

    def initializ_centroids(self, X):
        """Pick ``n_clusters`` distinct rows of X at random as initial centroids.

        (Method name kept as-is — note the missing 'e' — for backward
        compatibility with existing callers.)
        """
        # BUG FIX: the original built a RandomState and threw it away, then
        # called np.random.permutation on the unseeded GLOBAL generator, so
        # `random_state` had no effect.  Use the seeded generator explicitly.
        rng = np.random.RandomState(self.random_state)
        random_idx = rng.permutation(X.shape[0])
        return X[random_idx[:self.n_clusters]]

    def compute_centroids(self, X, labels):
        """Return the mean of the points assigned to each cluster."""
        centroids = np.zeros((self.n_clusters, X.shape[1]))
        for k in range(self.n_clusters):
            members = X[labels == k]
            if len(members):
                centroids[k, :] = members.mean(axis=0)
            elif hasattr(self, 'centroids'):
                # ROBUSTNESS: np.mean of an empty slice yields NaN (plus a
                # RuntimeWarning) and poisons every later iteration.  Keep
                # the previous centroid for an empty cluster instead.
                centroids[k, :] = self.centroids[k]
        return centroids

    def compute_distance(self, X, centroids):
        """Return squared Euclidean distances, shape (n_samples, n_clusters)."""
        distance = np.zeros((X.shape[0], self.n_clusters))
        for k in range(self.n_clusters):
            # BUG FIX: `norm` was never imported anywhere in this file
            # (NameError at runtime); use np.linalg.norm explicitly.
            row_norm = np.linalg.norm(X - centroids[k, :], axis=1)
            distance[:, k] = np.square(row_norm)
        return distance

    def find_closest_cluster(self, distance):
        """Return the index of the nearest centroid for each sample."""
        return np.argmin(distance, axis=1)

    def compute_sse(self, X, labels, centroids):
        """Sum of squared distances from each point to its assigned centroid."""
        distance = np.zeros(X.shape[0])
        for k in range(self.n_clusters):
            # BUG FIX: same missing-import bug as compute_distance.
            distance[labels == k] = np.linalg.norm(X[labels == k] - centroids[k], axis=1)
        return np.sum(np.square(distance))

    def fit(self, X):
        """Run Lloyd's algorithm; stop early once the centroids stop moving."""
        self.centroids = self.initializ_centroids(X)
        for _ in range(self.max_iter):
            old_centroids = self.centroids
            distance = self.compute_distance(X, old_centroids)
            self.labels = self.find_closest_cluster(distance)
            self.centroids = self.compute_centroids(X, self.labels)
            if np.all(old_centroids == self.centroids):
                break
        self.error = self.compute_sse(X, self.labels, self.centroids)

    def predict(self, X):
        """Assign each row of X to its nearest fitted centroid."""
        distance = self.compute_distance(X, self.centroids)
        return self.find_closest_cluster(distance)
# Fit K-means to the iris measurements; the bundled target labels are
# deliberately ignored (unsupervised setting).
iris = load_iris()
X = iris.data  # 150 samples, 4 features each

# Build and run the clustering model.
kmeans = Kmeans(n_clusters=3, max_iter=100, random_state=42)
kmeans.fit(X)

print("Cluster centers:\n", kmeans.centroids)
print("Cluster labels:", kmeans.labels)
print("SSE:", kmeans.error)

# Optional 2-D view projected onto the first two feature columns.
centers_x = kmeans.centroids[:, 0]
centers_y = kmeans.centroids[:, 1]
plt.scatter(X[:, 0], X[:, 1], c=kmeans.labels, cmap='viridis', marker='o', alpha=0.6)
plt.scatter(centers_x, centers_y, s=200, c='red', marker='X')
plt.xlabel("Feature 1")
plt.ylabel("Feature 2")
plt.title("K-means clustering on Iris dataset (unsupervised)")
plt.show()