Simple Unsupervised Learning Example¶

Use the labelled Iris flower dataset to demonstrate unsupervised clustering with a k-means model.

In [ ]:
import pandas as pd

Load the dataset, adding column names to the loaded DataFrame

In [ ]:
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"

col_names = ['sepal-length', 'sepal-width', 'petal-length',
             'petal-width', 'class']

iris_df = pd.read_csv(url, names=col_names)

iris_df.head()
Out[ ]:
   sepal-length  sepal-width  petal-length  petal-width        class
0           5.1          3.5           1.4          0.2  Iris-setosa
1           4.9          3.0           1.4          0.2  Iris-setosa
2           4.7          3.2           1.3          0.2  Iris-setosa
3           4.6          3.1           1.5          0.2  Iris-setosa
4           5.0          3.6           1.4          0.2  Iris-setosa

A quick look at how many classes of Iris there are

In [ ]:
# how many unique iris classes are there?
iris_df['class'].unique()
Out[ ]:
array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

Separate the dataframe into:

  • X: independent variable matrix - this is the first 4 columns
  • y: dependent variable vector - this is just the 'class' column
In [ ]:
# X is all columns except class
X = iris_df.iloc[:,:4]

# y is the class column
y = iris_df[ ['class'] ]

display(X.head())
display(y)
   sepal-length  sepal-width  petal-length  petal-width
0           5.1          3.5           1.4          0.2
1           4.9          3.0           1.4          0.2
2           4.7          3.2           1.3          0.2
3           4.6          3.1           1.5          0.2
4           5.0          3.6           1.4          0.2

              class
0       Iris-setosa
1       Iris-setosa
2       Iris-setosa
3       Iris-setosa
4       Iris-setosa
..              ...
145  Iris-virginica
146  Iris-virginica
147  Iris-virginica
148  Iris-virginica
149  Iris-virginica

150 rows × 1 columns

KMeans clustering model¶

We create and train a clustering model object as with supervised models.

In this case the model does not use the y data at all during training.

Generally we need to give a clustering model the number of clusters to identify.

In [ ]:
# use KMeans from sklearn.cluster
from sklearn.cluster import KMeans

k_means_model = KMeans(n_clusters=3)

k_means_model.fit(X)
Out[ ]:
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=3, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)
In [ ]:
# display the labels the model determined
k_means_model.labels_
Out[ ]:
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2,
       2, 2, 2, 1, 1, 2, 2, 2, 2, 1, 2, 1, 2, 1, 2, 2, 1, 1, 2, 2, 2, 2,
       2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 1])
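
As with a supervised model, the fitted estimator can assign new, unseen observations to one of the learned clusters with predict. A minimal sketch, using a made-up flower measurement (the values below are hypothetical, not taken from the dataset):

In [ ]:
# assign a cluster to an unseen flower (hypothetical measurements)
import numpy as np

new_flower = np.array([[5.0, 3.4, 1.5, 0.2]])  # sepal-length, sepal-width, petal-length, petal-width
k_means_model.predict(new_flower)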

Verify the model against the original labels¶

Note that the model may have assigned a different numeric code (0, 1 or 2) to each class than the one we use below. For simplicity you can change the class codes below to match what the model has assigned (a sketch that automates this alignment follows the confusion matrix).

In [ ]:
# let's create a new "verification_data" DataFrame, with the same class encoding as the model appears to use
verification_data = iris_df[ ['class'] ].copy()

verification_data['label'] = 0   # This is the code for Iris-setosa
verification_data.loc[ verification_data['class'] == 'Iris-versicolor', 'label'] = 1
verification_data.loc[ verification_data['class'] == 'Iris-virginica', 'label'] = 2

verification_data
Out[ ]:
              class  label
0       Iris-setosa      0
1       Iris-setosa      0
2       Iris-setosa      0
3       Iris-setosa      0
4       Iris-setosa      0
..              ...    ...
145  Iris-virginica      2
146  Iris-virginica      2
147  Iris-virginica      2
148  Iris-virginica      2
149  Iris-virginica      2

150 rows × 2 columns

In [ ]:
# create a confusion matrix with sklearn.metrics
from sklearn.metrics import confusion_matrix

print(confusion_matrix(k_means_model.labels_, verification_data['label']))
[[50  0  0]
 [ 0 48 14]
 [ 0  2 36]]
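
Rather than matching the cluster codes to the classes by eye, the alignment can be automated. A minimal sketch using the Hungarian algorithm (assumes scipy is installed); linear_sum_assignment picks the cluster-to-class mapping that maximises agreement on the diagonal of the confusion matrix:

In [ ]:
# automatically map each cluster label to its best-matching class code (sketch; assumes scipy)
import numpy as np
from scipy.optimize import linear_sum_assignment

cm = confusion_matrix(k_means_model.labels_, verification_data['label'])
rows, cols = linear_sum_assignment(-cm)   # negate: the solver minimises cost
mapping = dict(zip(rows, cols))           # cluster label -> class code

aligned = np.array([mapping[label] for label in k_means_model.labels_])

# proportion of points whose aligned cluster matches the true class
(aligned == verification_data['label'].values).mean()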

How to determine the number of clusters?¶

So, how did we determine that 3 clusters was the correct number?

A common approach is the elbow method: we run the algorithm for a range of cluster counts and plot the results.

For each number of clusters we plot the "inertia", which is the sum of squared distances from each point to its assigned cluster centroid. Effectively this tells us how "coherent" the clusters are.
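
As a quick sanity check on that definition, the model's inertia_ attribute can be recomputed by hand from the fitted centroids. A minimal sketch:

In [ ]:
# recompute the inertia by hand: sum of squared distances from each point
# to its assigned centroid (sketch)
import numpy as np

assigned_centroids = k_means_model.cluster_centers_[k_means_model.labels_]
manual_wcss = ((X.values - assigned_centroids) ** 2).sum()

manual_wcss, k_means_model.inertia_   # the two values should agree (up to float rounding)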

The "elbow" will tell us the optimum number of clusters for the dataset

In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt

# Finding the optimum number of clusters for k-means clustering
wcss = []

# loop over a range of cluster counts (1 to 10), create and train a model for each
# append the model's "inertia_" attribute to the wcss list
for i in range(1, 11):
    kmeans = KMeans(n_clusters = i, init = 'k-means++', max_iter = 300, n_init = 10, random_state = 0)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)
    
# Plot the results as a line graph, allowing us to observe 'the elbow'
# Here we plot the range of cluster counts (1 to 10) against the inertia values
plt.plot(range(1, 11), wcss)
plt.title('The elbow method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS') #within cluster sum of squares
plt.show()

Reference - a more complete example: https://www.kaggle.com/tonzowonzo/simple-k-means-clustering-on-the-iris-dataset