**UNSUPERVISED LEARNING:**
# K-Means works on numerical features only; categorical columns must be
# encoded (or dropped) before fitting.
two_cluster_model = KMeans(n_clusters=2)
two_cluster_model.fit(data)
labels_ = two_cluster_model.labels_
print(labels_)
# Load the wine dataset bundled with scikit-learn via datasets.load_wine().
data = datasets.load_wine()
X = pd.DataFrame(data['data'], columns=data['feature_names'])
X.head()

# Standardize each feature (zero mean, unit variance) before clustering.
X_prep = StandardScaler().fit_transform(X)
pd.DataFrame(X_prep).head()

# Fit K-Means with 8 clusters; fit_predict fits the model and returns the
# cluster label assigned to each sample in one call.
kmeans = KMeans(n_clusters=8, random_state=1234)
clusters = kmeans.fit_predict(X_prep)

# How many samples ended up in each cluster.
pd.Series(clusters).value_counts().sort_index()
0 13
1 23
2 34
3 22
4 25
5 19
6 3
7 39
dtype: int64
# Attach the cluster labels to a COPY of the feature frame.
# NOTE: the original `X_df = X` only aliased X, so adding the 'cluster'
# column silently mutated X as well — .copy() keeps X intact.
X_df = X.copy()
X_df['cluster'] = clusters
X_df.head()
# Inertia: the sum of squared distances between each sample and the centroid
# of the cluster it was assigned to (lower is better).
# n_init: 'auto' or int, default=10 —
#   number of times the k-means algorithm is run with different centroid seeds;
#   the final result is the best of those n_init consecutive runs in terms of
#   inertia. Several runs are recommended for sparse high-dimensional problems.
#   When n_init='auto': 10 runs if init='random', 1 run if init='k-means++'.
# Conclusion: the lower the model's inertia, the tighter the clusters.
kmeans.inertia_
942.5743528467817
# Elbow method: fit K-Means for k = 2..19 and record the inertia at each k.
# random_state is pinned (matching the fits above/below) so the curve is
# reproducible across runs.
K = range(2, 20)
inertia = []
for k in K:
    kmeans = KMeans(n_clusters=k, random_state=1234)
    kmeans.fit(X_prep)
    inertia.append(kmeans.inertia_)
inertia
[1659.0079672511501,
1277.9284888446423,
1175.3789993448738,
1104.0497155379066,
1051.1710663430124,
1000.0381460222904,
951.3403301993533,
898.2639263531403,
850.7066515662602,
838.2888191259143,
788.6861482918955,
769.02797058063,
750.9284997652168,
725.1390923689574,
698.7891160178722,
682.8488817244868,
668.9772688028743,
645.5247037724799]
# Elbow plot: inertia always decreases as k grows, so we look for the "elbow"
# where the improvement flattens. Here k ≈ 10 is chosen because it keeps a
# richer variety of wine groups.
plt.figure(figsize=(15, 5))
plt.plot(K, inertia)
plt.xlabel('Number of clusters (k)')
plt.ylabel('Inertia')
plt.title('Elbow method for choosing k')
plt.show()
# . K-Means: Inertia
# . Inertia measures how well a dataset was clustered by K-Means.
# . It is calculated by measuring the distance between each data point and its
#   assigned centroid, squaring this distance, and summing these squares
#   across all clusters.
# . A good model is one with low inertia AND a low number of clusters (K).
# 2 - Silhouette Score: measures how similar a data point is to its own
# cluster (cohesion) compared to the other clusters (separation).
# Silhouette analysis: for each k, fit K-Means and record the mean silhouette
# score over all samples (higher is better; range [-1, 1]).
K = range(2, 20)
silhouette = []
for k in K:
    kmeans = KMeans(n_clusters=k, random_state=1234)
    kmeans.fit(X_prep)
    # labels_ already holds the training assignments — no need to run
    # predict() over the same data a second time.
    silhouette.append(silhouette_score(X_prep, kmeans.labels_))

plt.figure(figsize=(15, 5))
plt.plot(K, silhouette)
plt.xlabel('Number of clusters (k)')
plt.ylabel('Mean silhouette score')
plt.title('Silhouette score vs. number of clusters')
plt.show()