import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
import math

!pip install yellowbrick
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans, DBSCAN
from yellowbrick.cluster import SilhouetteVisualizer, KElbowVisualizer

Requirement already satisfied: yellowbrick in c:\users\admin\appdata\local\programs\python\python311\lib\site-packages (1.5)
Requirement already satisfied: matplotlib!=3.0.0,>=2.0.2 in c:\users\admin\appdata\local\programs\python\python311\lib\site-packages (from yellowbrick) (3.8.1)
Requirement already satisfied: scipy>=1.0.0 in c:\users\admin\appdata\local\programs\python\python311\lib\site-packages (from yellowbrick) (1.11.3)
Requirement already satisfied: scikit-learn>=1.0.0 in c:\users\admin\appdata\local\programs\python\python311\lib\site-packages (from yellowbrick) (1.3.2)
Requirement already satisfied: numpy>=1.16.0 in c:\users\admin\appdata\local\programs\python\python311\lib\site-packages (from yellowbrick) (1.26.4)
Requirement already satisfied: cycler>=0.10.0 in c:\users\admin\appdata\local\programs\python\python311\lib\site-packages (from yellowbrick) (0.12.1)
Requirement already satisfied: contourpy>=1.0.1 in c:\users\admin\appdata\local\programs\python\python311\lib\site-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (1.2.0)
Requirement already satisfied: fonttools>=4.22.0 in c:\users\admin\appdata\local\programs\python\python311\lib\site-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (4.44.0)
Requirement already satisfied: kiwisolver>=1.3.1 in c:\users\admin\appdata\local\programs\python\python311\lib\site-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (1.4.5)
Requirement already satisfied: packaging>=20.0 in c:\users\admin\appdata\roaming\python\python311\site-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (23.1)
Requirement already satisfied: pillow>=8 in c:\users\admin\appdata\local\programs\python\python311\lib\site-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (10.1.0)
Requirement already satisfied: pyparsing>=2.3.1 in c:\users\admin\appdata\local\programs\python\python311\lib\site-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (3.1.1)
Requirement already satisfied: python-dateutil>=2.7 in c:\users\admin\appdata\roaming\python\python311\site-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (2.8.2)
Requirement already satisfied: joblib>=1.1.1 in c:\users\admin\appdata\local\programs\python\python311\lib\site-packages (from scikit-learn>=1.0.0->yellowbrick) (1.3.2)
Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\admin\appdata\local\programs\python\python311\lib\site-packages (from scikit-learn>=1.0.0->yellowbrick) (3.2.0)
Requirement already satisfied: six>=1.5 in c:\users\admin\appdata\roaming\python\python311\site-packages (from python-dateutil>=2.7->matplotlib!=3.0.0,>=2.0.2->yellowbrick) (1.16.0)

# Load the mall-customer records from the CSV next to the notebook.
csv_path = 'Mall_Customers.csv'
df = pd.read_csv(csv_path)
df.head()

# Summary of columns, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   CustomerID              200 non-null    int64 
 1   Gender                  200 non-null    object
 2   Age                     200 non-null    int64 
 3   Annual Income (k$)      200 non-null    int64 
 4   Spending Score (1-100)  200 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 7.9+ KB

# Per-column dtypes of the raw frame (Gender is the only non-numeric column).
df.dtypes

CustomerID                 int64
Gender                    object
Age                        int64
Annual Income (k$)         int64
Spending Score (1-100)     int64
dtype: object

# How many customers fall into each Gender category.
class_counts = df['Gender'].value_counts()

# Pie-chart options collected in one place; slices follow the order of
# class_counts (most frequent category first).
pie_options = {
    'labels': class_counts.index,
    'autopct': '%1.1f%%',
    'startangle': 140,
    'colors': ['lightblue', 'pink'],
}

plt.figure(figsize=(6, 6))
plt.pie(class_counts, **pie_options)
plt.title('Distribution of Gender')
plt.show()

# CustomerID is only an identifier, so it is left out of the analysis frame.
df1 = df.drop(columns='CustomerID')

sns.set(font_scale=1)
sns.set_style('ticks')

# Marker size for the off-diagonal scatter panels. Previously this dict was
# assigned after the pairplot call and never passed in, so it had no effect.
plot_kws = {"s": 100}
sns.pairplot(
    df1,
    diag_kind='kde',
    hue='Gender',
    corner=True,
    height=5,
    plot_kws=plot_kws,
)

plt.show()

# One-hot encode Gender; drop_first keeps a single indicator column.
encoded = pd.get_dummies(df1, columns=['Gender'], drop_first=True)

# Rescale every column to the [0, 1] range and keep the column names.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(encoded)
df2 = pd.DataFrame(scaled_values, columns=encoded.columns)
df2.head()

# Baseline K-Means with three clusters on the scaled features.
kmeans_model_1 = KMeans(
    n_clusters=3,
    init='k-means++',
    n_init=10,
    max_iter=300,
    random_state=37,
)
kmeans_model_1.fit(df2)

# Report, in order: inertia, cluster centres, iterations of the best run.
for fitted_attribute in ('inertia_', 'cluster_centers_', 'n_iter_'):
    print(getattr(kmeans_model_1, fitted_attribute))

29.552857611943857
[[1.97115385e-01 3.85245902e-01 7.21173469e-01 1.00000000e+00]
 [3.86504121e-01 3.62704918e-01 5.15579446e-01 4.44089210e-16]
 [6.04567308e-01 3.88661202e-01 2.87840136e-01 1.00000000e+00]]
10

# Elbow analysis: fit K-Means for k = 1..10 and plot the distortion per k.
# A fresh estimator is passed because the visualizer re-fits the estimator it
# wraps for every k, which would otherwise overwrite the already fitted
# kmeans_model_1.
Elbow_Chart = KElbowVisualizer(
    KMeans(init='k-means++', n_init=10, max_iter=300, random_state=37),
    k=(1, 11),
)
Elbow_Chart.fit(df2)
# fit() already draws the curve; show() finalizes the figure (title, axis
# labels, legend) and renders it. Calling draw() again would plot the curve a
# second time and leave the figure unfinalized.
Elbow_Chart.show()

<Axes: >

# Hyper-parameters shared by every K-Means fit in the silhouette sweep.
kmeans_model_2 = dict(init='k-means++', n_init=10, max_iter=300, random_state=37)

# Mean silhouette coefficient for each candidate number of clusters.
candidate_ks = range(2, 11)
silhouette_coef = [
    silhouette_score(df2, KMeans(n_clusters=k, **kmeans_model_2).fit(df2).labels_)
    for k in candidate_ks
]

plt.style.use('Solarize_Light2')
plt.plot(candidate_ks, silhouette_coef)
plt.xticks(candidate_ks)
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Coefficient')
plt.show()

# Silhouette plots for k = 2..7, one per panel of a 3x2 grid.
f, ax = plt.subplots(3, 2, figsize=(15, 15))

# Walk the grid row by row so that k=2 lands top-left and k=7 bottom-right.
for i, panel in zip(range(2, 8), ax.flatten()):
    kmeans_model_3 = KMeans(init='k-means++', n_clusters=i, n_init=10, max_iter=300, random_state=37)

    visualizer = SilhouetteVisualizer(kmeans_model_3, colors='yellowbrick', ax=panel)
    visualizer.fit(df2)
    # finalize() adds the title, axis labels and legend to this panel without
    # triggering a plt.show() per subplot.
    visualizer.finalize()

plt.tight_layout()
plt.show()

# Final K-Means model with four clusters; labels are stored in 'cluster1'.
kmeans_model_4 = KMeans(
    n_clusters=4,
    init='k-means++',
    n_init=10,
    max_iter=300,
    random_state=37,
)
df2['cluster1'] = kmeans_model_4.fit_predict(df2)

# Scatter of (scaled) income against spending score, one colour per cluster.
plt.figure(figsize=(8, 8))
kmeans_cluster_total = df2['cluster1'].max() + 1
for cluster_id in range(kmeans_cluster_total):
    members = df2['cluster1'] == cluster_id
    income = df2.loc[members, 'Annual Income (k$)']
    spending = df2.loc[members, 'Spending Score (1-100)']
    plt.scatter(income, spending, label=f'cluster{cluster_id}')

plt.legend()
plt.title('K means visualization', size=12)
plt.xlabel('Annual Income (k$)', size=10)
plt.ylabel('Spending Score (1-100)', size=10)
plt.show()

# Cluster on the feature columns only. Label columns are excluded, and
# errors='ignore' keeps the cell re-runnable once 'cluster2' already exists.
dbscan_features = df2.drop(columns=['cluster1', 'cluster2'], errors='ignore')

# Fit once and reuse the labels (previously the model was fitted twice:
# once by fit() and again by fit_predict()).
DBSCAN_model = DBSCAN(eps=0.7, min_samples=5).fit(dbscan_features)
df2['cluster2'] = DBSCAN_model.labels_

plt.figure(figsize=(8, 8))

# Iterate over the labels that actually occur. DBSCAN marks noise points with
# -1, which a range starting at 0 would silently leave out of the plot.
for i in sorted(df2['cluster2'].unique()):
    members = df2['cluster2'] == i
    label = 'noise' if i == -1 else 'cluster' + str(i)
    plt.scatter(
        df2.loc[members, 'Annual Income (k$)'],
        df2.loc[members, 'Spending Score (1-100)'],
        label=label,
    )

plt.legend()
plt.title('DBSCAN visualization', size=12)
plt.xlabel('Annual Income (k$)', size=10)
plt.ylabel('Spending Score (1-100)', size=10)
plt.show()

def _summarize_clusters(frame, label_column):
    """Return per-cluster feature means plus the cluster size ('cnt').

    The size is computed inside the same aggregation as the means, so it is
    tied to the correct cluster label. Assigning a label-indexed count Series
    onto the reset index only lines up when the labels are exactly 0..n-1,
    which fails for a DBSCAN noise label of -1.
    """
    return (
        frame.groupby(label_column)
        .agg(**{
            'Age': ('Age', 'mean'),
            'Annual Income (k$)': ('Annual Income (k$)', 'mean'),
            'Spending Score (1-100)': ('Spending Score (1-100)', 'mean'),
            'Gender_Male': ('Gender_Male', 'mean'),
            'cnt': ('Age', 'count'),
        })
        .reset_index()
    )


# Means are taken over the columns of df2 as stored there.
df_kmeans = _summarize_clusters(df2, 'cluster1')
df_kmeans.head()

df_DBSCAN = _summarize_clusters(df2, 'cluster2')
df_DBSCAN.head()

	Age	Annual Income (k$)	Spending Score (1-100)	Gender_Male
0	0.019231	0.000000	0.387755	1.0
1	0.057692	0.000000	0.816327	1.0
2	0.038462	0.008197	0.051020	0.0
3	0.096154	0.008197	0.775510	0.0
4	0.250000	0.016393	0.397959	0.0

K-Means Clustering of Mall Customers

Johann Sebastian Catalla, BSCS-II

About The Dataset

Data Exploration and Preparation

Feature Engineering

Building the Model

Model Evaluation

	CustomerID	Gender	Age	Annual Income (k$)	Spending Score (1-100)
0	1	Male	19	15	39
1	2	Male	21	15	81
2	3	Female	20	16	6
3	4	Female	23	16	77
4	5	Female	31	17	40

	cluster1	Age	Annual Income (k$)	Spending Score (1-100)	Gender_Male	cnt
0	0	0.604567	0.388661	0.287840	1.0	48
1	1	0.579021	0.359165	0.344712	0.0	55
2	2	0.197115	0.385246	0.721173	1.0	40
3	3	0.200742	0.366120	0.680451	0.0	57

	cluster2	Age	Annual Income (k$)	Spending Score (1-100)	Gender_Male	cnt
0	0	0.419362	0.387109	0.484810	1.0	88
1	1	0.386504	0.362705	0.515579	0.0	112