# DBSCAN = Density-Based Spatial Clustering of Applications with Noise
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
# DBSCAN requires only two parameters: epsilon and minPoints.
# Epsilon is the radius of the circle to be created around each data point
# to check the density and minPoints is the minimum number of data points required inside
# that circle for that data point to be classified as a Core point.
# Process to find the Epsilon
# In this plot, we are searching for the elbow, which is around 4.0.
# X axis = the index of each data point (sorted)
# Y axis = the distance to each point's k-th nearest neighbor
distances
array([2.0847551 , 2.11838808, 2.14717458, 2.16074732, 2.17551081,
2.17930977, 2.17978959, 2.18103243, 2.18231952, 2.19404957,
2.20078274, 2.23112914, 2.25320308, 2.25392444, 2.26016396,
2.31332859, 2.32744184, 2.33280943, 2.33280943, 2.33480049,
2.35403621, 2.35733234, 2.36256452, 2.37187907, 2.38702297,
2.38997765, 2.39472528, 2.41123176, 2.41456633, 2.41485549,
2.4182579 , 2.42812807, 2.43138578, 2.4331107 , 2.43389867,
2.46066161, 2.46414882, 2.46665055, 2.47254301, 2.47355635,
2.47943401, 2.47987263, 2.48513511, 2.49594573, 2.50512773,
2.50870518, 2.51618667, 2.52132942, 2.52485457, 2.53939292,
2.5496691 , 2.55529268, 2.55621275, 2.57223206, 2.57236061,
2.5781926 , 2.58191018, 2.58845626, 2.59563063, 2.61540864,
2.6167305 , 2.62981556, 2.62994077, 2.62994077, 2.63102206,
2.63102206, 2.63983621, 2.64132752, 2.64168542, 2.64524885,
2.67561238, 2.67922764, 2.68512475, 2.69245409, 2.69721918,
2.70058162, 2.70088118, 2.71301599, 2.7157602 , 2.716663 ,
2.72279017, 2.72320913, 2.74636321, 2.75744396, 2.75894677,
2.77362763, 2.77362763, 2.78055721, 2.79686396, 2.80215255,
2.8031427 , 2.80910032, 2.81417689, 2.81800792, 2.82160905,
2.83295934, 2.83913471, 2.84250465, 2.85154218, 2.85234083,
2.8585593 , 2.85898348, 2.85939281, 2.8767392 , 2.87717705,
2.89659792, 2.89954197, 2.90212015, 2.90524103, 2.90677558,
2.93178856, 2.93178856, 2.94109433, 2.94486454, 2.95255402,
2.96169124, 2.97857229, 2.98047849, 2.99221634, 2.99267023,
2.99766378, 3.00936458, 3.00964096, 3.03655175, 3.04528731,
3.04642375, 3.06290883, 3.06922521, 3.07793942, 3.09770036,
3.11433827, 3.11822692, 3.11850408, 3.14061153, 3.1430512 ,
3.16750927, 3.17058369, 3.17113313, 3.18308832, 3.21396303,
3.21458016, 3.21842566, 3.21935285, 3.21954708, 3.28545271,
3.29334962, 3.31978342, 3.37458266, 3.39393028, 3.41363349,
3.41895422, 3.45622763, 3.479422 , 3.4935557 , 3.52602147,
3.57019033, 3.60587952, 3.65141525, 3.66035204, 3.72162071,
3.73816916, 3.77181725, 3.77991494, 3.85954861, 3.99339484,
4.00103218, 4.01169834, 4.01681451, 4.04193215, 4.34101454,
4.39465616, 4.83259114, 4.89088241, 5.00174458, 5.00513627,
5.2360924 , 5.31367718, 5.6955564 ])
# Heuristic for DBSCAN's min_samples: number of features + 1.
k = len(X_df.columns) + 1

# Fit a k-NN model and take each point's distance to its k-th neighbor;
# sorting those distances produces the "elbow" curve used to pick eps.
knn = NearestNeighbors(n_neighbors=k).fit(X_prep)
knn_distances, _ = knn.kneighbors(X_prep)
distances = np.sort(knn_distances[:, -1], axis=0)

plt.plot(distances)
plt.show()
# CREATING THE DBSCAN MODEL:
# Cluster the preprocessed data using the eps value read from the
# k-distance elbow plot above; min_samples reuses k from that cell.
dbscan = DBSCAN(eps=4, min_samples=k)
clusters = dbscan.fit_predict(X_prep)

# Attach the labels and inspect cluster sizes (label -1 = noise).
X_df['clusters'] = clusters
X_df['clusters'].value_counts()
0 172
-1 6
Name: clusters, dtype: int64
# The "-1    6" row means 6 data points were labeled as noise (they did not fit any cluster).
# Core point: A point is a core point if there are at least minPts number of points (including the point itself) in its surrounding area with radius eps.
# Border point: A point is a border point if it is reachable from a core point and there are less than minPts number of points within its surrounding area.
# Outlier: A point is an outlier if it is not a core point and not reachable from any core points.
# How to save the scaler and the fitted model as pickle files.
import pickle
from sklearn.preprocessing import StandardScaler

# Replace StandardScaler with the scaler of your choice.
scaler = StandardScaler()
scaler.fit(X)

scaler_filename = 'scaler_sd.pkl'
# Use a context manager so the file is flushed and closed even if
# pickling raises (the original left the handle open).
with open(scaler_filename, 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

X_prep = scaler.transform(X)

# Fitting can take some time, especially if you have lots of columns.
model = DBSCAN(eps=3.5, min_samples=k)
model.fit(X_prep)

model_filename = 'dbscan.pkl'
with open(model_filename, 'wb') as model_file:
    pickle.dump(model, model_file)
# Here we load the pickle files that we saved previously.
# NOTE(review): pickle.load executes arbitrary code — only load files
# you created yourself or obtained from a trusted source.
scaler_filename = 'scaler_sd.pkl'
# Context managers guarantee the file handles are closed after reading.
with open(scaler_filename, 'rb') as scaler_file:
    loaded_scaler = pickle.load(scaler_file)

model_filename = 'dbscan.pkl'
with open(model_filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)