# DBSCAN = Density-Based Spatial Clustering of Applications with Noise
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
# DBSCAN requires only two parameters: epsilon and minPoints.
# Epsilon is the radius of the circle to be created around each data point
# to check the density and minPoints is the minimum number of data points required inside
# that circle for that data point to be classified as a Core point.
# Process to find the Epsilon
# In this plot, we are searching for the elbow, which is around 4.0.
# X axis = the index of each data point (sorted)
# Y axis = the distance to each point's k-th nearest neighbor
distances
array([2.0847551 , 2.11838808, 2.14717458, 2.16074732, 2.17551081,
2.17930977, 2.17978959, 2.18103243, 2.18231952, 2.19404957,
2.20078274, 2.23112914, 2.25320308, 2.25392444, 2.26016396,
2.31332859, 2.32744184, 2.33280943, 2.33280943, 2.33480049,
2.35403621, 2.35733234, 2.36256452, 2.37187907, 2.38702297,
2.38997765, 2.39472528, 2.41123176, 2.41456633, 2.41485549,
2.4182579 , 2.42812807, 2.43138578, 2.4331107 , 2.43389867,
2.46066161, 2.46414882, 2.46665055, 2.47254301, 2.47355635,
2.47943401, 2.47987263, 2.48513511, 2.49594573, 2.50512773,
2.50870518, 2.51618667, 2.52132942, 2.52485457, 2.53939292,
2.5496691 , 2.55529268, 2.55621275, 2.57223206, 2.57236061,
2.5781926 , 2.58191018, 2.58845626, 2.59563063, 2.61540864,
2.6167305 , 2.62981556, 2.62994077, 2.62994077, 2.63102206,
2.63102206, 2.63983621, 2.64132752, 2.64168542, 2.64524885,
2.67561238, 2.67922764, 2.68512475, 2.69245409, 2.69721918,
2.70058162, 2.70088118, 2.71301599, 2.7157602 , 2.716663 ,
2.72279017, 2.72320913, 2.74636321, 2.75744396, 2.75894677,
2.77362763, 2.77362763, 2.78055721, 2.79686396, 2.80215255,
2.8031427 , 2.80910032, 2.81417689, 2.81800792, 2.82160905,
2.83295934, 2.83913471, 2.84250465, 2.85154218, 2.85234083,
2.8585593 , 2.85898348, 2.85939281, 2.8767392 , 2.87717705,
2.89659792, 2.89954197, 2.90212015, 2.90524103, 2.90677558,
2.93178856, 2.93178856, 2.94109433, 2.94486454, 2.95255402,
2.96169124, 2.97857229, 2.98047849, 2.99221634, 2.99267023,
2.99766378, 3.00936458, 3.00964096, 3.03655175, 3.04528731,
3.04642375, 3.06290883, 3.06922521, 3.07793942, 3.09770036,
3.11433827, 3.11822692, 3.11850408, 3.14061153, 3.1430512 ,
3.16750927, 3.17058369, 3.17113313, 3.18308832, 3.21396303,
3.21458016, 3.21842566, 3.21935285, 3.21954708, 3.28545271,
3.29334962, 3.31978342, 3.37458266, 3.39393028, 3.41363349,
3.41895422, 3.45622763, 3.479422 , 3.4935557 , 3.52602147,
3.57019033, 3.60587952, 3.65141525, 3.66035204, 3.72162071,
3.73816916, 3.77181725, 3.77991494, 3.85954861, 3.99339484,
4.00103218, 4.01169834, 4.01681451, 4.04193215, 4.34101454,
4.39465616, 4.83259114, 4.89088241, 5.00174458, 5.00513627,
5.2360924 , 5.31367718, 5.6955564 ])
# Heuristic for DBSCAN's min_samples: number of features + 1.
k = len(X_df.columns) + 1

# Fit a k-NN model and take each point's distance to its k-th neighbor;
# sorting those distances produces the "elbow" curve used to pick eps.
knn = NearestNeighbors(n_neighbors=k).fit(X_prep)
knn_distances, _ = knn.kneighbors(X_prep)
distances = np.sort(knn_distances[:, -1], axis=0)

plt.plot(distances)
plt.show()
# CREATING THE DBSCAN MODEL:
# Cluster the preprocessed data using the eps value read from the
# k-distance elbow plot above; min_samples reuses k from that cell.
dbscan = DBSCAN(eps=4, min_samples=k)
clusters = dbscan.fit_predict(X_prep)

# Attach the labels and inspect cluster sizes (label -1 = noise).
X_df['clusters'] = clusters
X_df['clusters'].value_counts()
0 172
-1 6
Name: clusters, dtype: int64
# The "-1    6" row means 6 data points were labeled as noise (they did not fit any cluster).
# Core point: A point is a core point if there are at least minPts number of points (including the point itself) in its surrounding area with radius eps.
# Border point: A point is a border point if it is reachable from a core point and there are less than minPts number of points within its surrounding area.
# Outlier: A point is an outlier if it is not a core point and not reachable from any core points.
# How to save the scaler and the fitted model as pickle files.
import pickle
from sklearn.preprocessing import StandardScaler

# Replace StandardScaler with the scaler of your choice.
scaler = StandardScaler()
scaler.fit(X)

scaler_filename = 'scaler_sd.pkl'
# Use a context manager so the file is flushed and closed even if
# pickling raises (the original left the handle open).
with open(scaler_filename, 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

X_prep = scaler.transform(X)

# Fitting can take some time, especially if you have lots of columns.
model = DBSCAN(eps=3.5, min_samples=k)
model.fit(X_prep)

model_filename = 'dbscan.pkl'
with open(model_filename, 'wb') as model_file:
    pickle.dump(model, model_file)
# Here we load the pickle files that we saved previously.
# NOTE(review): pickle.load executes arbitrary code — only load files
# you created yourself or obtained from a trusted source.
scaler_filename = 'scaler_sd.pkl'
# Context managers guarantee the file handles are closed after reading.
with open(scaler_filename, 'rb') as scaler_file:
    loaded_scaler = pickle.load(scaler_file)

model_filename = 'dbscan.pkl'
with open(model_filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)