# Example of Logistic Regression
import pymysql
from sqlalchemy import create_engine
import pandas as pd
import getpass
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import Normalizer
import datetime as dt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
# NOTE: `!pip install imblearn` is an IPython shell magic and is not valid in a
# .py file; install the package beforehand (pip install imbalanced-learn) —
# it must be available before the imblearn imports above.
from imblearn.over_sampling import SMOTE
# --- Data extraction --------------------------------------------------------
# Prompt interactively for the MySQL password so it never lives in the source.
password = getpass.getpass()
# SQLAlchemy URL for the local 'bank' schema, root user, pymysql driver.
connection_string = 'mysql+pymysql://root:' + password + '@localhost/bank'
engine = create_engine(connection_string)
# One row per client/loan: demographics, account frequency, per-account
# transaction aggregates (CTE 'client_summary'), card info (LEFT JOIN — may be
# NULL for clients without a card) and loan attributes.  'loan_status' is the
# label we will predict.
query = '''
with client_summary as (
select
account_id,
avg(amount) as avg_trans_amount,
count(trans_id) as trans_count,
avg(balance) as avg_balance
from trans
group by 1
)
select
c.client_id,
left(c.birth_number, 2) as age,
c.district_id,
a.frequency,
convert(a.date,date) as account_start_date,
convert(left(cd.issued,6),date) as card_issued_date,
convert(l.date,date) as loan_start_date,
datediff(convert(l.date,date),convert(a.date,date)) as days_between,
cs.avg_trans_amount,
cs.avg_balance,
cs.trans_count,
cd.type as card_type,
l.amount as loan_amount,
l.duration as loan_duration,
l.payments as loan_payments,
l.status as loan_status
from client c
join disp d on c.client_id = d.client_id
join account a on d.account_id = a.account_id
join loan l on l.account_id = a.account_id
left join card cd on cd.disp_id = d.disp_id
join client_summary cs on cs.account_id = a.account_id
'''
# Run the query against the bank schema and load the result as a DataFrame.
data = pd.read_sql_query(query, engine)
data.head()
# --- Target inspection and cleaning ----------------------------------------
# loan_status is the target (Y):
#   A: Contract finished, no problems.
#   B: Contract finished, loan not paid.
#   C: Running contract, OK so far.
#   D: Running contract, client in debt.
data['loan_status'].value_counts()

data.shape         # 827 rows x 16 columns
data.dtypes        # column types before conversion
data.isna().sum()  # null check — only the card columns contain NaNs

# Drop the card columns: they are NULL for every client without a card.
data = data.drop(['card_issued_date', 'card_type'], axis=1)

# Type conversions (each done exactly once — the original repeated them).
data['age'] = data['age'].astype('int')
data['account_start_date'] = pd.to_datetime(data['account_start_date'])
data['loan_start_date'] = pd.to_datetime(data['loan_start_date'])

# Dates -> proleptic-Gregorian ordinals so they can serve as numeric features.
# (The redundant second pd.to_datetime wrap of loan_start_date was removed —
# the column is already datetime at this point.)
data['account_start_date'] = data['account_start_date'].map(dt.datetime.toordinal)
data['loan_start_date'] = data['loan_start_date'].map(dt.datetime.toordinal)

# district_id is a categorical code, not a quantity — treat it as object so it
# is one-hot encoded later instead of being used numerically.
data['district_id'] = data['district_id'].astype('object')

# client_id is a pure identifier with no predictive value.
data = data.drop('client_id', axis=1)

data.dtypes        # verify the conversions
data.isna().sum()  # verify no nulls remain
# --- Correlation and distributions -----------------------------------------
# numeric_only=True: the frame still holds object columns (frequency,
# district_id, loan_status); required on pandas >= 2, matches the old default.
corr_matrix = data.corr(numeric_only=True)
sns.heatmap(corr_matrix, annot=True, annot_kws={"fontsize": 8})
plt.show()
# 'account_start_date' and 'loan_start_date' are highly correlated with each
# other, so drop both.  (Fixed axis=True -> axis=1.)
data = data.drop(['account_start_date', 'loan_start_date'], axis=1)

# Distribution of each numeric feature.
sns.histplot(data['avg_trans_amount'])
plt.show()
sns.histplot(data['avg_balance'])
plt.show()
sns.histplot(data['days_between'])
plt.show()
sns.histplot(data['trans_count'])
plt.show()
# BUG FIX: the original plotted x['avg_trans_amount'], but 'x' is not defined
# until the normalization step below — plot from 'data' instead.
sns.histplot(data['avg_trans_amount'])
plt.show()
sns.pairplot(data)
plt.show()
# --- Feature matrix ---------------------------------------------------------
# Numeric features only.
X = data.select_dtypes(include=np.number)
X.head()

# L2-normalize each sample (row) of the numeric features.
transformer = Normalizer().fit(X)
x_normalized = transformer.transform(X)
x = pd.DataFrame(x_normalized, columns=X.columns)
x.head()

# Categorical features: every object column except the target itself.
cat = data.select_dtypes(include=object)
cat = cat.drop(['loan_status'], axis=1)
# One-hot encode the two categorical columns.
categorical = pd.get_dummies(cat, columns=['district_id', 'frequency'])
categorical

# BUG FIX: the original concatenated the *raw* numeric features (X), which
# silently discarded the Normalizer output — use the normalized matrix 'x'.
X = np.concatenate((x, categorical), axis=1)
y = data['loan_status']

# 60/40 train/test split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
# --- Model fitting and evaluation ------------------------------------------
# Fit a logistic-regression classifier on the training split.
classification = LogisticRegression().fit(X_train,y_train)
# Predicted loan_status labels for the held-out rows.
predictions = classification.predict(X_test)
predictions
# Mean accuracy on the test split (classifier .score is accuracy, not r^2).
classification.score(X_test,y_test)
# Class distribution of the true test labels...
y_test.value_counts()
# ...versus the distribution of the predicted labels.
pd.Series(predictions).value_counts()
# Raw confusion matrix (rows = true classes, columns = predicted).
confusion_matrix(y_test, predictions)
# Accuracy / Imbalance
# Precision = TP / (TP + FP)
# Recall    = TP / (TP + FN)
# Imbalanced classification: the number of training examples per class label
# is not (close to) equal — the class distribution is skewed.
# Plot the confusion matrix using the classifier's own class ordering so the
# axis labels match the matrix rows/columns.
cm = confusion_matrix(y_test, predictions, labels=classification.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
display_labels=classification.classes_)
disp.plot()
plt.show()
# Correct predictions (diagonal) and total predictions...
np.diag(cm).sum()
cm.sum()
# ...give overall accuracy recomputed from the matrix: trace / total.
np.diag(cm).sum() / cm.sum()
# --- Churn dataset: class-imbalance demos -----------------------------------
churnData = pd.read_csv('customer_churn.csv')
churnData.head(5)
churnData['Churn'].value_counts()  # heavy imbalance between 'Yes' and 'No'

numericData = churnData[['tenure', 'SeniorCitizen', 'MonthlyCharges']]
Y = pd.DataFrame(data=churnData, columns=['Churn'])  # NOTE(review): unused below

# Baseline: scale the three numeric features and fit on the imbalanced data.
transformer = StandardScaler().fit(churnData[['tenure', 'SeniorCitizen', 'MonthlyCharges']])
scaled_x = transformer.transform(churnData[['tenure', 'SeniorCitizen', 'MonthlyCharges']])
classification = LogisticRegression().fit(scaled_x, churnData['Churn'])
classification.score(scaled_x, churnData['Churn'])

# Manual resampling: keep only 500 random 'Yes' rows next to all 'No' rows.
yes = churnData[churnData['Churn'] == 'Yes']
no = churnData[churnData['Churn'] == 'No']
yes = yes.sample(500)
data = pd.concat([yes, no], axis=0)
print(data['Churn'].value_counts())
data.head()

numericData = data[['tenure', 'SeniorCitizen', 'MonthlyCharges']]
transformer = StandardScaler().fit(data[['tenure', 'SeniorCitizen', 'MonthlyCharges']])
scaled_x = transformer.transform(data[['tenure', 'SeniorCitizen', 'MonthlyCharges']])
classification = LogisticRegression().fit(scaled_x, data['Churn'])
classification.score(scaled_x, data['Churn'])

# Manual oversampling: resample 'Yes' with replacement up to the majority count.
counts = churnData['Churn'].value_counts()
# FIX: value_counts() has string labels; counts[0] relied on deprecated
# positional fallback (removed in modern pandas) — use .iloc for position.
counts = counts.iloc[0]  # size of the most frequent class
yes = churnData[churnData['Churn'] == 'Yes'].sample(counts, replace=True)
no = churnData[churnData['Churn'] == 'No']
data = pd.concat([yes, no], axis=0)
data['Churn'].value_counts()

X = data[['tenure', 'SeniorCitizen', 'MonthlyCharges']]
transformer = StandardScaler().fit(X)
X = transformer.transform(X)
classification = LogisticRegression(random_state=0, solver='lbfgs',
                                    multi_class='ovr').fit(X, data['Churn'])
classification.score(X, data['Churn'])
# --- Undersampling with imblearn --------------------------------------------
# RandomUnderSampler balances the classes by keeping all of the minority class
# and dropping majority rows.
# FIX: the raw class counts below were pasted notebook *output* — as bare
# statements ("0 90569") they are Python syntax errors; kept as comments:
#   0    90569
#   1     4843
rus = RandomUnderSampler()
X = churnData[['tenure', 'SeniorCitizen', 'MonthlyCharges']]
transformer = StandardScaler().fit(X)
X = transformer.transform(X)
y = churnData['Churn']
X_rus, y_rus = rus.fit_resample(X, y)
y.value_counts()
y_rus.value_counts()
# Balanced counts after resampling (pasted notebook output, now a comment):
#   0    4843
#   1    4843
transformer = StandardScaler().fit(X_rus)
X = transformer.transform(X_rus)
classification = LogisticRegression(random_state=0, solver='lbfgs',
                                    multi_class='ovr').fit(X, y_rus)
classification.score(X, y_rus)
# --- Oversampling with imblearn ---------------------------------------------
# RandomOverSampler balances the classes by duplicating minority rows —
# useful when the amount of data collected is insufficient.
ros = RandomOverSampler()
X = churnData[['tenure', 'SeniorCitizen','MonthlyCharges']]
transformer = StandardScaler().fit(X)
X = transformer.transform(X)
y = churnData['Churn']
X_ros, y_ros = ros.fit_resample(X, y)
# Class distribution before vs after oversampling.
y.value_counts()
y_ros.value_counts()
# Re-scale the resampled features and refit.
transformer = StandardScaler().fit(X_ros)
X = transformer.transform(X_ros)
classification = LogisticRegression(random_state=0, solver='lbfgs',
multi_class='ovr').fit(X, y_ros)
classification.score(X, y_ros)
# Synthetic Minority Oversampling Technique (SMOTE): generates new synthetic
# minority samples rather than duplicating existing rows.
smote = SMOTE()
X = churnData[['tenure', 'SeniorCitizen','MonthlyCharges']]
transformer = StandardScaler().fit(X)
X = transformer.transform(X)
y = churnData['Churn']
X_sm, y_sm = smote.fit_resample(X, y)
y_sm.value_counts()
# --- Downsampling with a defined fraction -----------------------------------
# Reduce the majority class, but to a chosen multiple of the minority size
# rather than exact balance.
# NOTE(review): 'TARGET_B' is not a column of any DataFrame built above —
# this fragment appears to belong to a different dataset; confirm which
# DataFrame it should run against before executing.
category_0 = data[data['TARGET_B'] == 0]
category_1 = data[data['TARGET_B'] == 1]
# Keep twice as many majority rows as there are minority rows.
category_0_new = category_0.sample(len(category_1) * 2)
# FIX: resulting counts were pasted notebook output (syntax errors as bare
# statements); kept as a comment:
#   0    9686
#   1    4843