# Example of Logistic Regression
import pymysql
from sqlalchemy import create_engine
import pandas as pd
import getpass
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import Normalizer
import datetime as dt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
# NOTE: `!pip install imblearn` is an IPython shell magic and is not valid in a
# .py file; install the package beforehand (pip install imbalanced-learn) —
# it must be available before the imblearn imports above.
from imblearn.over_sampling import SMOTE
# --- Data extraction --------------------------------------------------------
# Prompt interactively for the MySQL password so it never lives in the source.
password = getpass.getpass()
# SQLAlchemy URL for the local 'bank' schema, root user, pymysql driver.
connection_string = 'mysql+pymysql://root:' + password + '@localhost/bank'
engine = create_engine(connection_string)
# One row per client/loan: demographics, account frequency, per-account
# transaction aggregates (CTE 'client_summary'), card info (LEFT JOIN — may be
# NULL for clients without a card) and loan attributes.  'loan_status' is the
# label we will predict.
query = '''
with client_summary as (
select
account_id,
avg(amount) as avg_trans_amount,
count(trans_id) as trans_count,
avg(balance) as avg_balance
from trans
group by 1
)
select
c.client_id,
left(c.birth_number, 2) as age,
c.district_id,
a.frequency,
convert(a.date,date) as account_start_date,
convert(left(cd.issued,6),date) as card_issued_date,
convert(l.date,date) as loan_start_date,
datediff(convert(l.date,date),convert(a.date,date)) as days_between,
cs.avg_trans_amount,
cs.avg_balance,
cs.trans_count,
cd.type as card_type,
l.amount as loan_amount,
l.duration as loan_duration,
l.payments as loan_payments,
l.status as loan_status
from client c
join disp d on c.client_id = d.client_id
join account a on d.account_id = a.account_id
join loan l on l.account_id = a.account_id
left join card cd on cd.disp_id = d.disp_id
join client_summary cs on cs.account_id = a.account_id
'''
# Run the query against the bank schema and load the result as a DataFrame.
data = pd.read_sql_query(query, engine)
data.head()
# --- Target inspection and cleaning ----------------------------------------
# loan_status is the target (Y):
#   A: Contract finished, no problems.
#   B: Contract finished, loan not paid.
#   C: Running contract, OK so far.
#   D: Running contract, client in debt.
data['loan_status'].value_counts()

data.shape         # 827 rows x 16 columns
data.dtypes        # column types before conversion
data.isna().sum()  # null check — only the card columns contain NaNs

# Drop the card columns: they are NULL for every client without a card.
data = data.drop(['card_issued_date', 'card_type'], axis=1)

# Type conversions (each done exactly once — the original repeated them).
data['age'] = data['age'].astype('int')
data['account_start_date'] = pd.to_datetime(data['account_start_date'])
data['loan_start_date'] = pd.to_datetime(data['loan_start_date'])

# Dates -> proleptic-Gregorian ordinals so they can serve as numeric features.
# (The redundant second pd.to_datetime wrap of loan_start_date was removed —
# the column is already datetime at this point.)
data['account_start_date'] = data['account_start_date'].map(dt.datetime.toordinal)
data['loan_start_date'] = data['loan_start_date'].map(dt.datetime.toordinal)

# district_id is a categorical code, not a quantity — treat it as object so it
# is one-hot encoded later instead of being used numerically.
data['district_id'] = data['district_id'].astype('object')

# client_id is a pure identifier with no predictive value.
data = data.drop('client_id', axis=1)

data.dtypes        # verify the conversions
data.isna().sum()  # verify no nulls remain
# --- Correlation and distributions -----------------------------------------
# numeric_only=True: the frame still holds object columns (frequency,
# district_id, loan_status); required on pandas >= 2, matches the old default.
corr_matrix = data.corr(numeric_only=True)
sns.heatmap(corr_matrix, annot=True, annot_kws={"fontsize": 8})
plt.show()
# 'account_start_date' and 'loan_start_date' are highly correlated with each
# other, so drop both.  (Fixed axis=True -> axis=1.)
data = data.drop(['account_start_date', 'loan_start_date'], axis=1)

# Distribution of each numeric feature.
sns.histplot(data['avg_trans_amount'])
plt.show()
sns.histplot(data['avg_balance'])
plt.show()
sns.histplot(data['days_between'])
plt.show()
sns.histplot(data['trans_count'])
plt.show()
# BUG FIX: the original plotted x['avg_trans_amount'], but 'x' is not defined
# until the normalization step below — plot from 'data' instead.
sns.histplot(data['avg_trans_amount'])
plt.show()
sns.pairplot(data)
plt.show()
# --- Feature matrix ---------------------------------------------------------
# Numeric features only.
X = data.select_dtypes(include=np.number)
X.head()

# L2-normalize each sample (row) of the numeric features.
transformer = Normalizer().fit(X)
x_normalized = transformer.transform(X)
x = pd.DataFrame(x_normalized, columns=X.columns)
x.head()

# Categorical features: every object column except the target itself.
cat = data.select_dtypes(include=object)
cat = cat.drop(['loan_status'], axis=1)
# One-hot encode the two categorical columns.
categorical = pd.get_dummies(cat, columns=['district_id', 'frequency'])
categorical

# BUG FIX: the original concatenated the *raw* numeric features (X), which
# silently discarded the Normalizer output — use the normalized matrix 'x'.
X = np.concatenate((x, categorical), axis=1)
y = data['loan_status']

# 60/40 train/test split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
# --- Model fitting and evaluation ------------------------------------------
# Fit a logistic-regression classifier on the training split.
classification = LogisticRegression().fit(X_train,y_train)
# Predicted loan_status labels for the held-out rows.
predictions = classification.predict(X_test)
predictions
# Mean accuracy on the test split (classifier .score is accuracy, not r^2).
classification.score(X_test,y_test)
# Class distribution of the true test labels...
y_test.value_counts()
# ...versus the distribution of the predicted labels.
pd.Series(predictions).value_counts()
# Raw confusion matrix (rows = true classes, columns = predicted).
confusion_matrix(y_test, predictions)
# Accuracy / Imbalance
# Precision = TP / (TP + FP)
# Recall    = TP / (TP + FN)
# Imbalanced classification: the number of training examples per class label
# is not (close to) equal — the class distribution is skewed.
# Plot the confusion matrix using the classifier's own class ordering so the
# axis labels match the matrix rows/columns.
cm = confusion_matrix(y_test, predictions, labels=classification.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
display_labels=classification.classes_)
disp.plot()
plt.show()
# Correct predictions (diagonal) and total predictions...
np.diag(cm).sum()
cm.sum()
# ...give overall accuracy recomputed from the matrix: trace / total.
np.diag(cm).sum() / cm.sum()
# --- Churn dataset: class-imbalance demos -----------------------------------
churnData = pd.read_csv('customer_churn.csv')
churnData.head(5)
churnData['Churn'].value_counts()  # heavy imbalance between 'Yes' and 'No'

numericData = churnData[['tenure', 'SeniorCitizen', 'MonthlyCharges']]
Y = pd.DataFrame(data=churnData, columns=['Churn'])  # NOTE(review): unused below

# Baseline: scale the three numeric features and fit on the imbalanced data.
transformer = StandardScaler().fit(churnData[['tenure', 'SeniorCitizen', 'MonthlyCharges']])
scaled_x = transformer.transform(churnData[['tenure', 'SeniorCitizen', 'MonthlyCharges']])
classification = LogisticRegression().fit(scaled_x, churnData['Churn'])
classification.score(scaled_x, churnData['Churn'])

# Manual resampling: keep only 500 random 'Yes' rows next to all 'No' rows.
yes = churnData[churnData['Churn'] == 'Yes']
no = churnData[churnData['Churn'] == 'No']
yes = yes.sample(500)
data = pd.concat([yes, no], axis=0)
print(data['Churn'].value_counts())
data.head()

numericData = data[['tenure', 'SeniorCitizen', 'MonthlyCharges']]
transformer = StandardScaler().fit(data[['tenure', 'SeniorCitizen', 'MonthlyCharges']])
scaled_x = transformer.transform(data[['tenure', 'SeniorCitizen', 'MonthlyCharges']])
classification = LogisticRegression().fit(scaled_x, data['Churn'])
classification.score(scaled_x, data['Churn'])

# Manual oversampling: resample 'Yes' with replacement up to the majority count.
counts = churnData['Churn'].value_counts()
# FIX: value_counts() has string labels; counts[0] relied on deprecated
# positional fallback (removed in modern pandas) — use .iloc for position.
counts = counts.iloc[0]  # size of the most frequent class
yes = churnData[churnData['Churn'] == 'Yes'].sample(counts, replace=True)
no = churnData[churnData['Churn'] == 'No']
data = pd.concat([yes, no], axis=0)
data['Churn'].value_counts()

X = data[['tenure', 'SeniorCitizen', 'MonthlyCharges']]
transformer = StandardScaler().fit(X)
X = transformer.transform(X)
classification = LogisticRegression(random_state=0, solver='lbfgs',
                                    multi_class='ovr').fit(X, data['Churn'])
classification.score(X, data['Churn'])
# --- Undersampling with imblearn --------------------------------------------
# RandomUnderSampler balances the classes by keeping all of the minority class
# and dropping majority rows.
# FIX: the raw class counts below were pasted notebook *output* — as bare
# statements ("0 90569") they are Python syntax errors; kept as comments:
#   0    90569
#   1     4843
rus = RandomUnderSampler()
X = churnData[['tenure', 'SeniorCitizen', 'MonthlyCharges']]
transformer = StandardScaler().fit(X)
X = transformer.transform(X)
y = churnData['Churn']
X_rus, y_rus = rus.fit_resample(X, y)
y.value_counts()
y_rus.value_counts()
# Balanced counts after resampling (pasted notebook output, now a comment):
#   0    4843
#   1    4843
transformer = StandardScaler().fit(X_rus)
X = transformer.transform(X_rus)
classification = LogisticRegression(random_state=0, solver='lbfgs',
                                    multi_class='ovr').fit(X, y_rus)
classification.score(X, y_rus)
# --- Oversampling with imblearn ---------------------------------------------
# RandomOverSampler balances the classes by duplicating minority rows —
# useful when the amount of data collected is insufficient.
ros = RandomOverSampler()
X = churnData[['tenure', 'SeniorCitizen','MonthlyCharges']]
transformer = StandardScaler().fit(X)
X = transformer.transform(X)
y = churnData['Churn']
X_ros, y_ros = ros.fit_resample(X, y)
# Class distribution before vs after oversampling.
y.value_counts()
y_ros.value_counts()
# Re-scale the resampled features and refit.
transformer = StandardScaler().fit(X_ros)
X = transformer.transform(X_ros)
classification = LogisticRegression(random_state=0, solver='lbfgs',
multi_class='ovr').fit(X, y_ros)
classification.score(X, y_ros)
# Synthetic Minority Oversampling Technique (SMOTE): generates new synthetic
# minority samples rather than duplicating existing rows.
smote = SMOTE()
X = churnData[['tenure', 'SeniorCitizen','MonthlyCharges']]
transformer = StandardScaler().fit(X)
X = transformer.transform(X)
y = churnData['Churn']
X_sm, y_sm = smote.fit_resample(X, y)
y_sm.value_counts()
# --- Downsampling with a defined fraction -----------------------------------
# Reduce the majority class, but to a chosen multiple of the minority size
# rather than exact balance.
# NOTE(review): 'TARGET_B' is not a column of any DataFrame built above —
# this fragment appears to belong to a different dataset; confirm which
# DataFrame it should run against before executing.
category_0 = data[data['TARGET_B'] == 0]
category_1 = data[data['TARGET_B'] == 1]
# Keep twice as many majority rows as there are minority rows.
category_0_new = category_0.sample(len(category_1) * 2)
# FIX: resulting counts were pasted notebook output (syntax errors as bare
# statements); kept as a comment:
#   0    9686
#   1    4843