# ===== DATA PREPROCESSING STEP BY STEP =====

# --- Initial inspection ---
data.info()
# column dtypes and non-null counts per column

data.describe()
# summary statistics for the numeric columns

# --- Outlier removal on 'IC1' via the IQR rule ---
q3 = np.percentile(data['IC1'], 75)  # 3rd quartile
q1 = np.percentile(data['IC1'], 25)  # 1st quartile
iqr = q3 - q1
# interquartile range (the original used `iqr` before ever defining it)

upper_limit = q3 + 1.5 * iqr
lower_limit = q1 - 1.5 * iqr
# the classic Tukey fences: anything outside them is treated as an outlier

data = data[(data['IC1'] > lower_limit) & (data['IC1'] < upper_limit)]
# keep only the rows inside the fences
# (the original computed this filter but never assigned it back)

sns.histplot(data['IC1'])
plt.show()
# histogram to eyeball whether 'IC1' is roughly normally distributed

# ===== Logarithmic Function / Transforming Outliers =====

data['HV1_log'] = np.log(data['HV1'])
# log transform: instead of removing the outliers, it compresses them.
# NOTE(review): the original line was garbled ("sns.hnp.log(...)"); reconstructed
# as the transform that creates 'HV1_log', which later cells rely on -- confirm.
sns.histplot(data['HV1_log'])
plt.show()

# stats.boxcox(data['HV1'])
# NOTE(review): boxcox raises ValueError when the data contains non-positive
# values, which (per the original comment) it does here -- the cleaning steps
# below must run first, so the raw call is left commented out.

n_nonpositive = len(data[data['HV1'] <= 0])
# how many rows are <= 0; these must be handled before boxcox can run

data['HV1_temp'] = np.where(data['HV1'] <= 0, 0, data['HV1'])
# working copy of 'HV1' with every non-positive value collapsed to 0
# (the original computed this once unassigned, then recomputed it)

data['HV1_temp'] = data['HV1_temp'].replace(0, data['HV1'].mean())
# replace the zeros with the column mean so every value is strictly positive
# NOTE(review): this assumes data['HV1'].mean() > 0 -- confirm for this dataset

arr, lmbda = stats.boxcox(data['HV1_temp'])
# boxcox returns two values: the transformed array (arr) and the fitted
# lambda exponent (lmbda); only `arr` is needed downstream

data['HV1_log'].value_counts()
# inspect the value counts -- the -inf entries produced by log(0) show up here

def replace_inf(i):
    """Return `i` unchanged if it is finite, otherwise NaN.

    Used to map the -inf values produced by log(0) to NaN so they can
    later be filled with .fillna().
    """
    if np.isfinite(i):
        return i
    # np.NAN was removed in NumPy 2.0; np.nan is the canonical spelling
    return np.nan

data['HV1_log'] = data['HV1_log'].map(replace_inf)
# turn every infinite value in 'HV1_log' into NaN
# (the original built list(map(...)) but never assigned it back)

data['HV1_log'] = data['HV1_log'].fillna(data['HV1_log'].mean())
# fill the NaNs with the column mean; note the mean is taken AFTER the
# -inf values were removed -- a mean over a column still containing -inf
# would itself be -inf

data_minmax = pd.DataFrame(
    MinMaxScaler().fit_transform(data),
    columns=data.columns,
)
# min-max scaling: every feature rescaled into [0, 1]
# (fit_transform == fit(...).transform(...) in one pass; the original
#  ran the unassigned fit/transform twice and dropped the column names)

data_standard = pd.DataFrame(
    StandardScaler().fit_transform(data),
    columns=data.columns,
)
# standardisation: each feature becomes a z-score (mean 0, std 1),
# i.e. on the scale of the standard normal distribution

# ===== Modelling =====

# --- Baseline OLS on the raw feature matrix ---
X = data.drop(['TARGET_D'], axis=1)
Y = data['TARGET_D']
model = sm.OLS(Y, X).fit()
# NOTE(review): no sm.add_constant(X), so the model has no intercept -- confirm intended

# --- Encoding the categorical 'Nationality' column ---
labels = LabelEncoder().fit_transform(data['Nationality'])
# one integer label per distinct nationality
# (the original discarded this result; fit_transform avoids fitting twice)

nationality = pd.DataFrame(data['Nationality'])
one_hot = pd.DataFrame(
    OneHotEncoder().fit(nationality).transform(nationality).toarray()
)
# one-hot encoding: one 0/1 column per distinct nationality; .toarray()
# densifies the sparse matrix so it can live in a DataFrame
# (the original re-fitted a brand-new encoder for every intermediate display)

numerical_x.head()
one_hot.head()
# quick look at the numeric features and the one-hot columns
# (numerical_x is built in a cell outside this view -- presumably the
#  scaled numeric columns; confirm)

X = pd.DataFrame(np.concatenate((numerical_x, one_hot), axis=1))
# final design matrix: numeric columns side by side with the one-hot columns
Y = data['TARGET_D'].reset_index(drop=True)
# reset the index so X (fresh 0..n-1 index) and Y line up row by row

x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.4, random_state=100
)
# test_size=0.4: fraction of the rows held out for evaluation
# random_state=100 pins the shuffle, so the split -- and therefore the
# fitted OLS model -- is reproducible across re-runs

model = sm.OLS(y_train, x_train).fit()
# fit on the training rows only
# NOTE(review): no sm.add_constant -- the model is fitted without an intercept

predictions = model.predict(x_test)
# predicted TARGET_D values for the held-out rows

r2_score(y_test, predictions)
# coefficient of determination on the held-out set: the final quality check

# Re-fit the model from the exported, fully pre-processed CSV files.
X = pd.read_csv('final_data.csv')
Y = pd.read_csv('final_Y.csv') 
X_train, X_test, y_train, y_test =train_test_split(X,Y,test_size=0.4,random_state=100)
# random_state=100 keeps the split (and the results below) reproducible

model = sm.OLS(y_train,X_train).fit()
predictions = model.predict(X_test)
# fit OLS on the training rows, then predict the held-out rows
# NOTE(review): no sm.add_constant -- the model has no intercept; confirm intended

r2_score(y_test,predictions)
# r2 score = coefficient of determination:
# the fraction of the variation in y explained by X (~40% per the author)

mse = mean_squared_error(y_test,predictions)
print(mse)
# Mean squared error (units of TARGET_D, squared)

rmse = math.sqrt(mse)
print(rmse)
# Root mean squared error -- back on the original scale of TARGET_D

# R-Squared formula: R^2 = 1 - SS_res / SS_tot