# DATA PREPROCESSING STEP BY STEP
data.info()
# Show column dtypes and non-null counts for the DataFrame.
data.describe()
# Summary statistics (count, mean, std, quartiles) for numeric columns.
np.percentile(data['IC1'], 75)
# Value of the 3rd quartile (Q3) of 'IC1'.
np.percentile(data['IC1'], 25)
# Value of the 1st quartile (Q1) of 'IC1'.
np.percentile(data['IC1'], 75) + iqr * 1.5
# Q3 + 1.5*IQR — the Tukey upper fence for outliers.
# NOTE(review): `iqr` is not defined in this chunk — presumably computed
# earlier as Q3 - Q1; confirm against the preceding cells.
upper_limit = np.percentile(data['IC1'], 75) + iqr * 1.5
lower_limit = np.percentile(data['IC1'], 25) - iqr * 1.5
# Fences beyond which an 'IC1' value is treated as an outlier.
data[(data['IC1'] > lower_limit) & (data['IC1'] < upper_limit)]
# Rows whose 'IC1' lies strictly inside the fences (outliers excluded).
# NOTE(review): the filtered frame is displayed but NOT assigned back to
# `data`, so the outliers are not actually removed here — confirm intent.
sns.histplot(data['IC1'])
plt.show()
# Histogram to eyeball whether 'IC1' looks normally distributed.
# Logarithmic Function / Transforming Outliers
# Log / Box-Cox transformation: compress the 'HV1' outliers instead of
# removing them.
sns.histplot(np.log(data['HV1']))
# FIX: the original line was the garbled `sns.hnp.log(data['HV1'])`;
# plotting the histogram of the log-transformed column matches the
# surrounding cells (cf. the sns.histplot call above) and the stated
# intent: the log compresses large outliers rather than dropping rows.
stats.boxcox(data['HV1'])
plt.show()
# Box-Cox requires strictly positive input; if 'HV1' contains values <= 0
# it raises, so the next steps clean those values first.
len(data[data['HV1'] <= 0])
# How many rows are <= 0 (these would make boxcox fail).
np.where(data['HV1'] <= 0, 0, data['HV1'])
# Preview: replace every non-positive 'HV1' value with 0.
data['HV1_temp'] = np.where(data['HV1'] <= 0, 0, data['HV1'])
# Persist the cleaned values in a new working column.
data['HV1'].mean()
# Mean of the original column — used to fill the 0 placeholders.
data['HV1_temp'].replace(0, data['HV1'].mean())
# Preview: swap the 0 placeholders for the column mean.
data['HV1_temp'] = data['HV1_temp'].replace(0, data['HV1'].mean())
# Assign back so 'HV1_temp' is strictly positive, as Box-Cox requires.
arr, lmbda = stats.boxcox(data['HV1_temp'])
# boxcox returns two values: the transformed array (`arr`) and the fitted
# lambda (`lmbda`); only `arr` is needed for the following steps.
data['HV1_log'].value_counts()
# Count distinct values, e.g. to spot -inf entries produced by log(0).
# NOTE(review): 'HV1_log' is never created in this chunk — presumably
# assigned earlier (e.g. as np.log of 'HV1'); confirm.
def replace_inf(i):
    """Return *i* unchanged if it is finite, otherwise NaN.

    Used with map() to turn the -inf values produced by log(0) into NaN
    so they can be filled afterwards with .fillna().
    """
    if np.isfinite(i):
        return i
    # FIX: np.NAN was removed in NumPy 2.0; np.nan is the canonical spelling.
    return np.nan
# Map replace_inf over 'HV1_log' to turn the -inf values into NaN.
list(map(replace_inf, data['HV1_log']))
# Preview list of the cleaned column.
# NOTE(review): the result is displayed but never assigned back to
# data['HV1_log'] — confirm a later cell performs the assignment.
data['HV1_log'].fillna(data['HV1_log'].mean())
# Fill the NaN values with the column mean.
# NOTE(review): fillna returns a new Series; this result is also not
# assigned back — confirm intent.
MinMaxScaler().fit(data).transform(data)
# Min-max scaling: rescale every column into the [0, 1] range.
pd.DataFrame(MinMaxScaler().fit(data).transform(data))
# Same scaled values wrapped back into a DataFrame.
pd.DataFrame(StandardScaler().fit(data).transform(data))
# Standardize to z-scores (mean 0, std 1 — the standard normal scale).
# Modelling
X = data.drop(['TARGET_D'],axis=1)
Y = data['TARGET_D']
# Feature matrix X (everything except the target) and target vector Y.
model = sm.OLS(Y,X).fit()
# Ordinary-least-squares fit on the full data (no train/test split yet).
LabelEncoder().fit(data['Nationality']).transform(data['Nationality'])
# Label encoding: map each 'Nationality' category to an integer code.
OneHotEncoder().fit(pd.DataFrame(data['Nationality'])).transform(pd.DataFrame(data['Nationality']))
# One-hot encoding returns a sparse matrix...
OneHotEncoder().fit(pd.DataFrame(data['Nationality'])).transform(pd.DataFrame(data['Nationality'])).toarray()
# ...and .toarray() densifies it into a regular ndarray.
pd.DataFrame(OneHotEncoder().fit(pd.DataFrame(data['Nationality'])).transform(pd.DataFrame(data['Nationality'])).toarray())
# The dense dummy matrix wrapped in a DataFrame.
one_hot = pd.DataFrame(OneHotEncoder().fit(pd.DataFrame(data['Nationality'])).transform(pd.DataFrame(data['Nationality'])).toarray())
# One dummy column per nationality, kept as `one_hot`.
numerical_x.head()
one_hot.head()
# NOTE(review): `numerical_x` is not defined in this chunk — presumably
# the numeric-only subset of the features from an earlier cell; confirm.
np.concatenate((numerical_x,one_hot), axis=1)
# Stack numeric features and dummy columns side by side (as an ndarray).
pd.DataFrame(np.concatenate((numerical_x,one_hot), axis=1))
# Same concatenation wrapped back into a DataFrame.
X = pd.DataFrame(np.concatenate((numerical_x,one_hot), axis=1))
Y = data['TARGET_D'].reset_index(drop=True)
# Rebuild X and Y; reset_index keeps Y aligned with the re-indexed X.
train_test_split(X,Y,test_size=0.4,random_state=100)
# Preview of the split; the next line binds the four pieces.
x_train, x_test, y_train, y_test = train_test_split(X,Y,test_size=0.4,random_state=100)
# 60/40 train/test split; the fixed random_state makes the shuffle — and
# therefore the OLS result — reproducible across re-runs.
model = sm.OLS(y_train,x_train).fit()
# Fit OLS on the training portion only.
predictions = model.predict(x_test)
predictions
# Predicted target values for the held-out test set.
r2_score(y_test, predictions)
# R^2 of the predictions against the true test targets — the final
# quality check of the model.
X = pd.read_csv('final_data.csv')
Y = pd.read_csv('final_Y.csv')
# Reload a pre-cleaned version of the data from disk and re-run the model.
X_train, X_test, y_train, y_test =train_test_split(X,Y,test_size=0.4,random_state=100)
model = sm.OLS(y_train,X_train).fit()
predictions = model.predict(X_test)
# Re-fit OLS and predict on the reloaded data.
r2_score(y_test,predictions)
# R^2 (coefficient of determination): a value of ~0.40 means 40% of the
# variation in y is explained by X.
mse = mean_squared_error(y_test,predictions)
print(mse)
# Mean squared error.
rmse = math.sqrt(mse)
print(rmse)
# Root mean squared error — same units as the target, easier to interpret.
# R-Squared Formula