Data Cleaning 1.0

# Overview: row selection with plain slices, iloc and loc, and selecting the numeric columns of a DataFrame (each snippet appears in full below).


# DATA CLEANING TOOLS:

# Data Cleaning Steps:
# Null Values
# Knowledge Importance
# Duplicate Data
# Multicollinearity
# Variance Threshold
# K-Best
# Recursive Feature Elimination

data = pd.read_csv('filename.csv')
data
# Read the CSV file into the DataFrame `data`; the bare `data` line shows it
# when the snippet is run interactively.

data = pd.concat([file1,file2])
# Concatenate file1 and file2 (stacked row-wise, the pd.concat default) and
# rebind `data` to the result. file1/file2 are presumably DataFrames loaded
# elsewhere - they are not defined in this section.

# Quick inspection of the DataFrame. Each accessor needs a receiver; a bare
# `.head()` on its own line is a syntax error.
data.head()     # first rows (5 by default)
data.tail()     # last rows (5 by default)
data.columns    # column labels
data.shape      # (number of rows, number of columns)
data.dtypes     # dtype of every column

# Standardize all column names to lowercase.
# A comprehension over the labels replaces the index-based range(len(...)) loop.
cols = [name.lower() for name in data.columns]
data.columns = cols

# Remove the 'tcode' column. inplace=True modifies `data` itself instead of
# returning a new DataFrame, so the column is gone for good.
data.drop(columns=['tcode'], inplace=True)

# Filtering: keep only the rows whose 'gender' value is 'M' or 'F'.
# The result is stored as `filtered` because the indexing examples further
# down use that name; without the assignment it would be undefined.
filtered = data[data['gender'].isin(['M', 'F'])]
filtered

# Element-wise comparison: yields a boolean Series (True where target_d > 100),
# usable as a row mask, e.g. data[data['target_d'] > 100].
data['target_d'] > 100

filtered[1:4]
# An integer slice on a DataFrame selects rows by position:
# rows 1, 2 and 3 (the end of the slice is excluded).

filtered.iloc[1:4]
# iloc selects by integer position, ignoring the index labels:
# the rows at positions 1, 2 and 3.

rows = [0,1]
cols = ['id']
filtered.loc[rows,cols]
# loc selects by label: the rows whose index labels are 0 and 1, and the
# column named 'id'.
# NOTE(review): if those index labels were removed by the filtering step,
# loc raises KeyError - confirm they exist in `filtered`.

# All numeric columns of the DataFrame, selected through the public API
# rather than the private DataFrame._get_numeric_data() helper.
# NOTE: include='number' does not pick up bool columns, which the private
# helper treats as numeric; add 'bool' to the include list if they are needed.
data.select_dtypes(include='number')

data.select_dtypes('object')
# Selects the columns whose dtype is object (typically text). Other
# non-numeric dtypes such as category or datetime are not included.

pd.to_numeric(data['median_home_val'], errors='coerce')
# Convert the column to a numeric dtype. errors='coerce' replaces every value
# that cannot be parsed with NaN (null) instead of raising an error.
# NOTE: the converted Series is returned, not stored - assign it back to
# data['median_home_val'] to keep the result.

# Check the null values: build a summary table with the share of nulls per column.
# The summary gets its own name so the working DataFrame `data` is not overwritten.
# `numerical` is presumably the numeric subset of `data` - it is not defined
# in this section.
null_share = numerical.isna().mean()  # mean of booleans == null count / row count
nulls_df = null_share.rename_axis('column_name').reset_index(name='nulls')
# Show only the columns that contain nulls, highest share first.
nulls_df[nulls_df['nulls'] > 0].sort_values(by='nulls', ascending=False)