# Variance Threshold is a feature selector that removes all the low variance features from the dataset that are of no great use in modeling.
# It looks only at the features (x), not the desired outputs (y), and can thus be used for unsupervised learning.
# VARIANCE THRESHOLD EXAMPLE:
from sklearn.feature_selection import VarianceThreshold
from sklearn import preprocessing
# Variance cutoff, named once so the selector and the manual check below
# cannot drift apart.
VARIANCE_THRESHOLD = 0.00000001

# Fit the selector on the numeric features; only the features are used,
# no target is passed.
sel = VarianceThreshold(threshold=VARIANCE_THRESHOLD)
sel = sel.fit(numerical2)
sel
# Features that survive the variance filter, wrapped in a DataFrame for display.
pd.DataFrame(sel.transform(numerical2))
# Boolean mask: True where a column's variance is strictly above the threshold.
sel.variances_ > VARIANCE_THRESHOLD
# Per-column summary. 'statement' holds the selector's support mask
# (True = column kept, False = column removed).
removed_columns = pd.DataFrame(
    data={
        'column_name': numerical2.columns,
        'variance': sel.variances_,
        'statement': sel.get_support(),
    }
)
# Show only the columns the selector dropped. Negating the boolean mask with
# `~` replaces the non-idiomatic `== False` comparison.
removed_columns.loc[~removed_columns['statement'], :]
from sklearn.feature_selection import SelectKBest, chi2

# Feature matrix and target used for the univariate chi-squared ranking.
X = numerical
y = targets['TARGET_B']

# Fit the selector once and reuse it for both the reduced matrix and the
# per-column scores (the same selector used to be fitted twice on the same
# data). k = 10 keeps the results easy to analyze later.
model = SelectKBest(chi2, k=10).fit(X, y)
kbest = model.transform(X)
selected = pd.DataFrame(kbest)
selected.head()

# One chi-squared score per input column, labelled with the column name.
df = pd.DataFrame(data=model.scores_, columns=['score'])
df['Column'] = X.columns

# Rank the columns by score once, then reuse the ranking for both views.
ranked = df.sort_values(by=['score'], ascending=False)
ranked.head(20)

# Names of the ten highest-scoring columns.
cols = ranked.head(10)['Column']
print(cols)
# Output of print(cols):
# 319    CONTROLN
# 144    IC5
# 87     HV1
# 88     HV2
# 3      TCODE
# 137    MSA
# 17     POP901
# 141    IC2
# 18     POP902
# 307    RAMNTALL
# Name: Column, dtype: object