# Variance Threshold


# Variance Threshold is a feature selector that removes low-variance features, which carry little information for modeling.
# It looks only at the features (X), not the desired outputs (y), and can thus be used for unsupervised learning.

# VARIANCE THRESHOLD EXAMPLE:

from sklearn.feature_selection import VarianceThreshold
from sklearn import preprocessing

# Single source of truth for the variance cut-off, so the selector and the
# manual check below can never drift apart.
VARIANCE_THRESHOLD = 1e-8

# fit() returns the fitted selector itself, so construction and fitting chain.
sel = VarianceThreshold(threshold=VARIANCE_THRESHOLD).fit(numerical2)
sel

# Transformed data, labelled with the names of the columns that were kept
# (transform() hands back a bare array, which would otherwise lose the names).
pd.DataFrame(sel.transform(numerical2),
             columns=numerical2.columns[sel.get_support()])

# Boolean mask: True where a column's variance exceeds the threshold.
sel.variances_ > VARIANCE_THRESHOLD

# One row per input column: its name, its variance, and whether the selector
# kept it ('statement' is True for kept columns, False for removed ones).
removed_columns = pd.DataFrame(data={'column_name': numerical2.columns,
                                     'variance': sel.variances_,
                                     'statement': sel.get_support()})

# Show only the columns the selector dropped.
removed_columns.loc[~removed_columns['statement'], :]

# Feature matrix and target used for univariate (chi-squared) feature selection.
# NOTE(review): chi2 expects non-negative feature values -- confirm that
# `numerical` satisfies this before relying on the scores.
X = numerical
y = targets['TARGET_B']

from sklearn.feature_selection import SelectKBest, chi2

# Fit the selector ONCE and reuse it for both the transformed data and the
# scores; the original fitted an identical selector twice on the same data.
# Here we chose 10 so that it is easier to analyze the results later.
model = SelectKBest(chi2, k=10).fit(X, y)
kbest = model.transform(X)

selected = pd.DataFrame(kbest)
selected.head()

# To check the scores: one chi-squared score per input column.
df = pd.DataFrame(data=model.scores_, columns=['score'])
df['Column'] = numerical.columns

# Sort once (highest score first) and reuse the ranking for both views below.
ranked = df.sort_values(by=['score'], ascending=False)
ranked.head(20)

# Names of the ten highest-scoring columns.
cols = ranked.head(10)['Column']
print(cols)

319    CONTROLN
144         IC5
87          HV1
88          HV2
3         TCODE
137         MSA
17       POP901
141         IC2
18       POP902
307    RAMNTALL
Name: Column, dtype: object