Preprocessing 3: Continuous Values and Feature Selection

Preface

The dataset used in this post is available at the link below:

Click here

Code

Continuous Value Preprocessing

import pandas as pd

data = pd.read_csv("Narrativedata.csv"
                   ,index_col=0
                  ) #index_col=0 uses column 0 as the index; omit it and column 0 is treated as a feature
data.loc[:,"Age"] = data.loc[:,"Age"].fillna(data.loc[:,"Age"].median())

#binarize Age
data_2 = data.copy()
data_2.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Age       891 non-null    float64
 1   Sex       891 non-null    object 
 2   Embarked  889 non-null    object 
 3   Survived  891 non-null    object 
dtypes: float64(1), object(3)
memory usage: 34.8+ KB
from sklearn.preprocessing import Binarizer
X = data_2.iloc[:,0].values.reshape(-1,1)               #the class works on feature matrices only, so a 1-D array is not accepted
transformer = Binarizer(threshold=30).fit_transform(X)  #with threshold=n, values <= n become 0 and values > n become 1
data_2.iloc[:,0] = transformer
data_2.head()

Age Sex Embarked Survived
0 0.0 male S No
1 1.0 female C Yes
2 0.0 female S Yes
3 1.0 female S Yes
4 1.0 male S No
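
As a quick sanity check of the threshold semantics, Binarizer can be tried on a toy column first (a minimal sketch, separate from the pipeline above):

import numpy as np
from sklearn.preprocessing import Binarizer

toy = np.array([[10.],[30.],[31.],[80.]]) #2-D: one feature column, as the class requires
Binarizer(threshold=30).fit_transform(toy) #10 and 30 -> 0 (<= threshold), 31 and 80 -> 1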
from sklearn.preprocessing import KBinsDiscretizer
X = data.iloc[:,0].values.reshape(-1,1)
est = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')
est.fit_transform(X)
#inspect the bins after transformation: one column split into three bins
set(est.fit_transform(X).ravel())
{0.0, 1.0, 2.0}
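
The fitted discretizer also records the boundaries it chose; with strategy='uniform' they are equally spaced between the minimum and maximum age (a quick inspection of the est fitted above):

#bin_edges_ holds one array of boundaries per feature
est.bin_edges_[0]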
est = KBinsDiscretizer(n_bins=3, encode='onehot', strategy='uniform')

#inspect the bins after transformation: they became dummy variables
est.fit_transform(X).toarray()
array([[1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       ...,
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.]])
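
If the sparse-matrix detour through toarray() is unwanted, KBinsDiscretizer also accepts encode='onehot-dense', which produces the same dummy variables as a dense array directly (a minor variant of the cell above):

est_dense = KBinsDiscretizer(n_bins=3, encode='onehot-dense', strategy='uniform')
est_dense.fit_transform(X) #dense ndarray, no .toarray() needed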

Feature Selection

import pandas as pd
data = pd.read_csv("digit recognizor.csv")
X = data.iloc[:,1:]
y = data.iloc[:,0]
X.shape
(42000, 784)

Filter Method

Variance Filtering

"""
这个数据量相对夸张,如果使用支持向量机和神经网络,很可能会直接跑不出来。使用KNN跑一次大概需要半个小时。
用这个数据举例,能更够体现特征工程的重要性。
"""
from sklearn.feature_selection import VarianceThreshold
selector = VarianceThreshold() #instantiate; with no argument the default threshold is a variance of 0
X_var0 = selector.fit_transform(X) #new feature matrix with the failing (zero-variance) features removed
#equivalently: X = VarianceThreshold().fit_transform(X)
X_var0.shape
(42000, 708)
#Suppose we want to keep half of the features.
#Set a variance threshold that halves the feature count: take the median of the feature variances
#and pass that median as the value of the threshold parameter.
import numpy as np
#X.var() returns the variance of each column
X_fsvar = VarianceThreshold(np.median(X.var().values)).fit_transform(X)
#X.var().values
np.median(X.var().values)
1352.2867031797243
X_fsvar.shape
(42000, 392)
#If a feature is a Bernoulli random variable, assume p=0.8, i.e. drop binary features where one class covers more than 80% of the samples
X_bvar = VarianceThreshold(.8 * (1 - .8)).fit_transform(X)
X_bvar.shape
(42000, 685)
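
The 0.16 threshold is just the Bernoulli variance formula Var(X) = p(1-p) = 0.8 * 0.2; a toy binary column confirms it (a minimal check, not part of the pipeline):

import numpy as np
toy_binary = np.array([1]*8 + [0]*2) #one class covers exactly 80% of the samples
toy_binary.var() #0.16; a feature at or below this variance is dropped by the filter above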

Correlation Filtering

from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
#Combine this with feature_selection.SelectKBest, a class that takes a scoring function and keeps the K highest-scoring features
X_fschi = SelectKBest(chi2,k=300).fit_transform(X_fsvar,y) #scoring function: chi-squared (chi2)
X_fschi.shape
(42000, 300)
cross_val_score(RFC(n_estimators=10,random_state=0),X_fschi,y,cv=5).mean()
0.9344761904761905
#%%timeit is an IPython magic command that times the code in a cell.
#To get the timing it runs the cell many times (usually 7) and averages, so a %%timeit run takes far longer than a single run of the cell.
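
For example, the cross-validation above could be timed in a cell of its own (a sketch; %%timeit must be the first line of the cell, and an IPython/Jupyter environment is assumed):

%%timeit
cross_val_score(RFC(n_estimators=10,random_state=0),X_fschi,y,cv=5).mean()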
X = data.iloc[:,1:]
y = data.iloc[:,0]
X_fsvar = VarianceThreshold(np.median(X.var().values)).fit_transform(X)
#======【TIME WARNING: 5 mins】======#
%matplotlib inline
import matplotlib.pyplot as plt
score = []
for i in range(390,200,-10):
    X_fschi = SelectKBest(chi2, k=i).fit_transform(X_fsvar, y)
    once = cross_val_score(RFC(n_estimators=10,random_state=0),X_fschi,y,cv=5).mean()
    score.append(once)
plt.plot(range(390,200,-10),score) #the x range must match the loop range, or plot() raises a length-mismatch error
#plt.show()

The larger the chi-squared statistic, the better; a p-value below 0.05 means the feature is statistically related to the label.

chivalue, pvalues_chi = chi2(X_fsvar,y)
#chivalue
#pvalues_chi
#How should k be chosen? We want to drop every feature whose p-value exceeds a chosen significance level, e.g. 0.05 or 0.01:
k = chivalue.shape[0] - (pvalues_chi > 0.05).sum()
#X_fschi = SelectKBest(chi2, k=<the k computed above>).fit_transform(X_fsvar, y)
#cross_val_score(RFC(n_estimators=10,random_state=0),X_fschi,y,cv=5).mean()
k
392
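
The k computed here equals 392, the full column count of X_fsvar, so at the 0.05 level no feature would actually be dropped. The commented lines above can still be run with it directly (a minimal sketch using that k):

X_fschi = SelectKBest(chi2, k=k).fit_transform(X_fsvar, y)
cross_val_score(RFC(n_estimators=10,random_state=0),X_fschi,y,cv=5).mean()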
from sklearn.feature_selection import f_classif

F,pvalues_f = f_classif(X_fsvar,y)

#F
#pvalues_f
k = F.shape[0] - (pvalues_f > 0.05).sum()
k
392
#X_fsF = SelectKBest(f_classif, k=<the k computed above>).fit_transform(X_fsvar, y)
#cross_val_score(RFC(n_estimators=10,random_state=0),X_fsF,y,cv=5).mean()
from sklearn.feature_selection import mutual_info_classif as MIC
result = MIC(X_fsvar,y)
k = result.shape[0] - sum(result <= 0) #keep only features with mutual information above 0
k
392
#X_fsmic = SelectKBest(MIC, k=<the k computed above>).fit_transform(X_fsvar, y)
#cross_val_score(RFC(n_estimators=10,random_state=0),X_fsmic,y,cv=5).mean()

Embedded Method

from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier as RFC
RFC_ = RFC(n_estimators=10,random_state=0)
X_embedded = SelectFromModel(RFC_,threshold=0.005).fit_transform(X,y)
#Here I only want a small number of features. For data with roughly 780 features, 0.005 is a very high threshold,
#since on average each feature can only claim about 0.001 of feature_importances_
X_embedded.shape
(42000, 47)
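
The count of 47 can be cross-checked against the fitted importances directly (a quick verification sketch; SelectFromModel keeps features whose importance is at least the threshold):

#number of features whose importance reaches the 0.005 threshold; should match the 47 above
(RFC_.fit(X, y).feature_importances_ >= 0.005).sum()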
#The model's dimensionality has clearly been reduced
#As before, we can also draw a learning curve to find the best threshold
#======【TIME WARNING: 10 mins】======#
import numpy as np
import matplotlib.pyplot as plt
importances = RFC_.fit(X,y).feature_importances_ #fit once and reuse, rather than refitting just to take the max
threshold = np.linspace(0,importances.max(),20)
score = []
for i in threshold:
    X_embedded = SelectFromModel(RFC_,threshold=i).fit_transform(X,y)
    once = cross_val_score(RFC_,X_embedded,y,cv=5).mean()
    score.append(once)
plt.plot(threshold,score)
plt.show()

https://gitee.com/spiritlhl/picture/raw/master/output_35_0.png

#from the plot above, narrow down a suitable threshold range
X_embedded = SelectFromModel(RFC_,threshold=0.00067).fit_transform(X,y)
X_embedded.shape
(42000, 324)
cross_val_score(RFC_,X_embedded,y,cv=5).mean()
0.9391190476190475
#======【TIME WARNING: 10 mins】======#
#search within that range for a finer value
score2 = []
for i in np.linspace(0,0.00134,20):
    X_embedded = SelectFromModel(RFC_,threshold=i).fit_transform(X,y)
    once = cross_val_score(RFC_,X_embedded,y,cv=5).mean()
    score2.append(once)
plt.figure(figsize=[20,5])
plt.plot(np.linspace(0,0.00134,20),score2)
plt.xticks(np.linspace(0,0.00134,20))
plt.show()

https://gitee.com/spiritlhl/picture/raw/master/output_38_0.png

X_embedded = SelectFromModel(RFC_,threshold=0.000564).fit_transform(X,y)
X_embedded.shape
(42000, 340)
cross_val_score(RFC_,X_embedded,y,cv=5).mean()
0.9392857142857144
#=====【TIME WARNING: 2 min】=====#
#We may already have the best result under the current model. What if we also tune the random forest's parameters?
cross_val_score(RFC(n_estimators=100,random_state=0),X_embedded,y,cv=5).mean()
0.9634285714285715

Wrapper Method: Recursive Feature Elimination (RFE)

from sklearn.feature_selection import RFE
RFC_ = RFC(n_estimators =10,random_state=0)
selector = RFE(RFC_, n_features_to_select=340, step=50).fit(X, y)
selector.support_.sum()
340
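
support_ is a boolean mask over the original columns, so the selected columns can be listed by name (a small sketch, assuming X is still the DataFrame built from data.iloc[:,1:]):

#map the boolean mask back to the 340 column names RFE kept
selected_cols = X.columns[selector.support_]
selected_cols[:10] #peek at the first few selected pixel columns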
#selector.ranking_
X_wrapper = selector.transform(X)
cross_val_score(RFC_,X_wrapper,y,cv=5).mean()
0.9379761904761905
#======【TIME WARNING: 15 mins】======#
score = []
for i in range(1,751,50):
    X_wrapper = RFE(RFC_,n_features_to_select=i, step=50).fit_transform(X,y)
    once = cross_val_score(RFC_,X_wrapper,y,cv=5).mean()
    score.append(once)
plt.figure(figsize=[20,5])
plt.plot(range(1,751,50),score)
plt.xticks(range(1,751,50))
plt.show()

https://gitee.com/spiritlhl/picture/raw/master/output_47_0.png