1. How SelectKBest works
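SelectKBest scores every feature with a scoring function (score_func) and keeps only the k highest-scoring columns. The idea fits in a few lines of NumPy; the sketch below (variable names are illustrative, not from the original) reproduces what SelectKBest(chi2, k=10) does internally:
import numpy as np
from sklearn import datasets
from sklearn.feature_selection import chi2

X, y = datasets.load_wine(return_X_y=True)
scores, pvalues = chi2(X, y)               # one chi-squared score per feature
top_k = np.sort(np.argsort(scores)[-10:])  # indices of the 10 best features, in column order
X_top = X[:, top_k]                        # (178, 10): the same columns SelectKBest keeps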
2. Code walkthrough
import numpy as np
from sklearn import datasets
from sklearn.feature_selection import SelectKBest
# score functions: each one implements a different
# statistical test / mathematical formula
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
X, y = datasets.load_wine(return_X_y=True)
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2,random_state = 911)
X.shape  # 13 features
(178, 13)
import warnings
warnings.filterwarnings('ignore')
lr = LogisticRegression()  # logistic regression as our go-to classifier
lr.fit(X_train,y_train)
lr.score(X_test,y_test)
1.0
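A perfect score on a single 80/20 split can be luck of the draw; as a sanity check (not in the original), 5-fold cross-validation on all 13 features gives a more robust estimate:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(LogisticRegression(max_iter=10000), X, y, cv=5)
print(scores.mean())  # mean accuracy across the 5 folds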
Now repeat the experiment, keeping only the 10 best features according to the chi-squared test:
sk = SelectKBest(score_func=chi2,k=10)
X2 = sk.fit_transform(X,y)
X_train,X_test,y_train,y_test = train_test_split(X2,y,test_size = 0.2,random_state = 911)
X2.shape  # (178, 10): the data after feature selection
lr = LogisticRegression()  # same classifier as before
lr.fit(X_train,y_train)
lr.score(X_test,y_test)
1.0
Conclusion: of the wine dataset's 13 features, 3 are unimportant: removing them leaves the test accuracy unchanged.
Why reduce the dimensionality at all?
Computing resources are limited: useless features still have to be processed, which adds overhead for no benefit.
Worse, uninformative features act as "impurities" (noise) that can degrade the model's results.
display(X[:3])
array([[1.423e+01, 1.710e+00, 2.430e+00, 1.560e+01, 1.270e+02, 2.800e+00,
3.060e+00, 2.800e-01, 2.290e+00, 5.640e+00, 1.040e+00, 3.920e+00,
1.065e+03],
[1.320e+01, 1.780e+00, 2.140e+00, 1.120e+01, 1.000e+02, 2.650e+00,
2.760e+00, 2.600e-01, 1.280e+00, 4.380e+00, 1.050e+00, 3.400e+00,
1.050e+03],
[1.316e+01, 2.360e+00, 2.670e+00, 1.860e+01, 1.010e+02, 2.800e+00,
3.240e+00, 3.000e-01, 2.810e+00, 5.680e+00, 1.030e+00, 3.170e+00,
1.185e+03]])
display(X2[:3])
array([[ 14.23, 1.71, 15.6 , 127. , 2.8 , 3.06, 2.29,
5.64, 3.92, 1065. ],
[ 13.2 , 1.78, 11.2 , 100. , 2.65, 2.76, 1.28,
4.38, 3.4 , 1050. ],
[ 13.16, 2.36, 18.6 , 101. , 2.8 , 3.24, 2.81,
5.68, 3.17, 1185. ]])
sp = sk.get_support()
X3 = X[:, sp]  # boolean-mask slicing on the columns
X3
array([[ 14.23, 1.71, 15.6 , ..., 5.64, 3.92, 1065. ],
[ 13.2 , 1.78, 11.2 , ..., 4.38, 3.4 , 1050. ],
[ 13.16, 2.36, 18.6 , ..., 5.68, 3.17, 1185. ],
...,
[ 13.27, 4.28, 20. , ..., 10.2 , 1.56, 835. ],
[ 13.17, 2.59, 20. , ..., 9.3 , 1.62, 840. ],
[ 14.13, 4.1 , 24.5 , ..., 9.2 , 1.6 , 560. ]])
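To see exactly which three features were discarded, map the support mask back to the dataset's feature names (load_wine exposes feature_names; this lookup is a sketch, not part of the original):
names = np.array(datasets.load_wine().feature_names)
print(names[sp])   # the 10 features chi2 kept
print(names[~sp])  # the 3 it dropped
Comparing the arrays printed above, the dropped columns are indices 2, 7, and 10, i.e. ash, nonflavanoid_phenols, and hue.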
You can also plug in your own score function; here we rank features by their variance. SelectKBest accepts a score function that returns either a plain array of scores (as below) or a (scores, p-values) pair, which is what chi2 returns.
# a custom score function: rank features by their variance
def variance(X, y):
    v = np.var(X, axis=0)
    return v
sk = SelectKBest(variance,k = 5)
sk.fit(X,y)
sk.transform(X)[:3]
array([[ 1.71, 15.6 , 127. , 5.64, 1065. ],
[ 1.78, 11.2 , 100. , 4.38, 1050. ],
[ 2.36, 18.6 , 101. , 5.68, 1185. ]])
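Which five columns does the variance ranking keep, and why? Raw variance is scale-dependent, so features measured in large units (proline, magnesium) dominate regardless of how informative they are. A quick look at the ranking (illustrative):
v = np.var(X, axis=0)
names = np.array(datasets.load_wine().feature_names)
for name, var in sorted(zip(names, v), key=lambda t: -t[1])[:6]:
    print(f'{name:30s}{var:12.2f}')  # top variances; proline is orders of magnitude ahead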
from sklearn.feature_selection import VarianceThreshold
vt = VarianceThreshold(1.24)
vt.fit_transform(X)[:3]
array([[ 1.71, 15.6 , 127. , 5.64, 1065. ],
[ 1.78, 11.2 , 100. , 4.38, 1050. ],
[ 2.36, 18.6 , 101. , 5.68, 1185. ]])
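The threshold 1.24 is not arbitrary: it sits between the 5th- and 6th-largest feature variances, so VarianceThreshold reproduces exactly the k=5 selection above (sklearn keeps features whose variance is strictly greater than the threshold). A quick verification sketch:
v = np.sort(np.var(X, axis=0))[::-1]     # variances, descending
print(v[4], v[5])                        # 1.24 lies between these two values
print((np.var(X, axis=0) > 1.24).sum())  # -> 5 features survive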