7/29/2017 - 12:55 PM

## 使用K方检验选择最好的k个特征.py

1、目的

2、过滤型(Filter)

Pearson相关系数法：
import numpy as np
from scipy.stats import pearsonr

# Sample data so the snippet runs on its own: y is fully determined by x,
# but the dependence is not linear.
x = np.random.uniform(-1, 1, 10000)
y = x ** 2
# pearsonr returns the correlation coefficient together with a p-value.
print("pearsonr:", pearsonr(x, y))
缺点：只对线性关系敏感；对非线性关系，即使变量之间存在确定的函数关系（例如对称区间上的 y = x²），结果也可能接近0。

互信息和最大信息系数MIC：
import numpy as np
from minepy import MINE

# Draw the sample first, then build the estimator; the two steps are
# independent of each other.
x = np.random.uniform(-1, 1, 10000)
m = MINE()

# Score x against x**2 and print the resulting MIC value.
m.compute_score(x, x ** 2)
print(m.mic())

3、包裹型(Wrapper)

① 用全量特征跑一个模型
② 根据线性模型的系数，删掉5-10%的弱特征，观察准确率/auc的变化
③ 逐步进行，直至准确率/auc出现大的下滑停止

``````from sklearn.linear_model import RandomizedLasso

from sklearn.datasets import load_boston

# Using the Boston housing data.
# Data gets scaled automatically by sklearn's implementation.
# NOTE(review): RandomizedLasso and load_boston are not available in recent
# scikit-learn releases -- confirm the installed version still provides them.
boston = load_boston()
X = boston["data"]
Y = boston["target"]
names = boston["feature_names"]

rlasso = RandomizedLasso(alpha=0.025)
rlasso.fit(X, Y)

# Pair every rounded score with its feature name, highest score first.
print("Features sorted by their score:")
print(sorted(((round(score, 4), name)
              for score, name in zip(rlasso.scores_, names)),
             reverse=True))
#Features sorted by their score: [(1.0, ‘RM’), (1.0, ‘PTRATIO’), (1.0, ‘LSTAT’), (0.62, ‘CHAS’), (0.595, ‘B’), (0.39, ‘TAX’), (0.385, ‘CRIM’), (0.25, ‘DIS’), (0.22, ‘NOX’), (0.125, ‘INDUS’), (0.045, ‘ZN’), (0.02, ‘RAD’), (0.015, ‘AGE’)]

#稳定性选择对于克服过拟合和对数据理解来说都是有帮助的：好的特征不会因为有相似的特征、关联特征而得分为0，这跟Lasso不同。``````
``````from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
import numpy as np
# Load boston housing dataset as an example
boston = load_boston()
X = boston["data"]
Y = boston["target"]
names = boston["feature_names"]

rf = RandomForestRegressor()
rf.fit(X, Y)

# Pair every rounded importance with its feature name, highest first.
print("Features sorted by their score:")
print(sorted(((round(importance, 4), name)
              for importance, name in zip(rf.feature_importances_, names)),
             reverse=True))

#1、这种方法存在偏向，对具有更多类别的变量会更有利；
#2、对于存在关联的多个特征，其中任意一个都可以作为指示器（优秀的特征），
#   并且一旦某个特征被选择之后，其他特征的重要度就会急剧下降，因为不纯度已经被选中的那个特征降下来了;
#3、关联特征的打分存在不稳定的现象，这不仅仅是随机森林特有的，大多数基于模型的特征选择方法都存在这个问题。``````
``````from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
size = 100

#We run the method 10 times with different random seeds
for i in range(10):
print "Random seed %s" % i
np.random.seed(seed=i)
X_seed = np.random.normal(0, 1, size)
X1 = X_seed + np.random.normal(0, .1, size)
X2 = X_seed + np.random.normal(0, .1, size)
X3 = X_seed + np.random.normal(0, .1, size)
Y = X1 + X2 + X3 + np.random.normal(0, 1, size)
X = np.array([X1, X2, X3]).T

lr = LinearRegression()
lr.fit(X,Y)
print "Linear model:", pretty_print_linear(lr.coef_)

ridge = Ridge(alpha=10)
ridge.fit(X,Y)
print ("Ridge model:", pretty_print_linear(ridge.coef_))``````
``````from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler

# Standardize the feature matrix with StandardScaler before fitting.
# NOTE(review): `boston` is not defined in this snippet; presumably it is the
# dataset object returned by load_boston() -- confirm it is loaded beforehand.
scaler = StandardScaler()
X = scaler.fit_transform(boston["data"])
Y = boston["target"]
names = boston["feature_names"]

def pretty_print_linear(coefs, names=None, sort=False):
    """Format linear-model coefficients as a readable ``coef * name`` sum.

    Args:
        coefs: Sequence of numeric coefficients.
        names: Optional sequence of feature names aligned with ``coefs``.
            Defaults to ``X0, X1, ...``.
        sort: When true, order the terms by decreasing absolute coefficient.

    Returns:
        A string such as ``"1.0 * X0 + -2.5 * X1"``; every coefficient is
        rounded to 3 decimals.
    """
    # Use `is None`, not `== None`: with a NumPy array of names the `==`
    # comparison is element-wise and the result has no single truth value.
    if names is None:
        names = ["X%s" % x for x in range(len(coefs))]
    lst = zip(coefs, names)
    if sort:
        lst = sorted(lst, key=lambda x: -np.abs(x[0]))
    return " + ".join("%s * %s" % (round(coef, 3), name)
                      for coef, name in lst)

# Fit a Lasso model (alpha=0.3) on the X and Y prepared above.
lasso = Lasso(alpha=.3)
lasso.fit(X, Y)

# Print the fitted coefficients as a formula, largest magnitude first.
print ("Lasso model: ", pretty_print_linear(lasso.coef_, names, sort = True))

#Lasso model: -3.707 * LSTAT + 2.992 * RM + -1.757 * PTRATIO + -1.081 * DIS + -0.7 * NOX + 0.631 * B + 0.54 * CHAS + -0.236 * CRIM + 0.081 * ZN + -0.0 * INDUS + -0.0 * AGE + 0.0 * RAD + -0.0 * TAX
``````
``````from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

# Features, target and feature names of the Boston housing data.
X = boston["data"]
Y = boston["target"]
names = boston["feature_names"]

# Linear regression is the model driving the elimination. Asking for a
# single remaining feature makes RFE continue until every feature is ranked.
lr = LinearRegression()
rfe = RFE(lr, n_features_to_select=1)
rfe.fit(X, Y)

print("Features sorted by their rank:")
print(sorted((round(rank, 4), name)
             for rank, name in zip(rfe.ranking_, names)))
#[(1.0, 'NOX'), (2.0, 'RM'), (3.0, 'CHAS'), (4.0, 'PTRATIO'), (5.0, 'DIS'), (6.0, 'LSTAT'), (7.0, 'RAD'),
#(8.0, 'CRIM'), (9.0, 'INDUS'), (10.0, 'ZN'), (11.0, 'TAX'), (12.0, 'B'), (13.0, 'AGE')]``````
``````from sklearn.cross_validation import cross_val_score, ShuffleSplit
from sklearn.ensemble import RandomForestRegressor
#集成学习ensemble库中的随机森林回归RandomForestRegressor

# Load boston housing dataset as an example
# (load_boston is imported in the random-forest snippet earlier in this file).
boston = load_boston()
X = boston["data"]
Y = boston["target"]
names = boston["feature_names"]

# 20 trees, each limited to depth 4.
rf = RandomForestRegressor(n_estimators=20, max_depth=4)
scores = []
# Fit the model on each feature alone against the response and record the
# mean cross-validated R^2 for that feature.
for i in range(X.shape[1]):
    # NOTE(review): ShuffleSplit(len(X), 3, .3) looks like the legacy
    # sklearn.cross_validation signature (n, n_iter, test_size) -- confirm
    # against the installed scikit-learn version.
    score = cross_val_score(rf, X[:, i:i+1], Y, scoring="r2",
                            cv=ShuffleSplit(len(X), 3, .3))
    scores.append((round(np.mean(score), 3), names[i]))
# Rank the features by their score, best first.
print(sorted(scores, reverse=True))
#[(0.603, 'LSTAT'), (0.599, 'RM'), (0.426, 'NOX'), (0.352, 'INDUS'), (0.307, 'TAX'), (0.296, 'PTRATIO'), (0.206, 'ZN'), (0.196, 'CRIM'), (0.192, 'RAD'), (0.138, 'B'), (0.04, 'DIS'), (0.004, 'AGE'), (-0.034, 'CHAS')]``````
``````from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2