Beinsearch
7/29/2017 - 12:55 PM

Selecting the best k features with the chi-squared test (.py)

References:
http://blog.csdn.net/Bryan__/article/details/51607215
https://tianchi.aliyun.com/forum/new_articleDetail.html?spm=5176.8366600.0.0.2716311fCC1grl&raceId=231633&postsId=4043
https://tianchi.aliyun.com/forum/new_articleDetail.html?spm=5176.8366600.0.0.2716311fCC1grl&raceId=231633&postsId=4020

1. Purpose
Remove outliers and noise;
Resample imbalanced data;
Build combined features, e.g. polynomial terms or ratios such as X/Y (a short sketch follows this list);
Feature selection: Filter, Wrapper, Embedded
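
A minimal sketch of the feature-combination step, using scikit-learn's PolynomialFeatures on made-up data (the toy array is an assumption for illustration):

import numpy as np
from sklearn.preprocessing import PolynomialFeatures

#hypothetical toy data: two raw features per row
X = np.array([[2.0, 3.0],
              [4.0, 5.0]])
#degree-2 combinations: 1, x1, x2, x1^2, x1*x2, x2^2
print (PolynomialFeatures(degree=2).fit_transform(X))
#a hand-built ratio feature, X/Y style
print (X[:, 0] / X[:, 1])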

2. Filter
Score each feature on its own by how strongly it correlates with the target, rank the features, and keep the top-scoring subset.
Drawback: interactions between features are ignored, so a feature that is only useful in combination with others may be thrown away.
  Pearson correlation coefficient:
  import numpy as np
  from scipy.stats import pearsonr
  #toy data for illustration: y is a noisy linear function of x
  np.random.seed(0)
  x = np.random.uniform(-1, 1, 1000)
  y = x + np.random.normal(0, 0.1, 1000)
  print ("pearsonr:", pearsonr(x, y))  #returns (correlation, p-value)
  Drawback: for nonlinear relationships the coefficient can be near 0 even when y is an exact function of x (demonstrated below).
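
  A quick check of that drawback, on made-up data: y = x**2 is fully determined by x, yet the Pearson coefficient sees almost nothing.
  import numpy as np
  from scipy.stats import pearsonr
  np.random.seed(0)
  x = np.random.uniform(-1, 1, 10000)
  print ("pearsonr:", pearsonr(x, x**2)[0])  #close to 0 despite the exact dependence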
  
  Mutual information and the maximal information coefficient (MIC):
  import numpy as np
  from minepy import MINE
  m = MINE()
  x = np.random.uniform(-1, 1, 10000)
  m.compute_score(x, x**2)
  print (m.mic())  #close to 1: MIC catches the nonlinear y = x**2 that Pearson misses

3. Wrapper
Recursive feature elimination (RFE)
① Fit a model on the full feature set.
② Based on the linear model's coefficients, drop the weakest 5-10% of features and watch how accuracy/AUC changes.
③ Repeat step by step, stopping once accuracy/AUC drops sharply (a cross-validated sketch follows these steps).
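
scikit-learn's RFECV automates this loop, using cross-validation to choose when to stop. A minimal sketch on synthetic data (the make_classification setup and parameter choices are assumptions for illustration, not from the original):

from sklearn.datasets import make_classification
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

#synthetic data: 10 features, only 3 informative
X, y = make_classification(n_samples=500, n_features=10, n_informative=3,
                           random_state=0)
#drop one feature per round, score each subset by cross-validated AUC
rfecv = RFECV(LogisticRegression(max_iter=1000), step=1,
              cv=StratifiedKFold(5), scoring="roc_auc")
rfecv.fit(X, y)
print ("optimal number of features:", rfecv.n_features_)
print ("selected mask:", rfecv.support_)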

Model-based feature ranking
Use the machine-learning algorithm you actually intend to deploy, and build a predictive model on each individual feature against the response variable (the RandomForest + cross_val_score snippet further below does exactly this). Tree-based methods are easy to apply here; keep the tree depth small and use cross-validation.

Stability selection (RandomizedLasso), which the next snippet demonstrates:
from sklearn.linear_model import RandomizedLasso  #note: RandomizedLasso was removed in scikit-learn 0.21; this needs an older version
from sklearn.datasets import load_boston  #note: load_boston was removed in scikit-learn 1.2
boston = load_boston()

#using the Boston housing data.
#Data gets scaled automatically by sklearn's implementation
X = boston["data"]  
Y = boston["target"]  
names = boston["feature_names"]  
  
rlasso = RandomizedLasso(alpha=0.025)  
rlasso.fit(X, Y)  
  
print ("Features sorted by their score:")  
print (sorted(zip(map(lambda x: round(x, 4), rlasso.scores_),   
                 names), reverse=True))
#Features sorted by their score: [(1.0, 'RM'), (1.0, 'PTRATIO'), (1.0, 'LSTAT'), (0.62, 'CHAS'), (0.595, 'B'), (0.39, 'TAX'), (0.385, 'CRIM'), (0.25, 'DIS'), (0.22, 'NOX'), (0.125, 'INDUS'), (0.045, 'ZN'), (0.02, 'RAD'), (0.015, 'AGE')]

#Stability selection helps both against overfitting and with understanding the data: unlike plain Lasso, a good feature is not scored 0 just because similar, correlated features are present.

Random forest feature importances (mean decrease in impurity):
from sklearn.datasets import load_boston  
from sklearn.ensemble import RandomForestRegressor  
import numpy as np  
#Load boston housing dataset as an example  
boston = load_boston()  
X = boston["data"]  
Y = boston["target"]  
names = boston["feature_names"]  
rf = RandomForestRegressor()  
rf.fit(X, Y)  
print ("Features sorted by their score:")  
print (sorted(zip(map(lambda x: round(x, 4), rf.feature_importances_), names),   
             reverse=True)) 

#1. This method is biased: variables with more categories (higher cardinality) tend to look more important;
#2. When several features are correlated, any one of them can serve as the indicator (the "good" feature),
#   and once one is selected the importance of the rest drops sharply, because the impurity it could
#   have removed has already been removed by the chosen feature;
#3. The scores of correlated features are therefore unstable; this is not specific to random forests,
#   but affects most model-based feature selection methods (a sketch follows).
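
A minimal illustration of caveats #2 and #3 (the toy setup below is made up for this example): three near-identical copies of one signal end up splitting the importance between them.

import numpy as np
from sklearn.ensemble import RandomForestRegressor

np.random.seed(0)
size = 1000
X_seed = np.random.normal(0, 1, size)
#three strongly correlated copies of the same latent signal
X = np.array([X_seed + np.random.normal(0, .1, size) for _ in range(3)]).T
Y = X_seed + np.random.normal(0, 1, size)
rf = RandomForestRegressor(n_estimators=100, random_state=0).fit(X, Y)
print ("importances of three correlated copies:", rf.feature_importances_)
#roughly equal shares rather than one dominant feature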
Regularization helps here: Ridge (L2) spreads and stabilizes the coefficients of correlated features, while the plain linear model's coefficients swing wildly from seed to seed.
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge

def pretty_print_linear(coefs, names = None, sort = False):
    #format a linear model's coefficients as a readable equation
    if names is None:
        names = ["X%s" % x for x in range(len(coefs))]
    lst = zip(coefs, names)
    if sort:
        lst = sorted(lst, key = lambda x: -np.abs(x[0]))
    return " + ".join("%s * %s" % (round(coef, 3), name)
                      for coef, name in lst)

size = 100

#We run the method 10 times with different random seeds
for i in range(10):
    print ("Random seed %s" % i)
    np.random.seed(seed=i)
    X_seed = np.random.normal(0, 1, size)
    #three strongly correlated features built from the same latent signal
    X1 = X_seed + np.random.normal(0, .1, size)
    X2 = X_seed + np.random.normal(0, .1, size)
    X3 = X_seed + np.random.normal(0, .1, size)
    Y = X1 + X2 + X3 + np.random.normal(0, 1, size)
    X = np.array([X1, X2, X3]).T

    lr = LinearRegression()
    lr.fit(X, Y)
    print ("Linear model:", pretty_print_linear(lr.coef_))

    ridge = Ridge(alpha=10)
    ridge.fit(X, Y)
    print ("Ridge model:", pretty_print_linear(ridge.coef_))
L1 regularization (Lasso) yields sparse solutions: weak features get a coefficient of exactly zero.
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_boston
  
boston = load_boston()  
scaler = StandardScaler()  
X = scaler.fit_transform(boston["data"])  
Y = boston["target"]  
names = boston["feature_names"]  

#reuses pretty_print_linear, defined in the Ridge example above
  
lasso = Lasso(alpha=.3)  
lasso.fit(X, Y)  
  
print ("Lasso model: ", pretty_print_linear(lasso.coef_, names, sort = True))

#Lasso model: -3.707 * LSTAT + 2.992 * RM + -1.757 * PTRATIO + -1.081 * DIS + -0.7 * NOX + 0.631 * B + 0.54 * CHAS + -0.236 * CRIM + 0.081 * ZN + -0.0 * INDUS + -0.0 * AGE + 0.0 * RAD + -0.0 * TAX
Recursive feature elimination (RFE) with a linear model, ranking every feature:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.datasets import load_boston
  
boston = load_boston()  
X = boston["data"]  
Y = boston["target"]  
names = boston["feature_names"]  
  
#use linear regression as the model  
lr = LinearRegression()  
#rank all features, i.e continue the elimination until the last one  
rfe = RFE(lr, n_features_to_select=1)  
rfe.fit(X,Y)  
  
print ("Features sorted by their rank:")  
print (sorted(zip(map(lambda x: round(x, 4), rfe.ranking_), names)))  
#[(1.0, 'NOX'), (2.0, 'RM'), (3.0, 'CHAS'), (4.0, 'PTRATIO'), (5.0, 'DIS'), (6.0, 'LSTAT'), (7.0, 'RAD'), 
#(8.0, 'CRIM'), (9.0, 'INDUS'), (10.0, 'ZN'), (11.0, 'TAX'), (12.0, 'B'), (13.0, 'AGE')]
Model-based ranking in practice: fit a shallow, cross-validated random forest on each feature separately.
from sklearn.model_selection import cross_val_score, ShuffleSplit  #sklearn.cross_validation was removed in 0.20
from sklearn.datasets import load_boston  #Boston housing price prediction
from sklearn.ensemble import RandomForestRegressor  #random forest regressor from the ensemble module
import numpy as np

#Load boston housing dataset as an example
boston = load_boston()
X = boston["data"]
Y = boston["target"]
names = boston["feature_names"]

rf = RandomForestRegressor(n_estimators=20, max_depth=4)
#20 shallow trees of depth 4
scores = []
for i in range(X.shape[1]):  #model each single feature against the response variable
    score = cross_val_score(rf, X[:, i:i+1], Y, scoring="r2",
                            cv=ShuffleSplit(n_splits=3, test_size=0.3))
    scores.append((round(np.mean(score), 3), names[i]))
print (sorted(scores, reverse=True))  #features sorted by their score
#[(0.603, 'LSTAT'), (0.599, 'RM'), (0.426, 'NOX'), (0.352, 'INDUS'), (0.307, 'TAX'), (0.296, 'PTRATIO'), (0.206, 'ZN'), (0.196, 'CRIM'), (0.192, 'RAD'), (0.138, 'B'), (0.04, 'DIS'), (0.004, 'AGE'), (-0.034, 'CHAS')]
Finally, the chi-squared test from the title: SelectKBest with chi2 keeps the k highest-scoring features.
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
iris = load_iris()
X, y = iris.data, iris.target
print (X.shape)
#(150, 4)
X_new = SelectKBest(chi2, k=2).fit_transform(X, y)
print (X_new.shape)
#(150, 2)
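
To inspect which features were kept and their chi-squared scores (a small extension of the snippet above, using the iris feature names):

from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, chi2
iris = load_iris()
selector = SelectKBest(chi2, k=2).fit(iris.data, iris.target)
print ("chi2 scores:", selector.scores_)  #one score per feature
print ("kept features:", [n for n, keep in zip(iris.feature_names, selector.get_support()) if keep])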