Beinsearch
2/11/2018 - 6:18 AM

sklearn

# Decision tree
# http://blog.csdn.net/li980828298/article/details/51172744
# https://www.cnblogs.com/pinard/p/6056319.html
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

clf = DecisionTreeClassifier(criterion="entropy",
                             class_weight={0:1,1:2},
#                              max_features="log2",
                             max_depth=4, 
                             min_samples_split=200,
                             min_samples_leaf=50,
                            random_state=10) 

# Cross-validate to evaluate the classifier; the scoring metric here is the AUC (area under the ROC curve), and a larger AUC means a better classifier
scores = cross_val_score(clf, x_train, y_train, cv=3, scoring='roc_auc') 
print("ROC AUC Decision Tree: {:.4f} +/-{:.4f}".format(
    np.mean(scores), np.std(scores)))
clf.fit(x_train, y_train)
print("Score on test:{:.4f}".format(clf.score(x_test, y_test)))
from sklearn import tree
from IPython.display import Image
import pydotplus
dot_data = tree.export_graphviz(clf, out_file=None,
                                feature_names=df2.columns[:-2],
                                class_names=['fast', 'slow'],
                                filled=True, rounded=True,
                                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())
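
# A minimal alternative sketch, assuming scikit-learn >= 0.21: plot the fitted
# tree directly with matplotlib, with no graphviz/pydotplus dependency.
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(16, 8))
tree.plot_tree(clf, feature_names=list(df2.columns[:-2]),
               class_names=['fast', 'slow'], filled=True, rounded=True)
plt.show()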
import pandas
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data"
names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
dataframe = pandas.read_csv(url, delim_whitespace=True, names=names)
array = dataframe.values
X = array[:,0:13]
Y = array[:,13]
seed = 7
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
def build_model(model_name):
    model = model_name()
    return model
scoring = 'neg_mean_squared_error'
for model_name in [LinearRegression, Ridge, Lasso, ElasticNet, KNeighborsRegressor, DecisionTreeRegressor, SVR]:
    model = build_model(model_name)
    results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
    print(model_name.__name__, results.mean())
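
# A hedged refinement sketch: KNeighborsRegressor and SVR are sensitive to feature
# scale, so wrapping them in a Pipeline with StandardScaler usually gives a fairer
# comparison (a sketch assuming the same kfold and scoring defined above).
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
for model_name in [KNeighborsRegressor, SVR]:
    pipe = make_pipeline(StandardScaler(), model_name())
    results = model_selection.cross_val_score(pipe, X, Y, cv=kfold, scoring=scoring)
    print(model_name.__name__, 'scaled:', results.mean())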
# Linear regression
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
from sklearn.linear_model import LinearRegression
linreg = LinearRegression()
linreg.fit(x_train, y_train)
print(linreg.intercept_)
print(linreg.coef_)
y_pred = linreg.predict(x_test)
from sklearn import metrics
# Compute RMSE with scikit-learn
print("RMSE:", np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
# Plot predicted vs. measured values
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred)
ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.show()
# Descriptive statistics
houseprice['MSSubClass'].value_counts()         # count occurrences of each value in a column
print("Skewness: %f" % houseprice['MSSubClass'].skew())    # skewness of the data
print("Kurtosis: %f" % houseprice['MSSubClass'].kurt())    # kurtosis of the data
houseprice['LotFrontage'].corr(houseprice['LotArea'])      # correlation between two columns
houseprice['SqrtLotArea'] = np.sqrt(houseprice['LotArea']) # square root of a column, stored as a new column
houseprice[['MSSubClass', 'LotFrontage']].groupby(['MSSubClass'], as_index=False).mean()  # group by MSSubClass and compute the group means


# Standardization (Z-score)
from sklearn import preprocessing
scaler = preprocessing.StandardScaler().fit(X)
# scaler.mean_
# scaler.scale_
X_scaled = scaler.transform(X)

min_max_scaler = preprocessing.MinMaxScaler()
X_train_minmax = min_max_scaler.fit_transform(X_train)
X_test_minmax = min_max_scaler.transform(X_test)
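
# A small usage sketch (the (-1, 1) range is an illustrative assumption):
# MinMaxScaler accepts a custom output range, and inverse_transform maps
# scaled values back to the original units.
symmetric_scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
X_train_sym = symmetric_scaler.fit_transform(X_train)
X_train_restored = min_max_scaler.inverse_transform(X_train_minmax)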

# Normalization (scale each sample to unit norm)
normalizer = preprocessing.Normalizer().fit(X)
X_normalized = normalizer.transform(X)

# Categorical feature preprocessing
# 1. Encode categories as integers with LabelEncoder
from sklearn.preprocessing import LabelEncoder
class_le = LabelEncoder()
df['color'] = class_le.fit_transform(df['color'].values)

# 2. Map categories to integers with a dictionary
import numpy as np
class_mapping = {label: idx for idx, label in enumerate(np.unique(df['classlabel']))}
df['classlabel'] = df['classlabel'].map(class_mapping)

# 3. Create dummy (one-hot) features
# from sklearn.preprocessing import OneHotEncoder
pf = pd.get_dummies(df.color)
df = pd.concat([df, pf], axis=1)
df.drop(['color'], axis=1, inplace=True)
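
# A minimal sketch of the OneHotEncoder route hinted at by the commented import
# above, assuming scikit-learn >= 0.20 (which accepts string categories directly);
# it yields the same dummy columns as pd.get_dummies.
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
demo = pd.DataFrame({'color': ['green', 'red', 'blue']})
ohe = OneHotEncoder(sparse=False)
print(ohe.fit_transform(demo[['color']]))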

# Binarize a continuous feature
new_target = preprocessing.binarize(boston.target.reshape(-1, 1), threshold=boston.target.mean())
binarizer = preprocessing.Binarizer(threshold=boston.target.mean())
new_target = binarizer.fit_transform(boston.target.reshape(-1, 1))

# Polynomial features
from sklearn.preprocessing import PolynomialFeatures
quadratic_featurizer = PolynomialFeatures(degree=2)
train = quadratic_featurizer.fit_transform(train)
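
# A tiny worked example of what the degree-2 expansion produces: for an input
# row [a, b] the output columns are [1, a, b, a^2, a*b, b^2].
import numpy as np
demo = PolynomialFeatures(degree=2).fit_transform(np.array([[2.0, 3.0]]))
print(demo)  # [[1. 2. 3. 4. 6. 9.]]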

# Use a tree-based regressor to predict and fill in missing values
notnull = data[pd.notnull(data.Age)]
isnull = data[pd.isnull(data.Age)]
from sklearn.ensemble import GradientBoostingRegressor
G = GradientBoostingRegressor()
G.fit(notnull[col].values, notnull.Age)
# assign through .loc to avoid pandas chained-assignment pitfalls
data.loc[pd.isnull(data.Age), 'Age'] = G.predict(isnull[col])
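
# A simpler baseline sketch, assuming scikit-learn >= 0.20: plain mean imputation
# with SimpleImputer (older releases expose the same idea as sklearn.preprocessing.Imputer).
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')
data[['Age']] = imputer.fit_transform(data[['Age']])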

# Dimensionality reduction with PCA
from sklearn import decomposition
pca = decomposition.PCA(n_components=80)
train = pca.fit_transform(train)
pca.explained_variance_ratio_
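
# A small usage sketch: passing a float in (0, 1) as n_components tells PCA to
# keep just enough components to explain that fraction of the variance.
pca95 = decomposition.PCA(n_components=0.95)
train_reduced = pca95.fit_transform(train)
print(pca95.n_components_, pca95.explained_variance_ratio_.sum())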

# Dimensionality reduction with truncated SVD
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(80)
train = svd.fit_transform(train)
print(svd.explained_variance_ratio_) 

# Dimensionality reduction with factor analysis
from sklearn import decomposition
fa = decomposition.FactorAnalysis()
X_fa = fa.fit_transform(X)

# Dimensionality reduction with kernel PCA
kpca = decomposition.KernelPCA(kernel='cosine',n_components=1)
X_transformed = kpca.fit_transform(X)
# Timing a run
import time
# a = (2016, 12, 31, 23, 59, 59, 0, 0, 0)
# start = time.mktime(a)
start_time = time.time()
# do something
print("time spent:", time.time() - start_time)

# Random train/test split
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(train_user_product_info, labels, test_size=0.2, random_state=0)

# Cross-validation
from sklearn import svm
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
iris = load_iris()
clf = svm.SVC(kernel='linear', C=1)
scores = cross_val_score(clf, iris.data, iris.target, cv=5)
scores                                              
#array([ 0.96...,  1.  ...,  0.96...,  0.96...,  1.        ])
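
# A short usage sketch: summarize the per-fold scores as mean +/- two standard deviations.
print("Accuracy: {:.2f} (+/- {:.2f})".format(scores.mean(), scores.std() * 2))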

# Cross-validation where each fold roughly preserves the class distribution of the full data
from sklearn.model_selection import StratifiedKFold
n_splits = 3
skf = StratifiedKFold(n_splits)
for train_index, test_index in skf.split(train, label):
    x_train, x_test = train[train_index], train[test_index]
    y_train, y_test = label[train_index], label[test_index]

# Classification report
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
clf = LogisticRegression(solver='lbfgs', class_weight='balanced')  # penalty='l1' would require the liblinear or saga solver
clf.fit(x_train, y_train)  
y_pred = clf.predict(x_test)  
print(classification_report(y_test, y_pred, target_names = ['neg', 'pos']))  
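
# A complementary sketch: the raw confusion matrix behind the report above.
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_pred))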

# Precision-recall curve
from sklearn.metrics import precision_recall_curve
answer = clf.predict_proba(x_test)[:,1]  
precision, recall, thresholds = precision_recall_curve(y_test, answer)   
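
# A minimal plotting sketch for the curve computed above.
import matplotlib.pyplot as plt
plt.plot(recall, precision)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-recall curve')
plt.show()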

# Distribution of predicted vs. true values
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred)
ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.show()
# Bagging
from sklearn.ensemble import BaggingRegressor
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import ParameterGrid
from sklearn.utils import check_random_state
rng = check_random_state(0)
grid = ParameterGrid({"max_samples": [0.8, 0.9, 1.0],
                      "max_features": [0.5, 0.8, 1.0],
                      "bootstrap": [True, False],
                      "bootstrap_features": [True, False]})
for params in grid:
    BagR = BaggingRegressor(base_estimator=Lasso(alpha=1.0), random_state=rng, **params)
    BagR.fit(X_train, y_train)
    BagR_y_predict = BagR.predict(X_test)
    print(params, mean_squared_error(y_test, BagR_y_predict))
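
# An equivalent sketch using GridSearchCV, which evaluates the same grid with
# cross-validation instead of a single train/test split (cv=3 is an assumption).
from sklearn.model_selection import GridSearchCV
search = GridSearchCV(BaggingRegressor(base_estimator=Lasso(alpha=1.0), random_state=0),
                      param_grid={"max_samples": [0.8, 0.9, 1.0],
                                  "max_features": [0.5, 0.8, 1.0],
                                  "bootstrap": [True, False],
                                  "bootstrap_features": [True, False]},
                      scoring='neg_mean_squared_error', cv=3)
search.fit(X_train, y_train)
print(search.best_params_, -search.best_score_)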