目录

决策树1-分类器

目录

代码

1
2
3
4
5
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split     #分割训练集和测试集
from sklearn.neighbors import KNeighborsClassifier       #K近邻

简单案例

1
2
3
iris=datasets.load_iris()
iris_X = iris.data
iris_y=iris.target
1
iris_X[:2,:]#四个属性两朵花
array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2]])
1
iris_y#三类花
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
1
2
#分割训练集和测试集
X_train,X_test,y_train,y_test=train_test_split(iris_X,iris_y,test_size=0.3)#测试占30%
1
y_train#顺便打乱了数据
array([0, 0, 2, 1, 0, 0, 2, 0, 0, 2, 0, 2, 1, 1, 2, 1, 2, 2, 2, 1, 2, 1,
       1, 0, 0, 2, 1, 1, 2, 1, 2, 1, 0, 2, 2, 0, 1, 1, 1, 0, 2, 0, 1, 0,
       1, 2, 2, 1, 0, 1, 2, 1, 2, 0, 2, 1, 2, 1, 1, 0, 0, 2, 2, 0, 1, 2,
       1, 0, 0, 0, 2, 0, 0, 1, 2, 2, 2, 1, 2, 1, 0, 1, 1, 0, 1, 2, 0, 2,
       1, 2, 0, 1, 0, 0, 0, 1, 1, 0, 0, 2, 2, 2, 0, 2, 0])
1
2
knn = KNeighborsClassifier() #使用K近邻分类
knn.fit(X_train,y_train)     #输入测试集
KNeighborsClassifier()
1
knn.predict(X_test)#预测
array([2, 2, 2, 1, 0, 2, 0, 2, 2, 0, 0, 0, 1, 0, 0, 2, 1, 0, 1, 1, 0, 2,
       2, 1, 1, 1, 0, 1, 0, 2, 2, 2, 2, 0, 1, 0, 1, 2, 2, 0, 1, 0, 1, 1,
       1])
1
y_test
array([2, 2, 2, 1, 0, 1, 0, 2, 2, 0, 0, 0, 2, 0, 0, 2, 1, 0, 1, 1, 0, 2,
       2, 1, 1, 1, 0, 1, 0, 2, 1, 2, 2, 0, 1, 0, 1, 2, 2, 0, 1, 0, 1, 1,
       1])

模型导入训练

1
2
from sklearn import datasets #导入基础数据
from sklearn.linear_model import LinearRegression  #导入模型
1
2
3
4
#导入数据
loaded_data = datasets.load_boston()
data_X = loaded_data.data
data_y = loaded_data.target
1
2
3
#定义模型并训练
model = LinearRegression()#线性模型
model.fit(data_X,data_y)
LinearRegression()
1
2
#预测
model.predict(data_X[:4,:])
array([30.00384338, 25.02556238, 30.56759672, 28.60703649])
1
data_y[:4]
array([24. , 21.6, 34.7, 33.4])

import matplotlib.pyplot as plt

#创造数据

X,y = datasets.make_regression(n_samples=100,n_features=1,n_targets=1,noise=1)

#noise 离散程度

plt.scatter(X,y)#画点

1
2
#y=0.1x+0.3
#斜率 截距
1
model.coef_
array([-1.08011358e-01,  4.64204584e-02,  2.05586264e-02,  2.68673382e+00,
       -1.77666112e+01,  3.80986521e+00,  6.92224640e-04, -1.47556685e+00,
        3.06049479e-01, -1.23345939e-02, -9.52747232e-01,  9.31168327e-03,
       -5.24758378e-01])
1
model.intercept_
36.459488385090125
1
2
#输出之前模型定义的参数
model.get_params()
{'copy_X': True,
 'fit_intercept': True,
 'n_jobs': None,
 'normalize': False,
 'positive': False}
1
2
#打分评价
model.score(data_X,data_y)#R^2 coffiction of determination
0.7406426641094095

归一化

1
2
from sklearn import preprocessing
import numpy as np
1
2
3
4
a = np.array([[10,2.7,3.6],
             [-100,5,-2],
             [120,20,40]],dtype=np.float64)
a
array([[  10. ,    2.7,    3.6],
       [-100. ,    5. ,   -2. ],
       [ 120. ,   20. ,   40. ]])
1
2
#归一化
preprocessing.scale(a)
array([[ 0.        , -0.85170713, -0.55138018],
       [-1.22474487, -0.55187146, -0.852133  ],
       [ 1.22474487,  1.40357859,  1.40351318]])
1
2
#生成可以标准化的数据
from sklearn.datasets import make_classification
1
2
#支持向量机的分类器
from sklearn.svm import SVC
1
2
3
4
X,y = datasets.make_classification(n_samples=300,n_features=2,n_redundant=0,\
                                   n_informative=2,random_state=22,\
                                   n_clusters_per_class=1,\
                                   scale=100)
1
plt.scatter(X[:,0],X[:,1],c=y)
<matplotlib.collections.PathCollection at 0x7f1842f7faf0>

https://raw.githubusercontent.com/spiritLHL/tuchuang/master/output_31_1.png

1
2
3
4
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3)
clf = SVC()
clf.fit(X_train,y_train)
clf.score(X_test,y_test)
0.9333333333333333
1
2
3
4
5
6
7
8
9
#归一化
#X=preprocessing.scale(X,r)
#指定范围归一化
X=preprocessing.minmax_scale(X,feature_range=(-1,1))
#归一化后
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3)
clf = SVC()
clf.fit(X_train,y_train)
clf.score(X_test,y_test)
0.9555555555555556

交叉验证

1
# cross_validation 修改为 model_selection
1
2
3
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split     #分割训练集和测试集
from sklearn.neighbors import KNeighborsClassifier       #K近邻
1
2
3
iris = load_iris()
X=iris.data
y=iris.target
1
2
3
4
5
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=4)#随机数复现
knn=KNeighborsClassifier(n_neighbors=5)#通过多少近邻预测
knn.fit(X_test,y_test)
y_pred = knn.predict(X_test)
knn.score(X_test,y_test)
0.9736842105263158
1
2
3
4
from sklearn.model_selection import cross_val_score
knn = KNeighborsClassifier(n_neighbors=5)#按照这个看邻近个数
scores = cross_val_score(knn,X,y,cv=5,scoring='accuracy')#按准确度分5次
scores.mean()
0.9733333333333334
1
2
3
4
5
6
7
8
9
k_range = range(1,31)
k_scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)#按照这个看邻近个数
    scores = cross_val_score(knn,X,y,cv=10,scoring='accuracy')#按准确度分10次
    #loss = -cross_val_score(knn,X,y,cv=10,scoring='neg_mean_squared_error')#误差,线性
    #scores = cross_val_score(knn,X,y,cv=10,scoring='accuracy')#准确度
    #k_scores.append(loss.mean())#越小越好
    k_scores.append(scores.mean())#越大越好
1
2
3
4
5
#选取K
plt.plot(k_range,k_scores)
plt.xlabel('Value of K for KNN')
plt.ylabel('Cross-Validated Accurancy')
plt.show()

https://raw.githubusercontent.com/spiritLHL/tuchuang/master/output_41_0.png

处理过拟合

1
2
3
4
5
from sklearn.model_selection import learning_curve
from sklearn.datasets import load_digits #数字库
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import numpy as np
1
2
3
digits = load_digits()
X = digits.data
y = digits.target
1
2
3
4
5
6
7
train_sizes,train_loss,test_loss=learning_curve(SVC(gamma=0.001),X,y,cv=10,
                                               scoring='neg_mean_squared_error',
                                               train_sizes=[0.1,0.25,0.5,0.75,1]
                                               )
#train_sizes记录评分点 
train_loss_mean = -np.mean(train_loss,axis=1)
test_loss_mean = -np.mean(test_loss,axis=1)
1
2
3
4
5
6
plt.plot(train_sizes,train_loss_mean,'o-',color="r",label="Training")
plt.plot(train_sizes,test_loss_mean,'o-',color="g",label="Cross-validation")
plt.xlabel("Training example")
plt.ylabel("Loss")
plt.legend(loc="best")
plt.show
<function matplotlib.pyplot.show(close=None, block=None)>

https://raw.githubusercontent.com/spiritLHL/tuchuang/master/output_46_1.png

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
from sklearn.model_selection import validation_curve
from sklearn.datasets import load_digits #数字库
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import numpy as np
digits = load_digits()
X = digits.data
y = digits.target
param_range = np.logspace(-6,-2.3,5)#定义SVC的伽马范围
train_loss,test_loss=validation_curve(SVC(),X,y,param_name='gamma',
                                      param_range=param_range,cv=10,
                                      scoring='neg_mean_squared_error'
                                      )
#train_sizes记录评分点 
train_loss_mean = -np.mean(train_loss,axis=1)
test_loss_mean = -np.mean(test_loss,axis=1)
plt.plot(param_range,train_loss_mean,'o-',color="r",label="Training")
plt.plot(param_range,test_loss_mean,'o-',color="g",label="Cross-validation")
#验证曲线 选取合适gamma值 学习率
plt.xlabel("gamma")
plt.ylabel("Loss")
plt.legend(loc="best")
plt.show
<function matplotlib.pyplot.show(close=None, block=None)>

https://raw.githubusercontent.com/spiritLHL/tuchuang/master/output_47_1.png

保存model

1
2
3
4
5
6
from sklearn import svm
from sklearn import datasets
clf = svm.SVC()
iris = datasets.load_iris()
X,y = iris.data,iris.target
clf.fit(X,y)
SVC()
1
2
3
4
5
#method pickle
#import _pickle as cPickle
import pickle
with open('SAVE/clf.pickle','wb') as f:
    pickle.dump(clf,f)
1
2
with open('SAVE/clf.pickle','rb') as f:
    clf2 = pickle.load(f)
1
clf2.predict(X[0:1])
array([0])
1
2
3
4
# method joblib
import joblib
#Save
joblib.dump(clf,'SAVE/clf.pkl')
['SAVE/clf.pkl']
1
2
clf3 = joblib.load('SAVE/clf.pkl')
clf3.predict(X[0:1])
array([0])