Installation:
pip install -U scikit-learn
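A minimal check that the install worked: import the package and print its version.

import sklearn
print(sklearn.__version__)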
Data standardization
from sklearn import preprocessing
import numpy as np

a = np.array([[10, 2.7, 3.6],
              [-100, 5, -2],
              [120, 20, 40]], dtype=np.float64)
res = preprocessing.scale(a)  # standardize each column to zero mean and unit variance
print(res)
'''
[[ 0.         -0.85170713 -0.55138018]
 [-1.22474487 -0.55187146 -0.852133  ]
 [ 1.22474487  1.40357859  1.40351318]]
'''
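preprocessing.scale standardizes in one pass. When the same transformation has to be reused on new data later (e.g. a test set), StandardScaler is the usual tool; a minimal sketch, reusing the example array above split into two arbitrary parts:

from sklearn.preprocessing import StandardScaler
import numpy as np

X_train = np.array([[10, 2.7, 3.6], [-100, 5, -2]], dtype=np.float64)
X_test = np.array([[120, 20, 40]], dtype=np.float64)

scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)  # learn mean/std from the training data
X_test_s = scaler.transform(X_test)        # apply the same mean/std to the test data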
Train/test split
The split also shuffles the sample order.
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in newer versions
import numpy as np

a = np.arange(40).reshape(10, 4)
train, test = train_test_split(a, test_size=0.3)  # 30% of the rows go to the test set
print(train)
print(test)
'''
[[20 21 22 23]
 [28 29 30 31]
 [ 0  1  2  3]
 [ 4  5  6  7]
 [12 13 14 15]
 [16 17 18 19]
 [32 33 34 35]]
[[24 25 26 27]
 [36 37 38 39]
 [ 8  9 10 11]]
'''
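For reproducible experiments, train_test_split accepts random_state to fix the shuffle; with labeled data, stratify keeps the class ratio the same in both parts. A small sketch (the labels array is made up for illustration):

from sklearn.model_selection import train_test_split
import numpy as np

a = np.arange(40).reshape(10, 4)
labels = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])  # hypothetical labels, one per row
train, test, labels_train, labels_test = train_test_split(
    a, labels, test_size=0.3, random_state=0, stratify=labels)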
Generating test data
from sklearn.datasets import make_classification  # datasets.samples_generator is deprecated

X, y = make_classification()  # defaults: 100 samples, 20 features, 2 classes
X, y = make_classification(n_samples=100, n_features=3, n_classes=3,
                           n_redundant=0, n_informative=3, random_state=22,
                           n_clusters_per_class=2, scale=100)
print(X)
print(y)
'''
[[  76.25530311  104.61848707 -124.51537604]
 [  14.00181364  218.02679285 -189.2891996 ]
 [   9.52106591  146.5469016   -19.52563088]
 ...
 [  49.31656749   74.3968861   -82.10408259]
 [ 333.62879001  240.97683418 -262.34040171]
 [-263.15760407   29.60401483 -267.22306047]]
[1 0 0 1 0 0 0 2 1 0 1 1 0 2 0 0 2 1 1 0 1 0 0 2 2 2 1 0 1 0 2 2 0 2 1 2 2
 1 1 2 0 0 1 2 2 2 0 2 2 1 1 0 1 2 0 0 0 2 1 2 2 1 0 0 0 0 1 1 0 1 1 0 1 0
 1 1 2 1 0 2 2 2 2 0 1 2 0 1 0 0 0 1 0 0 2 1 0 2 1 1]
'''
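A quick sanity check on the generated data; np.bincount counts samples per class (with 3 classes the counts should be roughly balanced):

import numpy as np
print(X.shape, y.shape)  # (100, 3) (100,)
print(np.bincount(y))    # number of samples in each class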
Classification algorithms
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

# X, y come from the previous section
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

knn1 = KNeighborsClassifier()               # default n_neighbors=5
knn2 = KNeighborsClassifier(n_neighbors=7)

knn1.fit(X_train, y_train)
print(knn1.score(X_test, y_test))  # accuracy of the trained model on the test set
knn2.fit(X_train, y_train)
print(knn2.score(X_test, y_test))
print(knn2.predict(X_test[:2]))    # predict the first two test samples: [1 2]
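A quick way to compare several values of n_neighbors is to score each on the same split; a rough sketch (the cross-validation section below is the more reliable way to do this):

for k in range(1, 16, 2):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    print(k, knn.score(X_test, y_test))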
Classification demo
Run the classifiers directly on the raw data, then again after normalizing it.
from sklearn import preprocessing
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

X, y = make_classification()
print(X); print()
print(y); print()

# Raw data
X_train, X_test, y_train, y_test = train_test_split(X, y)
knn1 = KNeighborsClassifier()
knn1.fit(X_train, y_train)
r1_knn = knn1.score(X_test, y_test)
print('r1_knn: ', r1_knn)
clf1 = SVC()
clf1.fit(X_train, y_train)
r1_clf = clf1.score(X_test, y_test)
print('r1_clf: ', r1_clf)

# Same data, normalized to [-1, 1]
X_s = preprocessing.minmax_scale(X, feature_range=(-1, 1))
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(X_s, y)
knn2 = KNeighborsClassifier()
knn2.fit(X_train_s, y_train_s)
r2_knn = knn2.score(X_test_s, y_test_s)
print('r2_knn: ', r2_knn)
clf2 = SVC()
clf2.fit(X_train_s, y_train_s)
r2_clf = clf2.score(X_test_s, y_test_s)
print('r2_clf: ', r2_clf)
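Note that the demo scales the full dataset before splitting, so statistics from the test rows leak into the scaler. A leak-free sketch fits the scaler on the training portion only (MinMaxScaler is the estimator form of minmax_scale):

from sklearn.preprocessing import MinMaxScaler

X_train, X_test, y_train, y_test = train_test_split(X, y)
scaler = MinMaxScaler(feature_range=(-1, 1)).fit(X_train)  # statistics from training rows only
clf = SVC()
clf.fit(scaler.transform(X_train), y_train)
print(clf.score(scaler.transform(X_test), y_test))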
Cross-validation, which can also be used for model selection
Cross-validation automatically splits the original dataset: part of it serves as training data, the rest as test data.
The model is then trained on the training portion and its accuracy is scored on the test portion, and these steps are repeated several times. How the data is split and how scores are computed depend on the cross-validation parameters.

from sklearn.model_selection import cross_val_score  # sklearn.cross_validation was removed in newer versions
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=3000, n_features=5, n_classes=3,
                           n_redundant=0, n_informative=4, random_state=22,
                           n_clusters_per_class=2, scale=100)
# cv=10: 10 splits, each scoring a model trained on the training fold against its test fold
score_arr = cross_val_score(KNeighborsClassifier(), X, y, cv=10)
print(score_arr.mean())
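This is what makes cross-validation useful for model selection: candidate models get comparable mean scores on the same data. A sketch comparing several n_neighbors values with the X, y generated above:

for k in (3, 5, 7, 9):
    scores = cross_val_score(KNeighborsClassifier(n_neighbors=k), X, y, cv=10)
    print(k, scores.mean())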
Using a validation curve to help choose model parameters
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import validation_curve  # sklearn.learning_curve is deprecated
from sklearn.datasets import make_classification
from sklearn.svm import SVC

X, y = make_classification(n_samples=1000, n_features=5, n_classes=3,
                           n_redundant=0, n_informative=4, random_state=0,
                           n_clusters_per_class=2, scale=100)
# Evaluate how the SVC classifier performs on this dataset for different gamma values
param_name = 'gamma'
param_range = np.logspace(-8, -1, 8)  # [1.e-08 1.e-07 1.e-06 1.e-05 1.e-04 1.e-03 1.e-02 1.e-01]
train_score, test_score = validation_curve(SVC(), X, y, param_name=param_name,
                                           param_range=param_range, cv=7,
                                           n_jobs=4)  # n_jobs: parallel jobs, default 1
'''
For each gamma value, cross-validation computes training scores and test scores on X, y.
With cv=7 there are 7 scores per gamma value, for 8 gamma values in total:
[[0.34813084 0.34889148 0.3453909  0.34305718 0.34189032 0.34305718 0.35040745]
 [0.67640187 0.68494749 0.67677946 0.67911319 0.68494749 0.6907818  0.69383003]
 ...
 [1.         1.         1.         1.         1.         1.         1.        ]
 [1.         1.         1.         1.         1.         1.         1.        ]]
[[0.36111111 0.34265734 0.34265734 0.34265734 0.33566434 0.34265734 0.35460993]
 [0.70138889 0.67832168 0.6993007  0.67832168 0.6993007  0.67132867 0.65248227]
 ...
 [0.34027778 0.33566434 0.33566434 0.33566434 0.33566434 0.33566434 0.34042553]
 [0.34027778 0.33566434 0.33566434 0.33566434 0.33566434 0.33566434 0.34042553]]
'''
plt.figure()
plt.semilogx(param_range, train_score.mean(axis=1), label='train')       # mean CV training score per gamma
plt.semilogx(param_range, test_score.mean(axis=1), 'r--', label='test')  # mean CV test score per gamma
plt.xlabel(param_name)
plt.ylabel('score')
plt.ylim(0.3, 1.05)  # y-axis range
plt.title("validation_curve")
plt.legend()  # show the legend
plt.show()

The plot shows that as gamma grows, the model goes from underfitting to fitting well to overfitting (very high training score but low test score). The best gamma is where the test score peaks (the highest point of the red dashed line).
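To pick the best gamma programmatically instead of reading it off the plot, take the argmax of the mean test scores (a small sketch reusing the arrays computed above):

best_gamma = param_range[test_score.mean(axis=1).argmax()]
print(best_gamma)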
Plotting a learning curve
from sklearn.model_selection import learning_curve  # no difference found so far from the same function under sklearn.learning_curve

# Uses X, y, SVC and plt from the previous section.
# Fractions of the data to train on; e.g. 0.2 uses 20% of the samples of X
# for the (train/test) splitting, then cross-validates
train_sizes = [0.2, 0.4, 0.6, 0.8, 1]
train_size, train_scores, test_scores = learning_curve(SVC(gamma=0.00001), X, y, cv=7,
                                                       train_sizes=train_sizes)
# train_size: the absolute number of samples used at each step, not a percentage
plt.plot(train_size, train_scores.mean(1), 'o-', c='b', label='train')
plt.plot(train_size, test_scores.mean(1), '*--', c='r', label='test')
plt.xlabel('train_sizes')
plt.ylabel('score')
plt.title("learning_curve")
plt.legend(loc='best')
plt.show()
print(train_size)
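Reading the curve: two curves converging at a high score suggest that more data will not help much, while a persistent train/test gap points to overfitting. A quick numeric view of the same information (sketch):

for n, tr, te in zip(train_size, train_scores.mean(1), test_scores.mean(1)):
    print(n, round(tr, 3), round(te, 3))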
Saving and loading a model file
import joblib  # sklearn.externals.joblib was removed; joblib is now a standalone package

joblib.dump(knn1, 'd:/train100.knn')
knn = joblib.load('d:/train100.knn')
knn.predict(X[0:1])
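The standard-library pickle does the same job for most estimators; a sketch (the path is just an example mirroring the one above):

import pickle

with open('d:/train100.pkl', 'wb') as f:  # example path
    pickle.dump(knn1, f)
with open('d:/train100.pkl', 'rb') as f:
    knn = pickle.load(f)
print(knn.predict(X[0:1]))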
To get the list of an estimator's constructor parameters and their default values, use get_params():
knn = KNeighborsClassifier()
knn.get_params()
'''
{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}
'''
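The counterpart set_params changes parameters in place and returns the estimator, which is handy when reusing one object across experiments (sketch):

knn.set_params(n_neighbors=7)
print(knn.get_params()['n_neighbors'])  # 7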