SVM算法核心原理讲解
支持向量机(Support Vector Machine, SVM)是一种基于统计学习理论的监督学习算法,其核心思想是找到一个最优超平面,使得不同类别的样本点能够被最大程度地分开。
最大间隔原理
SVM的核心是最大间隔分类器。在二维空间中,这个"间隔"就是两条平行线之间的距离;在高维空间中,则是超平面之间的间隔。算法目标是找到使间隔最大化的决策边界。
# Minimal linear-SVM demo: fit a 5-point toy problem and draw the
# decision boundary together with the two margin lines.
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm

# Tiny linearly separable two-class data set.
X = np.array([[1, 2], [2, 3], [3, 3], [2, 1], [3, 2]])
y = np.array([1, 1, 1, -1, -1])

# Fit a linear-kernel classifier.
model = svm.SVC(kernel='linear')
model.fit(X, y)

# Plot the samples, colored by class.
plt.scatter(X[:, 0], X[:, 1], c=y, cmap='bwr')
axes = plt.gca()
x_range = axes.get_xlim()
y_range = axes.get_ylim()

# Evaluate the decision function on a 30x30 grid over the visible area.
grid_x = np.linspace(x_range[0], x_range[1], 30)
grid_y = np.linspace(y_range[0], y_range[1], 30)
mesh_y, mesh_x = np.meshgrid(grid_y, grid_x)
points = np.vstack([mesh_x.ravel(), mesh_y.ravel()]).T
decision = model.decision_function(points).reshape(mesh_x.shape)

# Solid line: decision boundary (f = 0); dashed lines: margins (f = ±1).
axes.contour(mesh_x, mesh_y, decision, colors='k', levels=[-1, 0, 1],
             alpha=0.5, linestyles=['--', '-', '--'])
plt.title('SVM最大间隔原理演示')
plt.show()
核函数技巧
当数据线性不可分时,SVM通过核函数将数据映射到更高维的空间,使其在新空间中线性可分。常用的核函数包括:
- 线性核:K(x, y) = x^T y
- 多项式核:K(x, y) = (γx^T y + r)^d
- RBF核:K(x, y) = exp(-γ||x - y||^2)
- Sigmoid核:K(x, y) = tanh(γx^T y + r)
sklearn.svm模块的主要类和函数介绍
scikit-learn提供了完整的SVM实现,主要包含以下核心类:
分类器类
| 类名 | 描述 | 适用场景 |
|---|---|---|
SVC | C-支持向量分类 | 通用分类任务 |
NuSVC | Nu-支持向量分类 | 需要控制支持向量数量 |
LinearSVC | 线性支持向量分类 | 大规模线性分类 |
回归器类
| 类名 | 描述 | 特点 |
|---|---|---|
SVR | Epsilon-支持向量回归 | 支持多种核函数 |
NuSVR | Nu-支持向量回归 | 可以控制支持向量数量 |
LinearSVR | 线性支持向量回归 | 大规模数据高效 |
核心参数详解
from sklearn.svm import SVC
# SVC主要参数
svm_clf = SVC(
C=1.0, # 惩罚参数,控制间隔与误分类的权衡
kernel='rbf', # 核函数类型
degree=3, # 多项式核的次数
gamma='scale', # 核函数系数
coef0=0.0, # 核函数独立项
shrinking=True, # 是否使用启发式收缩
probability=False, # 是否启用概率估计
tol=1e-3, # 停止准则的容差
cache_size=200, # 核函数缓存大小(MB)
class_weight=None, # 类别权重
verbose=False, # 是否启用详细输出
max_iter=-1, # 最大迭代次数,-1表示无限制
decision_function_shape='ovr' # 多分类策略
)
分类和回归任务的实战代码示例
二分类任务实战
让我们使用经典的鸢尾花数据集来演示SVM在二分类中的应用:
# Binary classification with SVM on the iris data set (classes 0 and 1 only).
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt

# Load iris and keep only the first two classes (drop class 2).
iris = datasets.load_iris()
X = iris.data[iris.target != 2]
y = iris.target[iris.target != 2]

# Standardize features -- SVMs are sensitive to feature scale.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Stratified 70/30 train/test split.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.3, random_state=42, stratify=y
)

# RBF-kernel SVM as the base estimator for the grid search.
svm_clf = SVC(kernel='rbf', random_state=42)

# Hyperparameter search space (C, gamma, kernel).
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 0.001, 0.01, 0.1, 1],
    'kernel': ['rbf', 'linear', 'poly']
}
grid_search = GridSearchCV(svm_clf, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)
print("最佳参数:", grid_search.best_params_)
print("最佳交叉验证分数:", grid_search.best_score_)

# Predict on the held-out set with the refitted best estimator.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Evaluation: per-class metrics and the confusion matrix.
print("\n分类报告:")
print(classification_report(y_test, y_pred))
print("\n混淆矩阵:")
print(confusion_matrix(y_test, y_pred))

# Visualize the decision boundary using only the first two features.
# NOTE(review): the best params were tuned on all four features; reusing
# them for the 2-feature model below is for illustration only.
X_2d = X_scaled[:, :2]
X_train_2d, X_test_2d, y_train_2d, y_test_2d = train_test_split(
    X_2d, y, test_size=0.3, random_state=42, stratify=y
)
best_model_2d = SVC(**grid_search.best_params_)
best_model_2d.fit(X_train_2d, y_train_2d)

# Dense prediction grid over the 2-D feature plane (step h).
h = 0.02
x_min, x_max = X_2d[:, 0].min() - 1, X_2d[:, 0].max() + 1
y_min, y_max = X_2d[:, 1].min() - 1, X_2d[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
Z = best_model_2d.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

# Filled contour = predicted class regions; scatter = actual samples.
plt.figure(figsize=(10, 8))
plt.contourf(xx, yy, Z, alpha=0.8, cmap=plt.cm.RdYlBu)
scatter = plt.scatter(X_2d[:, 0], X_2d[:, 1], c=y, cmap=plt.cm.RdYlBu, edgecolors='black')
plt.xlabel('Feature 1 (标准化后)')
plt.ylabel('Feature 2 (标准化后)')
plt.title('SVM二分类决策边界可视化')
plt.colorbar(scatter)
plt.show()
多分类任务实现
SVM天然支持多分类,常用的策略有一对一(OvO)和一对多(OvR):
# Multi-class SVM: compare OvR / OvO decision-function shapes and the
# explicit OneVsOneClassifier wrapper via 5-fold cross-validation.
from sklearn.svm import SVC
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
import numpy as np

# Synthetic 5-class data set for the comparison.
X, y = make_classification(n_samples=1000, n_features=20, n_classes=5,
                           n_informative=15, random_state=42)

# Native multi-class SVM (one-vs-rest decision function).
svm_ovr = SVC(kernel='rbf', decision_function_shape='ovr', random_state=42)
ovr_scores = cross_val_score(svm_ovr, X, y, cv=5)

# One-vs-one decision function.
svm_ovo = SVC(kernel='rbf', decision_function_shape='ovo', random_state=42)
ovo_scores = cross_val_score(svm_ovo, X, y, cv=5)

print(f"OvR策略平均准确率: {ovr_scores.mean():.3f} (+/- {ovr_scores.std() * 2:.3f})")
# Fixed: the original referenced an undefined name `novo_scores` here
# (typo for `ovo_scores`), which would raise NameError at runtime.
print(f"OvO策略平均准确率: {ovo_scores.mean():.3f} (+/- {ovo_scores.std() * 2:.3f})")

# Explicit OneVsOneClassifier wrapper around a binary SVC.
ovo_clf = OneVsOneClassifier(SVC(kernel='rbf', random_state=42))
ovo_scores_wrapper = cross_val_score(ovo_clf, X, y, cv=5)
print(f"OneVsOneClassifier包装器平均准确率: {ovo_scores_wrapper.mean():.3f}")
回归任务实战
SVM也可以用于回归任务,称为支持向量回归(SVR):
# Support vector regression (SVR) on a synthetic data set.
from sklearn.svm import SVR
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

# Synthetic regression data with mild noise.
X, y = make_regression(n_samples=1000, n_features=10, noise=0.1, random_state=42)

# Scale both features and the target (SVR is scale-sensitive).
scaler_X = StandardScaler()
scaler_y = StandardScaler()
X_scaled = scaler_X.fit_transform(X)
y_scaled = scaler_y.fit_transform(y.reshape(-1, 1)).ravel()

# 80/20 train/test split.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y_scaled, test_size=0.2, random_state=42
)

# RBF-kernel SVR as the base estimator.
svr = SVR(kernel='rbf')

# Hyperparameter grid: penalty C, epsilon-tube width, kernel width gamma.
param_grid = {
    'C': [0.1, 1, 10, 100],
    'epsilon': [0.01, 0.1, 0.5, 1.0],
    'gamma': ['scale', 0.001, 0.01, 0.1]
}
grid_search = GridSearchCV(svr, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)
print("最佳参数:", grid_search.best_params_)

# Predict on the held-out set with the best model.
best_svr = grid_search.best_estimator_
y_pred = best_svr.predict(X_test)

# Undo the target scaling so the metrics are on the original scale.
y_test_original = scaler_y.inverse_transform(y_test.reshape(-1, 1)).ravel()
y_pred_original = scaler_y.inverse_transform(y_pred.reshape(-1, 1)).ravel()
print(f"均方误差: {mean_squared_error(y_test_original, y_pred_original):.3f}")
print(f"R²分数: {r2_score(y_test_original, y_pred_original):.3f}")

# Predicted vs. true scatter; the dashed line marks perfect prediction.
plt.figure(figsize=(10, 6))
plt.scatter(y_test_original, y_pred_original, alpha=0.6)
plt.plot([y_test_original.min(), y_test_original.max()],
         [y_test_original.min(), y_test_original.max()], 'r--', lw=2)
plt.xlabel('真实值')
plt.ylabel('预测值')
plt.title('SVR回归预测结果')
plt.show()

# Inspect how many training points ended up as support vectors.
print(f"支持向量数量: {len(best_svr.support_)}")
print(f"支持向量比例: {len(best_svr.support_) / len(X_train):.3f}")
核函数选择与参数调优技巧
核函数选择策略
选择合适的核函数是SVM成功的关键。以下是基于数据特性的选择建议:
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import validation_curve
from sklearn.datasets import make_classification
import matplotlib.pyplot as plt
# 创建不同类型的数据集
def create_datasets():
    """Build one linearly separable and one circular (non-linear) data set.

    Returns:
        ((X_linear, y_linear), (X_circle, y_circle)) — two (features, labels)
        pairs, each with 300 samples and 2 features.
    """
    # Linearly separable problem.
    X_linear, y_linear = make_classification(n_samples=300, n_features=2,
                                             n_redundant=0, n_informative=2,
                                             n_clusters_per_class=1, random_state=42)
    # Non-linear problem: label depends on distance from the origin.
    np.random.seed(42)
    n_samples = 300
    X_circle = np.random.randn(n_samples, 2)
    # Vectorized replacement for the original per-sample loop: points
    # outside the unit circle get label 1.0, inside get 0.0 (same float
    # labels as the original np.zeros + assignment loop).
    y_circle = (X_circle[:, 0] ** 2 + X_circle[:, 1] ** 2 > 1).astype(float)
    return (X_linear, y_linear), (X_circle, y_circle)
(X_linear, y_linear), (X_circle, y_circle) = create_datasets()
# 测试不同核函数的性能
def compare_kernels(X, y, title):
    """Cross-validate four SVC kernels over a range of C and plot the curves.

    Args:
        X, y: feature matrix and labels.
        title: prefix for the plot title.

    Returns:
        dict mapping kernel name -> array of mean CV accuracies, one per C.
    """
    kernel_names = ['linear', 'poly', 'rbf', 'sigmoid']
    c_values = np.logspace(-3, 3, 7)
    mean_accuracy = {}
    for name in kernel_names:
        estimator = SVC(kernel=name, random_state=42)
        _, val_scores = validation_curve(
            estimator, X, y, param_name='C', param_range=c_values, cv=5
        )
        mean_accuracy[name] = val_scores.mean(axis=1)

    # One semilog accuracy-vs-C curve per kernel.
    plt.figure(figsize=(10, 6))
    for name in kernel_names:
        plt.semilogx(c_values, mean_accuracy[name], label=f'{name}核')
    plt.xlabel('C参数')
    plt.ylabel('交叉验证准确率')
    plt.title(f'{title} - 核函数性能比较')
    plt.legend()
    plt.grid(True)
    plt.show()
    return mean_accuracy
# Compare kernel performance on the linearly separable data.
linear_scores = compare_kernels(X_linear, y_linear, "线性可分数据")
# 比较非线性数据
circle_scores = compare_kernels(X_circle, y_circle, "非线性数据")
参数调优最佳实践
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVC
from scipy.stats import expon, reciprocal
import numpy as np
# 1. Grid search — exhaustive; best when the parameter space is small.
def grid_search_svm(X, y):
    """Exhaustively search kernel-specific SVC grids.

    Returns:
        (best_params, best_score) from 5-fold cross-validated accuracy.
    """
    search_space = [
        # Linear kernel: only C matters.
        {'kernel': ['linear'], 'C': [0.1, 1, 10, 100]},
        # RBF kernel: C and gamma.
        {'kernel': ['rbf'], 'C': [0.1, 1, 10, 100], 'gamma': [0.001, 0.01, 0.1, 1]},
        # Polynomial kernel: C, degree and gamma.
        {'kernel': ['poly'], 'C': [0.1, 1, 10], 'degree': [2, 3, 4], 'gamma': [0.001, 0.01]}
    ]
    searcher = GridSearchCV(SVC(random_state=42), search_space, cv=5,
                            scoring='accuracy', n_jobs=-1)
    searcher.fit(X, y)
    return searcher.best_params_, searcher.best_score_
# 2. Randomized search — samples the space; suits large parameter spaces.
def random_search_svm(X, y):
    """Randomly sample 100 SVC configurations.

    Returns:
        (best_params, best_score) from 5-fold cross-validation.
    """
    distributions = {
        'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
        'C': reciprocal(0.001, 1000),   # log-uniform over [0.001, 1000]
        'gamma': expon(scale=0.1),      # exponential prior around 0.1
        'degree': [2, 3, 4, 5]
    }
    searcher = RandomizedSearchCV(
        SVC(random_state=42), distributions, n_iter=100, cv=5,
        random_state=42, n_jobs=-1
    )
    searcher.fit(X, y)
    return searcher.best_params_, searcher.best_score_
# 3. Bayesian optimization (recommended) — requires scikit-optimize (skopt).
def bayesian_optimization_svm(X, y):
    """Tune an SVC with Bayesian optimization over a mixed search space.

    Returns (best_params, best_score) found by BayesSearchCV.
    """
    # Imported lazily so the module loads even without skopt installed.
    from skopt import BayesSearchCV
    from skopt.space import Real, Integer, Categorical
    search_space = {
        'kernel': Categorical(['linear', 'rbf', 'poly']),
        'C': Real(0.001, 1000, prior='log-uniform'),    # log-uniform penalty
        'gamma': Real(0.0001, 1, prior='log-uniform'),  # log-uniform kernel width
        'degree': Integer(2, 5)                         # polynomial degree
    }
    svm_clf = SVC(random_state=42)
    bayes_search = BayesSearchCV(
        svm_clf, search_space, n_iter=50, cv=5, random_state=42, n_jobs=-1
    )
    bayes_search.fit(X, y)
return bayes_search.best_params_, bayes_search.best_score_
特征缩放的重要性
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
import numpy as np
# Show how much feature scaling matters for an RBF-kernel SVM.
def demonstrate_scaling_importance():
    """Cross-validate an RBF SVC with several scalers on badly scaled data.

    Prints the mean accuracy per scaler and returns
    {scaler name: mean accuracy}.
    """
    # Features with wildly different scales: column 0 huge, column 1 tiny.
    np.random.seed(42)
    X = np.random.randn(1000, 3)
    X[:, 0] *= 1000
    X[:, 1] *= 0.01
    y = (X.sum(axis=1) > 0).astype(int)

    # Candidate preprocessors; None means no scaling at all.
    preprocessors = {
        '无缩放': None,
        'StandardScaler': StandardScaler(),
        'MinMaxScaler': MinMaxScaler(),
        'RobustScaler': RobustScaler()
    }

    mean_scores = {}
    for label, preprocessor in preprocessors.items():
        if preprocessor is None:
            estimator = SVC(kernel='rbf', random_state=42)
        else:
            estimator = Pipeline([
                ('scaler', preprocessor),
                ('svm', SVC(kernel='rbf', random_state=42))
            ])
        cv_scores = cross_val_score(estimator, X, y, cv=5)
        mean_scores[label] = cv_scores.mean()
        print(f"{label}: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")
    return mean_scores
scaling_results = demonstrate_scaling_importance()