🖼️ CNN: Convolutional Neural Networks
CNNs (Convolutional Neural Networks) are the go-to tool for processing images and dominate the field of computer vision.
🤔 Why do we need CNNs?
Fully connected networks have two big problems when applied to images:
- Too many parameters: a 224×224×3 image has 150,528 input values; with 1,000 neurons in the first layer, that is already 150 million parameters!
- Lost spatial structure: flattening the image throws away the positional relationships between neighboring pixels
CNNs solve both problems with the convolution operation:
- Parameter sharing: the same kernel is slid across the entire image (see the parameter comparison below)
- Local connectivity: each neuron only looks at a small patch of the input
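To make the parameter comparison concrete, here is a minimal sketch (the layer sizes are illustrative, chosen to match the numbers above):

import torch.nn as nn

# Fully connected: every input value connects to every neuron
fc = nn.Linear(224 * 224 * 3, 1000)
print(sum(p.numel() for p in fc.parameters()))   # 150,529,000 (weights + biases)

# Convolutional: 16 kernels of size 3×3×3, shared across the whole image
conv = nn.Conv2d(3, 16, kernel_size=3)
print(sum(p.numel() for p in conv.parameters())) # 448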
📝 The convolution operation in detail
What is convolution?
Input (5×5)        Kernel (3×3)     Output (3×3)
┌───────────┐
│ 1 0 1 0 1 │      ┌───────┐       ┌───────┐
│ 0 1 0 1 0 │      │ 1 0 1 │       │ 5 0 5 │
│ 1 0 1 0 1 │  *   │ 0 1 0 │   =   │ 0 5 0 │
│ 0 1 0 1 0 │      │ 1 0 1 │       │ 5 0 5 │
│ 1 0 1 0 1 │      └───────┘       └───────┘
└───────────┘
The kernel slides over the image; at each position, it multiplies element-wise with the patch underneath and sums the result.
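You can verify the diagram with torch.nn.functional.conv2d (a minimal check; strictly speaking deep-learning "convolution" is cross-correlation, but that makes no difference for this symmetric kernel):

import torch
import torch.nn.functional as F

# The 5×5 input and 3×3 kernel from the diagram, shaped (batch, channels, height, width)
x = torch.tensor([[1., 0., 1., 0., 1.],
                  [0., 1., 0., 1., 0.],
                  [1., 0., 1., 0., 1.],
                  [0., 1., 0., 1., 0.],
                  [1., 0., 1., 0., 1.]]).reshape(1, 1, 5, 5)
k = torch.tensor([[1., 0., 1.],
                  [0., 1., 0.],
                  [1., 0., 1.]]).reshape(1, 1, 3, 3)

print(F.conv2d(x, k).reshape(3, 3))
# tensor([[5., 0., 5.],
#         [0., 5., 0.],
#         [5., 0., 5.]])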
Convolution layers in PyTorch
import torch
import torch.nn as nn

# Create a convolution layer
# Conv2d(input channels, output channels, kernel size)
conv = nn.Conv2d(
    in_channels=3,    # input channels (3 for RGB images)
    out_channels=16,  # output channels (number of kernels)
    kernel_size=3,    # kernel size (3×3)
    stride=1,         # stride
    padding=1         # padding
)

# Input: (batch, channel, height, width)
x = torch.randn(1, 3, 32, 32)  # one 3-channel 32×32 image
y = conv(x)
print(y.shape)  # torch.Size([1, 16, 32, 32])
Convolution parameters explained
# The effect of different parameter settings
import torch
import torch.nn as nn

# Input: 1×28×28
x = torch.randn(1, 1, 28, 28)

# 1. Basic convolution: output shrinks
conv1 = nn.Conv2d(1, 16, kernel_size=3)  # 28-3+1=26
y1 = conv1(x)
print(f"kernel=3, no padding: {y1.shape}")  # [1, 16, 26, 26]

# 2. Add padding: size stays the same
conv2 = nn.Conv2d(1, 16, kernel_size=3, padding=1)  # 28-3+2+1=28
y2 = conv2(x)
print(f"kernel=3, padding=1: {y2.shape}")  # [1, 16, 28, 28]

# 3. Add stride: output shrinks (downsampling)
conv3 = nn.Conv2d(1, 16, kernel_size=3, padding=1, stride=2)  # 28/2=14
y3 = conv3(x)
print(f"stride=2: {y3.shape}")  # [1, 16, 14, 14]
💡 Output size formula
output_size = (input_size - kernel_size + 2*padding) // stride + 1
(// is floor division, which matches PyTorch's behavior when the division is not exact)
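As a sanity check, here is a small helper that implements the formula (conv_output_size is our own name, not a PyTorch API):

def conv_output_size(input_size, kernel_size, padding=0, stride=1):
    """Spatial output size of a conv or pooling layer (floor division, as in PyTorch)."""
    return (input_size - kernel_size + 2 * padding) // stride + 1

print(conv_output_size(28, 3))                       # 26
print(conv_output_size(28, 3, padding=1))            # 28
print(conv_output_size(28, 3, padding=1, stride=2))  # 14

These reproduce the three output shapes from the example above.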
🔽 Pooling layers
Pooling downsamples the feature map, shrinking its spatial size:
import torch
import torch.nn as nn

# Max pooling
maxpool = nn.MaxPool2d(kernel_size=2, stride=2)
x = torch.tensor([[[[ 1.,  2.,  3.,  4.],
                    [ 5.,  6.,  7.,  8.],
                    [ 9., 10., 11., 12.],
                    [13., 14., 15., 16.]]]])
y = maxpool(x)
print(y)
# tensor([[[[ 6.,  8.],
#           [14., 16.]]]])

# Average pooling
avgpool = nn.AvgPool2d(kernel_size=2, stride=2)
y = avgpool(x)
print(y)
# tensor([[[[ 3.5000,  5.5000],
#           [11.5000, 13.5000]]]])
🏗️ Building a CNN
Classic structure
Input image
  ↓
[Conv → ReLU → Pool] × N    ← feature extraction
  ↓
Flatten
  ↓
[Linear → ReLU] × M         ← classifier
  ↓
Output
A simple CNN implementation
import torch
import torch.nn as nn

class SimpleCNN(nn.Module):
    def __init__(self, num_classes=10):
        super().__init__()
        # Feature extractor
        self.features = nn.Sequential(
            # Conv block 1: 1 → 32 channels
            nn.Conv2d(1, 32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(2),  # 28→14
            # Conv block 2: 32 → 64 channels
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(2),  # 14→7
            # Conv block 3: 64 → 128 channels
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(2),  # 7→3
        )
        # Classifier
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(128 * 3 * 3, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, num_classes)
        )

    def forward(self, x):
        x = self.features(x)
        x = self.classifier(x)
        return x

# Test
model = SimpleCNN()
x = torch.randn(1, 1, 28, 28)  # an MNIST-sized image
y = model(x)
print(f"Output shape: {y.shape}")  # [1, 10]
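Counting the parameters ties back to the motivation section: parameter sharing keeps the convolutional part cheap, and most of the weights end up in the first fully connected layer (continuing with the model above):

total = sum(p.numel() for p in model.parameters())
print(f"Total parameters: {total:,}")  # 390,858, about 75% of them in nn.Linear(128*3*3, 256)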
📊 Classic CNN architectures
LeNet (entry level)
import torch
import torch.nn as nn
import torch.nn.functional as F

class LeNet(nn.Module):
    """The classic LeNet-5, for handwritten digit recognition"""
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 6, 5)   # 28→24
        self.conv2 = nn.Conv2d(6, 16, 5)  # 12→8
        self.fc1 = nn.Linear(16 * 4 * 4, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.max_pool2d(x, 2)  # 24→12
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, 2)  # 8→4
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
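A quick shape check confirms the size annotations in the comments (note this LeNet variant takes 28×28 MNIST input rather than the original 32×32):

model = LeNet()
x = torch.randn(1, 1, 28, 28)
print(model(x).shape)  # torch.Size([1, 10])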
VGG style (deeper)
import torch.nn as nn

def make_vgg_block(in_channels, out_channels, num_convs):
    """Create a VGG-style convolution block"""
    layers = []
    for _ in range(num_convs):
        layers.append(nn.Conv2d(in_channels, out_channels, 3, padding=1))
        layers.append(nn.BatchNorm2d(out_channels))
        layers.append(nn.ReLU())
        in_channels = out_channels
    layers.append(nn.MaxPool2d(2))
    return nn.Sequential(*layers)

class VGGLike(nn.Module):
    def __init__(self, num_classes=10):
        super().__init__()
        self.features = nn.Sequential(
            make_vgg_block(3, 64, 2),     # output: 64×16×16 (for a 32×32 input)
            make_vgg_block(64, 128, 2),   # output: 128×8×8
            make_vgg_block(128, 256, 3),  # output: 256×4×4
        )
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(256 * 4 * 4, 512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, num_classes)
        )

    def forward(self, x):
        x = self.features(x)
        x = self.classifier(x)
        return x
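Again, a quick check of the feature-map sizes claimed in the comments, using a CIFAR-10-sized 32×32 input:

import torch

model = VGGLike()
x = torch.randn(1, 3, 32, 32)
print(model.features(x).shape)  # torch.Size([1, 256, 4, 4])
print(model(x).shape)           # torch.Size([1, 10])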
🔧 Data augmentation
Data augmentation is very important when training CNNs:
from torchvision import transforms

# Training-time augmentation
train_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),                     # random horizontal flip
    transforms.RandomRotation(10),                         # random rotation within ±10 degrees
    transforms.RandomAffine(0, translate=(0.1, 0.1)),      # random translation
    transforms.ColorJitter(brightness=0.2, contrast=0.2),  # color jitter
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

# No augmentation at test time
test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])
🎯 Full training example: CIFAR-10
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

# Device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Data
transform_train = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(32, padding=4),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])
train_dataset = datasets.CIFAR10('./data', train=True, download=True, transform=transform_train)
test_dataset = datasets.CIFAR10('./data', train=False, transform=transform_test)
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers=2)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False, num_workers=2)

# Model
class CIFAR10Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 32, 3, padding=1), nn.BatchNorm2d(32), nn.ReLU(),
            nn.Conv2d(32, 32, 3, padding=1), nn.BatchNorm2d(32), nn.ReLU(),
            nn.MaxPool2d(2),  # 32→16
            nn.Dropout(0.25),
            nn.Conv2d(32, 64, 3, padding=1), nn.BatchNorm2d(64), nn.ReLU(),
            nn.Conv2d(64, 64, 3, padding=1), nn.BatchNorm2d(64), nn.ReLU(),
            nn.MaxPool2d(2),  # 16→8
            nn.Dropout(0.25),
            nn.Conv2d(64, 128, 3, padding=1), nn.BatchNorm2d(128), nn.ReLU(),
            nn.Conv2d(128, 128, 3, padding=1), nn.BatchNorm2d(128), nn.ReLU(),
            nn.MaxPool2d(2),  # 8→4
            nn.Dropout(0.25),
        )
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(128 * 4 * 4, 512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, 10)
        )

    def forward(self, x):
        x = self.features(x)
        x = self.classifier(x)
        return x

model = CIFAR10Net().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)

# Training
for epoch in range(50):
    model.train()
    for data, labels in train_loader:
        data, labels = data.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(data)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    scheduler.step()

    # Evaluation
    model.eval()
    correct = total = 0
    with torch.no_grad():
        for data, labels in test_loader:
            data, labels = data.to(device), labels.to(device)
            outputs = model(data)
            _, predicted = outputs.max(1)
            correct += predicted.eq(labels).sum().item()
            total += labels.size(0)
    print(f"Epoch {epoch+1}: accuracy {100.*correct/total:.2f}%")
🏋️ Exercise
# Exercise: modify SimpleCNN so it handles CIFAR-10 (3-channel 32×32 images)
# Requirement: output scores (logits) for the 10 classes
# Your code:
Next steps
Now that you know CNNs, let's move on to RNNs, the recurrent networks for sequence data!