✍️ MNIST Handwritten Digit Recognition
This is the "Hello World" project of deep learning! We will build a model that recognizes handwritten digits.
🎯 Project Goals
- Recognize the handwritten digits 0-9
- Reach at least 98% test accuracy
- Work through a complete deep learning training pipeline
📊 Dataset Overview
The MNIST dataset contains:
- 60,000 training images
- 10,000 test images
- Image size: 28×28 pixels
- Grayscale (single channel)
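If you want to verify these numbers yourself, here is a minimal check (an illustrative sketch, assuming torchvision is installed and using the same ./data directory as the code below):

from torchvision import datasets, transforms

train = datasets.MNIST('./data', train=True, download=True, transform=transforms.ToTensor())
test = datasets.MNIST('./data', train=False, download=True, transform=transforms.ToTensor())
print(len(train), len(test))    # 60000 10000
print(train[0][0].shape)        # torch.Size([1, 28, 28]) -- one channel, 28x28 pixels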
Sample:
┌──────────────┐
│    ▓▓▓▓▓▓    │
│      ▓▓▓▓    │
│        ▓▓    │
│        ▓▓    │
│        ▓▓    │  → label: 1
│        ▓▓    │
│        ▓▓    │
│        ▓▓    │
│    ▓▓▓▓▓▓    │
└──────────────┘
📝 Full Code
Approach 1: Fully Connected Network (Beginner)
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import matplotlib.pyplot as plt
# ==================== Config ====================
BATCH_SIZE = 64
EPOCHS = 10
LEARNING_RATE = 0.001
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"使用设备: {DEVICE}")
# ==================== Data loading ====================
# Preprocessing
transform = transforms.Compose([
    transforms.ToTensor(),                      # convert to tensor, scales [0, 255] to [0, 1]
    transforms.Normalize((0.1307,), (0.3081,))  # mean and std of MNIST
])
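# Note (illustrative, not part of the original recipe): 0.1307 and 0.3081 are the global mean
# and std of MNIST pixel values after ToTensor() scaling; they can be recomputed from the raw data:
#   raw = datasets.MNIST('./data', train=True, download=True, transform=transforms.ToTensor())
#   pixels = torch.stack([img for img, _ in raw])
#   print(pixels.mean().item(), pixels.std().item())  # ≈ 0.1307, 0.3081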
# Download and load the data
train_dataset = datasets.MNIST(
root='./data',
train=True,
download=True,
transform=transform
)
test_dataset = datasets.MNIST(
root='./data',
train=False,
transform=transform
)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)
print(f"训练集: {len(train_dataset)} 样本")
print(f"测试集: {len(test_dataset)} 样本")
# ==================== Visualize samples ====================
def show_samples(dataset, num=10):
    """Show sample images from the dataset."""
fig, axes = plt.subplots(1, num, figsize=(12, 2))
for i in range(num):
image, label = dataset[i]
axes[i].imshow(image.squeeze(), cmap='gray')
axes[i].set_title(f'{label}')
axes[i].axis('off')
plt.tight_layout()
plt.savefig('mnist_samples.png')
plt.show()
# show_samples(train_dataset)
# ==================== Model definition ====================
class MLP(nn.Module):
    """Multi-layer perceptron."""
def __init__(self):
super().__init__()
self.flatten = nn.Flatten()
self.layers = nn.Sequential(
nn.Linear(28*28, 512),
nn.ReLU(),
nn.Dropout(0.2),
nn.Linear(512, 256),
nn.ReLU(),
nn.Dropout(0.2),
nn.Linear(256, 10)
)
def forward(self, x):
x = self.flatten(x)
return self.layers(x)
model = MLP().to(DEVICE)
print(model)
# Count parameters
total_params = sum(p.numel() for p in model.parameters())
print(f"Total parameters: {total_params:,}")
# ==================== Training setup ====================
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
# ==================== Training function ====================
def train_epoch(model, loader, criterion, optimizer, device):
model.train()
total_loss = 0
correct = 0
total = 0
for batch_idx, (data, target) in enumerate(loader):
data, target = data.to(device), target.to(device)
optimizer.zero_grad()
output = model(data)
loss = criterion(output, target)
loss.backward()
optimizer.step()
total_loss += loss.item()
pred = output.argmax(dim=1)
correct += pred.eq(target).sum().item()
total += target.size(0)
return total_loss / len(loader), 100. * correct / total
# ==================== Test function ====================
def test(model, loader, criterion, device):
model.eval()
total_loss = 0
correct = 0
total = 0
with torch.no_grad():
for data, target in loader:
data, target = data.to(device), target.to(device)
output = model(data)
loss = criterion(output, target)
total_loss += loss.item()
pred = output.argmax(dim=1)
correct += pred.eq(target).sum().item()
total += target.size(0)
return total_loss / len(loader), 100. * correct / total
# ==================== Training loop ====================
history = {'train_loss': [], 'train_acc': [], 'test_loss': [], 'test_acc': []}
for epoch in range(EPOCHS):
train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, DEVICE)
test_loss, test_acc = test(model, test_loader, criterion, DEVICE)
history['train_loss'].append(train_loss)
history['train_acc'].append(train_acc)
history['test_loss'].append(test_loss)
history['test_acc'].append(test_acc)
print(f"Epoch {epoch+1}/{EPOCHS}")
print(f" Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%")
print(f" Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.2f}%")
print(f"\n最终测试准确率: {history['test_acc'][-1]:.2f}%")
# ==================== Save the model ====================
torch.save(model.state_dict(), 'mnist_mlp.pth')
print("Model saved to mnist_mlp.pth")
# ==================== Plot training history ====================
def plot_history(history):
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
    # Loss curves
axes[0].plot(history['train_loss'], label='Train')
axes[0].plot(history['test_loss'], label='Test')
axes[0].set_xlabel('Epoch')
axes[0].set_ylabel('Loss')
axes[0].set_title('Loss Curve')
axes[0].legend()
    # Accuracy curves
axes[1].plot(history['train_acc'], label='Train')
axes[1].plot(history['test_acc'], label='Test')
axes[1].set_xlabel('Epoch')
axes[1].set_ylabel('Accuracy (%)')
axes[1].set_title('Accuracy Curve')
axes[1].legend()
plt.tight_layout()
plt.savefig('training_history.png')
plt.show()
# plot_history(history)
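To reuse the trained weights later without retraining, a minimal sketch (assuming the MLP class and the test() helper above are still in scope):

model = MLP()
model.load_state_dict(torch.load('mnist_mlp.pth', map_location='cpu'))
model.eval()
test_loss, test_acc = test(model.to(DEVICE), test_loader, criterion, DEVICE)
print(f"Reloaded model test accuracy: {test_acc:.2f}%")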
Approach 2: CNN (Higher Accuracy)
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
# ==================== Config ====================
BATCH_SIZE = 128
EPOCHS = 15
LEARNING_RATE = 0.001
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# ==================== Data loading ====================
transform_train = transforms.Compose([
    transforms.RandomRotation(10),  # data augmentation: random rotation of up to ±10 degrees
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])
transform_test = transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])
train_dataset = datasets.MNIST('./data', train=True, download=True, transform=transform_train)
test_dataset = datasets.MNIST('./data', train=False, transform=transform_test)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)
# ==================== CNN model ====================
class CNN(nn.Module):
def __init__(self):
super().__init__()
        # Convolutional layers
self.conv_layers = nn.Sequential(
            # Block 1: 1 -> 32 channels
nn.Conv2d(1, 32, kernel_size=3, padding=1), # 28x28
nn.BatchNorm2d(32),
nn.ReLU(),
nn.Conv2d(32, 32, kernel_size=3, padding=1),
nn.BatchNorm2d(32),
nn.ReLU(),
nn.MaxPool2d(2), # 14x14
nn.Dropout(0.25),
            # Block 2: 32 -> 64 channels
nn.Conv2d(32, 64, kernel_size=3, padding=1),
nn.BatchNorm2d(64),
nn.ReLU(),
nn.Conv2d(64, 64, kernel_size=3, padding=1),
nn.BatchNorm2d(64),
nn.ReLU(),
nn.MaxPool2d(2), # 7x7
nn.Dropout(0.25),
)
        # Fully connected layers
self.fc_layers = nn.Sequential(
nn.Flatten(),
nn.Linear(64 * 7 * 7, 512),
nn.BatchNorm1d(512),
nn.ReLU(),
nn.Dropout(0.5),
nn.Linear(512, 10)
)
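        # Shape check (for reference): 28x28 -> MaxPool -> 14x14 -> MaxPool -> 7x7,
        # so the flattened feature map has 64 * 7 * 7 = 3136 values feeding the first Linear layer.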
def forward(self, x):
x = self.conv_layers(x)
x = self.fc_layers(x)
return x
model = CNN().to(DEVICE)
print(model)
# ==================== Training ====================
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)
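# Since scheduler.step() is called once per epoch, step_size=5 and gamma=0.5 halve the learning
# rate every 5 epochs: epochs 1-5 use 1e-3, epochs 6-10 use 5e-4, epochs 11-15 use 2.5e-4.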
best_acc = 0
for epoch in range(EPOCHS):
    # Train
model.train()
train_loss = 0
train_correct = 0
train_total = 0
for data, target in train_loader:
data, target = data.to(DEVICE), target.to(DEVICE)
optimizer.zero_grad()
output = model(data)
loss = criterion(output, target)
loss.backward()
optimizer.step()
train_loss += loss.item()
pred = output.argmax(dim=1)
train_correct += pred.eq(target).sum().item()
train_total += target.size(0)
    # Evaluate
model.eval()
test_correct = 0
test_total = 0
with torch.no_grad():
for data, target in test_loader:
data, target = data.to(DEVICE), target.to(DEVICE)
output = model(data)
pred = output.argmax(dim=1)
test_correct += pred.eq(target).sum().item()
test_total += target.size(0)
train_acc = 100. * train_correct / train_total
test_acc = 100. * test_correct / test_total
print(f"Epoch {epoch+1}/{EPOCHS}: "
f"Train Acc: {train_acc:.2f}%, Test Acc: {test_acc:.2f}%")
    # Save the best model
if test_acc > best_acc:
best_acc = test_acc
torch.save(model.state_dict(), 'mnist_cnn_best.pth')
scheduler.step()
print(f"\n最佳测试准确率: {best_acc:.2f}%")
🔍 Inference Example
import torch
from PIL import Image
import torchvision.transforms as transforms
from torchvision import datasets  # needed for the test-set predictions below
# Load the trained model (the CNN class from Approach 2 must be defined or imported here)
model = CNN()
model.load_state_dict(torch.load('mnist_cnn_best.pth', map_location='cpu'))
model.eval()
# Prediction function
def predict(image_tensor):
with torch.no_grad():
output = model(image_tensor.unsqueeze(0))
prob = torch.softmax(output, dim=1)
pred = output.argmax(dim=1).item()
confidence = prob[0][pred].item()
return pred, confidence
# Predict on the test set
test_dataset = datasets.MNIST('./data', train=False, transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
]))
# Predict the first 10 samples
for i in range(10):
    image, true_label = test_dataset[i]
    pred, conf = predict(image)
    print(f"Sample {i}: predicted={pred}, actual={true_label}, confidence={conf:.2%}")
📈 Optimization Tips
1. Improve accuracy
- Add more data augmentation (rotation, scaling, translation)
- Use a deeper network
- Use a learning rate schedule
- Train for more epochs
2. Speed up training (see the mixed-precision sketch after this list)
- Use a GPU
- Increase the batch size
- Use mixed-precision training
3. Reduce overfitting
- Increase Dropout
- Use early stopping
- Use regularization (weight decay)
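A minimal mixed-precision version of the training step above (an illustrative sketch using torch.cuda.amp, assuming a CUDA device; everything else stays the same):

scaler = torch.cuda.amp.GradScaler()       # scales the loss to avoid fp16 underflow

for data, target in train_loader:
    data, target = data.to(DEVICE), target.to(DEVICE)
    optimizer.zero_grad()
    with torch.cuda.amp.autocast():        # forward pass runs in mixed precision
        output = model(data)
        loss = criterion(output, target)
    scaler.scale(loss).backward()          # backward on the scaled loss
    scaler.step(optimizer)                 # unscales gradients, then optimizer.step()
    scaler.update()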
❓ FAQ
Q: Why is my accuracy only around 90%?
A: Check the following:
- Whether you applied input normalization
- Whether the learning rate is appropriate
- Whether you trained for enough epochs
Q: How do I recognize my own handwritten digits?
A: (see the sketch below)
- Take a photo or screenshot
- Convert it to grayscale and resize it to 28×28
- Normalize it and feed it to the model
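A minimal sketch of that pipeline, assuming a hypothetical file my_digit.png and reusing the predict() helper and CNN weights from the inference example above; note that MNIST digits are white on a black background, so a photo of dark ink on white paper usually needs to be inverted:

from PIL import Image, ImageOps
import torchvision.transforms as transforms

img = Image.open('my_digit.png').convert('L')  # grayscale
img = ImageOps.invert(img)                     # dark ink on white paper -> white digit on black
img = img.resize((28, 28))
to_model = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])
pred, conf = predict(to_model(img))            # reuses predict() from the inference example
print(f"Predicted digit: {pred} (confidence {conf:.2%})")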
Next Project
Ready? Let's take on a more complex image classifier!