🧩 Quick Introduction to torch.nn
torch.nn is PyTorch's core module for building neural networks. This section will help you understand its design philosophy and basic usage.
🤔 What is torch.nn?
torch.nn provides all the components you need to build a neural network:
import torch.nn as nn

# nn contains:
# - Layers: Linear, Conv2d, LSTM, Transformer...
# - Activation functions: ReLU, Sigmoid, Softmax...
# - Loss functions: CrossEntropyLoss, MSELoss...
# - Containers: Sequential, ModuleList, ModuleDict...
# - Utilities: Dropout, BatchNorm, Embedding...
📦 nn.Module: The Building Block of Neural Networks
Every neural network model inherits from nn.Module:
import torch
import torch.nn as nn

class MyNetwork(nn.Module):
    def __init__(self):
        super().__init__()  # must call the parent class constructor
        # define the network's layers here
        self.layer1 = nn.Linear(10, 20)
        self.layer2 = nn.Linear(20, 5)

    def forward(self, x):
        # define the forward pass
        x = self.layer1(x)
        x = torch.relu(x)
        x = self.layer2(x)
        return x

# create the model
model = MyNetwork()
print(model)
Why use nn.Module?
# nn.Module does a lot of work for you automatically:

# 1. Parameter management
for name, param in model.named_parameters():
    print(f"{name}: {param.shape}")

# 2. Device movement
model.to('cuda')  # moves all parameters to the GPU (requires a CUDA device)

# 3. Switching between training and evaluation mode
model.train()  # training mode
model.eval()   # evaluation mode

# 4. Saving / loading
torch.save(model.state_dict(), 'model.pth')
model.load_state_dict(torch.load('model.pth'))
🧱 Common Layers
Fully Connected Layer (Linear)
import torch
import torch.nn as nn

# create a fully connected layer: 10 input features, 5 output features
fc = nn.Linear(10, 5)

# internal parameters
print(f"Weight shape: {fc.weight.shape}")  # [5, 10]
print(f"Bias shape: {fc.bias.shape}")      # [5]

# usage
x = torch.randn(32, 10)  # 32 samples
y = fc(x)
print(f"Output shape: {y.shape}")  # [32, 5]
Convolutional Layer (Conv2d)
# 2D convolution: 1 input channel, 16 output channels, 3x3 kernel
conv = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, padding=1)
x = torch.randn(32, 1, 28, 28)  # 32 grayscale images of size 28x28
y = conv(x)
print(f"Output shape: {y.shape}")  # [32, 16, 28, 28]
Pooling Layers
# max pooling
maxpool = nn.MaxPool2d(kernel_size=2, stride=2)
x = torch.randn(32, 16, 28, 28)
y = maxpool(x)
print(f"Output shape: {y.shape}")  # [32, 16, 14, 14], spatial size halved

# average pooling
avgpool = nn.AvgPool2d(kernel_size=2)

# adaptive pooling (fixed output size)
adaptive_pool = nn.AdaptiveAvgPool2d((1, 1))  # output is 1x1
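A short sketch of why adaptive pooling is handy: the output size stays fixed no matter the input resolution, so a fully connected head placed after it never has to change (the shapes below are arbitrary examples):

```python
import torch
import torch.nn as nn

adaptive_pool = nn.AdaptiveAvgPool2d((1, 1))

for size in (28, 32, 57):                # different input resolutions
    x = torch.randn(8, 16, size, size)
    print(adaptive_pool(x).shape)        # always [8, 16, 1, 1]
```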
Recurrent Layers (LSTM/GRU)
# LSTM: 10-dim input, 20-dim hidden state, 2 layers
lstm = nn.LSTM(input_size=10, hidden_size=20, num_layers=2, batch_first=True)
x = torch.randn(32, 15, 10)  # 32 samples, 15 time steps, 10 features
output, (hidden, cell) = lstm(x)
print(f"Output shape: {output.shape}")   # [32, 15, 20]
print(f"Hidden state: {hidden.shape}")   # [2, 32, 20]
Transformer
# Transformer encoder layer
encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8)
transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=6)
x = torch.randn(10, 32, 512)  # 10 tokens, 32 samples, 512 dimensions
output = transformer_encoder(x)
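Note that with the default batch_first=False, the expected layout is (seq_len, batch, d_model), which is what the example above uses. A sketch of the batch_first=True variant (available in recent PyTorch versions), which keeps the batch dimension first like most other layers:

```python
import torch
import torch.nn as nn

encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8, batch_first=True)
transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=6)

x = torch.randn(32, 10, 512)   # (batch, seq_len, d_model)
output = transformer_encoder(x)
print(output.shape)            # [32, 10, 512]
```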
🎨 Activation Functions
import torch
import torch.nn as nn
import torch.nn.functional as F

x = torch.tensor([-2.0, -1.0, 0.0, 1.0, 2.0])

# Option 1: as a module
relu_module = nn.ReLU()
y = relu_module(x)

# Option 2: as a function (recommended)
y = F.relu(x)
y = torch.relu(x)  # equivalent

# common activation functions
F.relu(x)             # max(0, x)
torch.sigmoid(x)      # 1/(1+e^(-x))  (F.sigmoid is deprecated)
torch.tanh(x)         # hyperbolic tangent  (F.tanh is deprecated)
F.leaky_relu(x)       # leaky ReLU
F.gelu(x)             # Gaussian Error Linear Unit (common in Transformers)
F.softmax(x, dim=0)   # softmax
When should you use the module form vs. the function form?
# use modules in __init__
class MyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.relu = nn.ReLU()  # defined as a module

    def forward(self, x):
        # both styles work
        x = self.relu(x)  # module form
        x = F.relu(x)     # function form (more concise)
        return x

# Recommendation: use F.xxx for parameter-free activations.
# Layers with parameters (such as Linear) must be modules.
📊 Regularization and Normalization Layers
Dropout
dropout = nn.Dropout(p=0.5)  # drop each element with 50% probability

# during training, elements are randomly dropped
dropout.train()
y = dropout(x)  # some values become 0, the survivors are scaled by 1/(1-p)

# during evaluation, nothing is dropped
dropout.eval()
y = dropout(x)  # all values pass through unchanged (scaling already happened in training)
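A quick check of this behaviour: in training mode roughly half the values are zeroed and the survivors are scaled by 1/(1-p), while in eval mode the input passes through untouched. A minimal sketch:

```python
import torch
import torch.nn as nn

dropout = nn.Dropout(p=0.5)
x = torch.ones(8)

dropout.train()
print(dropout(x))                     # some zeros, survivors scaled to 2.0

dropout.eval()
print(torch.equal(dropout(x), x))     # True: identity in eval mode
```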
BatchNorm
# normalizes each feature across the batch
bn1d = nn.BatchNorm1d(num_features=100)  # for fully connected layers
bn2d = nn.BatchNorm2d(num_features=64)   # for convolutional layers

x = torch.randn(32, 64, 28, 28)  # convolutional feature maps
y = bn2d(x)

# BatchNorm also behaves differently in train vs. eval mode
bn2d.train()  # uses batch statistics
bn2d.eval()   # uses running statistics
LayerNorm
# LayerNorm: normalizes each sample independently (common in Transformers)
layer_norm = nn.LayerNorm(normalized_shape=512)
x = torch.randn(32, 10, 512)  # 32 samples, 10 tokens, 512 dimensions
y = layer_norm(x)
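A small sketch verifying that LayerNorm normalizes each sample/token vector over its 512 features independently (mean close to 0, standard deviation close to 1), rather than normalizing across the batch like BatchNorm:

```python
import torch
import torch.nn as nn

layer_norm = nn.LayerNorm(normalized_shape=512)
x = torch.randn(32, 10, 512)
y = layer_norm(x)

# every (sample, token) vector is normalized over its own 512 features
print(y.mean(dim=-1).abs().max())   # close to 0
print(y.std(dim=-1).mean())         # close to 1
```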
🔗 Containers: Organizing Layers
nn.Sequential
# the simplest option: stack layers in order
model = nn.Sequential(
    nn.Linear(784, 256),
    nn.ReLU(),
    nn.Dropout(0.5),
    nn.Linear(256, 128),
    nn.ReLU(),
    nn.Dropout(0.5),
    nn.Linear(128, 10)
)

# usage
x = torch.randn(32, 784)
y = model(x)  # passes through all layers in order
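Sequential also supports indexing into its layers, and building it from an OrderedDict gives each layer a readable name. A brief sketch:

```python
import torch.nn as nn
from collections import OrderedDict

named_model = nn.Sequential(OrderedDict([
    ('fc1', nn.Linear(784, 256)),
    ('relu1', nn.ReLU()),
    ('fc2', nn.Linear(256, 10)),
]))

print(named_model[0])      # access by position: the first Linear layer
print(named_model.fc2)     # access by name
```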
nn.ModuleList
# when you need a variable number of layers
class DynamicNet(nn.Module):
    def __init__(self, layer_sizes):
        super().__init__()
        self.layers = nn.ModuleList([
            nn.Linear(layer_sizes[i], layer_sizes[i+1])
            for i in range(len(layer_sizes)-1)
        ])

    def forward(self, x):
        for layer in self.layers[:-1]:
            x = F.relu(layer(x))
        x = self.layers[-1](x)  # no activation after the last layer
        return x

model = DynamicNet([784, 512, 256, 128, 10])
nn.ModuleDict
# organize layers in a dictionary
class MultiHeadModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.backbone = nn.Linear(784, 256)
        self.heads = nn.ModuleDict({
            'classification': nn.Linear(256, 10),
            'regression': nn.Linear(256, 1),
            'embedding': nn.Linear(256, 128)
        })

    def forward(self, x, task='classification'):
        x = F.relu(self.backbone(x))
        return self.heads[task](x)

model = MultiHeadModel()
x = torch.randn(32, 784)
y = model(x, task='classification')
⚠️ Why must you use ModuleList/ModuleDict?
# ❌ Wrong: a plain Python list does not register its parameters
self.layers = [nn.Linear(10, 10) for _ in range(3)]
# model.parameters() will not include these layers!

# ✅ Correct: use ModuleList
self.layers = nn.ModuleList([nn.Linear(10, 10) for _ in range(3)])
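A sketch that makes the difference visible: parameters held in a plain Python list are invisible to parameters(), while a ModuleList registers them properly (the two class names are just for illustration):

```python
import torch.nn as nn

class PlainList(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = [nn.Linear(10, 10) for _ in range(3)]                 # not registered

class WithModuleList(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = nn.ModuleList([nn.Linear(10, 10) for _ in range(3)])  # registered

print(len(list(PlainList().parameters())))       # 0
print(len(list(WithModuleList().parameters())))  # 6: weight + bias for each of 3 layers
```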
📉 Loss Functions
import torch.nn as nn

# classification losses
criterion = nn.CrossEntropyLoss()    # multi-class classification
criterion = nn.BCEWithLogitsLoss()   # binary classification
criterion = nn.NLLLoss()             # negative log-likelihood

# regression losses
criterion = nn.MSELoss()             # mean squared error
criterion = nn.L1Loss()              # mean absolute error
criterion = nn.SmoothL1Loss()        # Huber loss

# usage
predictions = model(x)
loss = criterion(predictions, targets)
CrossEntropyLoss in Detail
# CrossEntropyLoss = LogSoftmax + NLLLoss,
# so the model's output must NOT be passed through Softmax first!
logits = torch.randn(32, 10)            # model output (unnormalized)
labels = torch.randint(0, 10, (32,))    # class labels
criterion = nn.CrossEntropyLoss()
loss = criterion(logits, labels)

# equivalent to:
log_probs = F.log_softmax(logits, dim=1)
loss = F.nll_loss(log_probs, labels)
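A quick numerical check of this equivalence, using fresh random logits and labels like the snippet above:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

logits = torch.randn(32, 10)
labels = torch.randint(0, 10, (32,))

loss_ce = nn.CrossEntropyLoss()(logits, labels)
loss_manual = F.nll_loss(F.log_softmax(logits, dim=1), labels)
print(torch.allclose(loss_ce, loss_manual))      # True
```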
🔧 Parameter Initialization
import torch.nn as nn
import torch.nn.init as init

# Option 1: initialize a single layer
layer = nn.Linear(10, 5)
init.xavier_uniform_(layer.weight)
init.zeros_(layer.bias)

# Option 2: initialize the whole model
def init_weights(m):
    if isinstance(m, nn.Linear):
        init.xavier_uniform_(m.weight)
        init.zeros_(m.bias)
    elif isinstance(m, nn.Conv2d):
        init.kaiming_normal_(m.weight)

model.apply(init_weights)

# common initialization functions
init.zeros_(tensor)            # all zeros
init.ones_(tensor)             # all ones
init.constant_(tensor, val)    # constant value
init.normal_(tensor)           # normal distribution
init.uniform_(tensor)          # uniform distribution
init.xavier_uniform_(tensor)   # Xavier uniform
init.xavier_normal_(tensor)    # Xavier normal
init.kaiming_uniform_(tensor)  # Kaiming uniform
init.kaiming_normal_(tensor)   # Kaiming normal
🎯 Hands-On: Building a Complete CNN
import torch
import torch.nn as nn
import torch.nn.functional as F

class CNN(nn.Module):
    """A simple CNN for MNIST"""
    def __init__(self):
        super().__init__()
        # convolutional layers
        self.conv1 = nn.Conv2d(1, 32, 3, padding=1)   # 1 → 32 channels
        self.conv2 = nn.Conv2d(32, 64, 3, padding=1)  # 32 → 64 channels
        self.pool = nn.MaxPool2d(2, 2)                # 2x2 pooling
        # fully connected layers
        self.fc1 = nn.Linear(64 * 7 * 7, 128)
        self.fc2 = nn.Linear(128, 10)
        # dropout
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        # input: (batch, 1, 28, 28)
        # conv block 1
        x = self.conv1(x)           # (batch, 32, 28, 28)
        x = F.relu(x)
        x = self.pool(x)            # (batch, 32, 14, 14)
        # conv block 2
        x = self.conv2(x)           # (batch, 64, 14, 14)
        x = F.relu(x)
        x = self.pool(x)            # (batch, 64, 7, 7)
        # flatten
        x = x.view(-1, 64 * 7 * 7)  # (batch, 3136)
        # fully connected
        x = F.relu(self.fc1(x))     # (batch, 128)
        x = self.dropout(x)
        x = self.fc2(x)             # (batch, 10)
        return x

# create the model
model = CNN()

# test
x = torch.randn(32, 1, 28, 28)
y = model(x)
print(f"Output shape: {y.shape}")  # [32, 10]

# count parameters
total_params = sum(p.numel() for p in model.parameters())
print(f"Number of parameters: {total_params:,}")
🆚 nn.functional vs nn.Module
| Feature | nn.Module | nn.functional |
|---|---|---|
| Holds parameters | ✅ | ❌ |
| Needs instantiation | ✅ | ❌ |
| Saves state | ✅ | ❌ |
| Respects train/eval mode | ✅ | ❌ |

# layers with parameters → must use nn.Module
self.conv = nn.Conv2d(...)      # ✅
self.bn = nn.BatchNorm2d(...)   # ✅

# parameter-free operations → prefer nn.functional
x = F.relu(x)                   # ✅ more concise
x = F.max_pool2d(x, 2)          # ✅
x = F.softmax(x, dim=-1)        # ✅

# Dropout is special: both forms work, but mind the train/eval mode
x = F.dropout(x, training=self.training)  # must pass training manually
x = self.dropout(x)                       # handles train/eval automatically
🏋️ Exercises
# Exercise 1: build a multi-layer perceptron with nn.Sequential
# Architecture: 784 → 512 → 256 → 128 → 10
# Requirement: each hidden layer followed by ReLU and Dropout(0.3)
# Your code:

# Exercise 2: implement the same network by subclassing nn.Module
# Your code:

# Exercise 3: write a custom module with a residual connection
# Your code:
Click to show the answers
import torch
import torch.nn as nn
import torch.nn.functional as F

# Exercise 1: Sequential version
mlp_sequential = nn.Sequential(
    nn.Linear(784, 512),
    nn.ReLU(),
    nn.Dropout(0.3),
    nn.Linear(512, 256),
    nn.ReLU(),
    nn.Dropout(0.3),
    nn.Linear(256, 128),
    nn.ReLU(),
    nn.Dropout(0.3),
    nn.Linear(128, 10)
)

# Exercise 2: nn.Module version
class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(784, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 128)
        self.fc4 = nn.Linear(128, 10)
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        x = self.dropout(F.relu(self.fc1(x)))
        x = self.dropout(F.relu(self.fc2(x)))
        x = self.dropout(F.relu(self.fc3(x)))
        x = self.fc4(x)
        return x

# Exercise 3: residual block
class ResidualBlock(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.fc1 = nn.Linear(dim, dim)
        self.fc2 = nn.Linear(dim, dim)
        self.bn1 = nn.BatchNorm1d(dim)
        self.bn2 = nn.BatchNorm1d(dim)

    def forward(self, x):
        identity = x            # keep the input
        out = self.fc1(x)
        out = self.bn1(out)
        out = F.relu(out)
        out = self.fc2(out)
        out = self.bn2(out)
        out = out + identity    # residual connection
        out = F.relu(out)
        return out

# test
x = torch.randn(32, 256)
block = ResidualBlock(256)
y = block(x)
print(f"Input shape: {x.shape}, output shape: {y.shape}")
Next Steps
Now that you understand torch.nn, the next step is to learn how automatic differentiation works, which is the foundation of training neural networks!