🧩 Quick Introduction to torch.nn
torch.nn is PyTorch's core module for building neural networks. This section will help you understand its design philosophy and basic usage.
🤔 What is torch.nn?
torch.nn provides all the components you need to build a neural network:
import torch.nn as nn

# nn contains:
# - Layers: Linear, Conv2d, LSTM, Transformer...
# - Activation functions: ReLU, Sigmoid, Softmax...
# - Loss functions: CrossEntropyLoss, MSELoss...
# - Containers: Sequential, ModuleList, ModuleDict...
# - Utilities: Dropout, BatchNorm, Embedding...
📦 nn.Module: The Building Block of Neural Networks
Every neural network model inherits from nn.Module:
import torch
import torch.nn as nn

class MyNetwork(nn.Module):
    def __init__(self):
        super().__init__()  # must call the parent class constructor
        # define the network's layers here
        self.layer1 = nn.Linear(10, 20)
        self.layer2 = nn.Linear(20, 5)

    def forward(self, x):
        # define the forward pass
        x = self.layer1(x)
        x = torch.relu(x)
        x = self.layer2(x)
        return x

# create the model
model = MyNetwork()
print(model)
Why use nn.Module?
# nn.Module does a lot of work for you automatically:

# 1. Parameter management
for name, param in model.named_parameters():
    print(f"{name}: {param.shape}")

# 2. Device movement
model.to('cuda')  # moves all parameters to the GPU (requires a CUDA device)

# 3. Switching between training and evaluation mode
model.train()  # training mode
model.eval()   # evaluation mode

# 4. Saving / loading
torch.save(model.state_dict(), 'model.pth')
model.load_state_dict(torch.load('model.pth'))
🧱 Common Layers
Fully Connected Layer (Linear)
import torch
import torch.nn as nn

# create a fully connected layer: 10 input features, 5 output features
fc = nn.Linear(10, 5)

# internal parameters
print(f"Weight shape: {fc.weight.shape}")  # [5, 10]
print(f"Bias shape: {fc.bias.shape}")      # [5]

# usage
x = torch.randn(32, 10)  # 32 samples
y = fc(x)
print(f"Output shape: {y.shape}")  # [32, 5]
Convolutional Layer (Conv2d)
# 2D convolution: 1 input channel, 16 output channels, 3x3 kernel
conv = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, padding=1)
x = torch.randn(32, 1, 28, 28)  # 32 grayscale images of size 28x28
y = conv(x)
print(f"Output shape: {y.shape}")  # [32, 16, 28, 28]
Pooling Layers
# max pooling
maxpool = nn.MaxPool2d(kernel_size=2, stride=2)
x = torch.randn(32, 16, 28, 28)
y = maxpool(x)
print(f"Output shape: {y.shape}")  # [32, 16, 14, 14], spatial size halved

# average pooling
avgpool = nn.AvgPool2d(kernel_size=2)

# adaptive pooling (fixed output size)
adaptive_pool = nn.AdaptiveAvgPool2d((1, 1))  # output is 1x1
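A short sketch of why adaptive pooling is handy: the output size stays fixed no matter the input resolution, so a fully connected head placed after it never has to change (the shapes below are arbitrary examples):

```python
import torch
import torch.nn as nn

adaptive_pool = nn.AdaptiveAvgPool2d((1, 1))

for size in (28, 32, 57):                # different input resolutions
    x = torch.randn(8, 16, size, size)
    print(adaptive_pool(x).shape)        # always [8, 16, 1, 1]
```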
Recurrent Layers (LSTM/GRU)
# LSTM: 10-dim input, 20-dim hidden state, 2 layers
lstm = nn.LSTM(input_size=10, hidden_size=20, num_layers=2, batch_first=True)
x = torch.randn(32, 15, 10)  # 32 samples, 15 time steps, 10 features
output, (hidden, cell) = lstm(x)
print(f"Output shape: {output.shape}")   # [32, 15, 20]
print(f"Hidden state: {hidden.shape}")   # [2, 32, 20]
Transformer
# Transformer encoder layer
encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8)
transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=6)
x = torch.randn(10, 32, 512)  # 10 tokens, 32 samples, 512 dimensions
output = transformer_encoder(x)
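Note that with the default batch_first=False, the expected layout is (seq_len, batch, d_model), which is what the example above uses. A sketch of the batch_first=True variant (available in recent PyTorch versions), which keeps the batch dimension first like most other layers:

```python
import torch
import torch.nn as nn

encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8, batch_first=True)
transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=6)

x = torch.randn(32, 10, 512)   # (batch, seq_len, d_model)
output = transformer_encoder(x)
print(output.shape)            # [32, 10, 512]
```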
🎨 Activation Functions
import torch
import torch.nn as nn
import torch.nn.functional as F

x = torch.tensor([-2.0, -1.0, 0.0, 1.0, 2.0])

# Option 1: as a module
relu_module = nn.ReLU()
y = relu_module(x)

# Option 2: as a function (recommended)
y = F.relu(x)
y = torch.relu(x)  # equivalent

# common activation functions
F.relu(x)             # max(0, x)
torch.sigmoid(x)      # 1/(1+e^(-x))  (F.sigmoid is deprecated)
torch.tanh(x)         # hyperbolic tangent  (F.tanh is deprecated)
F.leaky_relu(x)       # leaky ReLU
F.gelu(x)             # Gaussian Error Linear Unit (common in Transformers)
F.softmax(x, dim=0)   # softmax
When should you use the module form vs. the function form?
# use modules in __init__
class MyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.relu = nn.ReLU()  # defined as a module

    def forward(self, x):
        # both styles work
        x = self.relu(x)  # module form
        x = F.relu(x)     # function form (more concise)
        return x

# Recommendation: use F.xxx for parameter-free activations.
# Layers with parameters (such as Linear) must be modules.
📊 Regularization and Normalization Layers
Dropout
dropout = nn.Dropout(p=0.5)  # drop each element with 50% probability

# during training, elements are randomly dropped
dropout.train()
y = dropout(x)  # some values become 0, the survivors are scaled by 1/(1-p)

# during evaluation, nothing is dropped
dropout.eval()
y = dropout(x)  # all values pass through unchanged (scaling already happened in training)
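A quick check of this behaviour: in training mode roughly half the values are zeroed and the survivors are scaled by 1/(1-p), while in eval mode the input passes through untouched. A minimal sketch:

```python
import torch
import torch.nn as nn

dropout = nn.Dropout(p=0.5)
x = torch.ones(8)

dropout.train()
print(dropout(x))                     # some zeros, survivors scaled to 2.0

dropout.eval()
print(torch.equal(dropout(x), x))     # True: identity in eval mode
```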
BatchNorm
# normalizes each feature across the batch
bn1d = nn.BatchNorm1d(num_features=100)  # for fully connected layers
bn2d = nn.BatchNorm2d(num_features=64)   # for convolutional layers

x = torch.randn(32, 64, 28, 28)  # convolutional feature maps
y = bn2d(x)

# BatchNorm also behaves differently in train vs. eval mode
bn2d.train()  # uses batch statistics
bn2d.eval()   # uses running statistics
LayerNorm
# LayerNorm: normalizes each sample independently (common in Transformers)
layer_norm = nn.LayerNorm(normalized_shape=512)
x = torch.randn(32, 10, 512)  # 32 samples, 10 tokens, 512 dimensions
y = layer_norm(x)
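A small sketch verifying that LayerNorm normalizes each sample/token vector over its 512 features independently (mean close to 0, standard deviation close to 1), rather than normalizing across the batch like BatchNorm:

```python
import torch
import torch.nn as nn

layer_norm = nn.LayerNorm(normalized_shape=512)
x = torch.randn(32, 10, 512)
y = layer_norm(x)

# every (sample, token) vector is normalized over its own 512 features
print(y.mean(dim=-1).abs().max())   # close to 0
print(y.std(dim=-1).mean())         # close to 1
```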
🔗 Containers: Organizing Layers
nn.Sequential
# the simplest option: stack layers in order
model = nn.Sequential(
    nn.Linear(784, 256),
    nn.ReLU(),
    nn.Dropout(0.5),
    nn.Linear(256, 128),
    nn.ReLU(),
    nn.Dropout(0.5),
    nn.Linear(128, 10)
)

# usage
x = torch.randn(32, 784)
y = model(x)  # passes through all layers in order
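Sequential also supports indexing into its layers, and building it from an OrderedDict gives each layer a readable name. A brief sketch:

```python
import torch.nn as nn
from collections import OrderedDict

named_model = nn.Sequential(OrderedDict([
    ('fc1', nn.Linear(784, 256)),
    ('relu1', nn.ReLU()),
    ('fc2', nn.Linear(256, 10)),
]))

print(named_model[0])      # access by position: the first Linear layer
print(named_model.fc2)     # access by name
```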
nn.ModuleList
# when you need a variable number of layers
class DynamicNet(nn.Module):
    def __init__(self, layer_sizes):
        super().__init__()
        self.layers = nn.ModuleList([
            nn.Linear(layer_sizes[i], layer_sizes[i+1])
            for i in range(len(layer_sizes)-1)
        ])

    def forward(self, x):
        for layer in self.layers[:-1]:
            x = F.relu(layer(x))
        x = self.layers[-1](x)  # no activation after the last layer
        return x

model = DynamicNet([784, 512, 256, 128, 10])
nn.ModuleDict
# organize layers in a dictionary
class MultiHeadModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.backbone = nn.Linear(784, 256)
        self.heads = nn.ModuleDict({
            'classification': nn.Linear(256, 10),
            'regression': nn.Linear(256, 1),
            'embedding': nn.Linear(256, 128)
        })

    def forward(self, x, task='classification'):
        x = F.relu(self.backbone(x))
        return self.heads[task](x)

model = MultiHeadModel()
x = torch.randn(32, 784)
y = model(x, task='classification')
⚠️ Why must you use ModuleList/ModuleDict?
# ❌ Wrong: a plain Python list does not register its parameters
self.layers = [nn.Linear(10, 10) for _ in range(3)]
# model.parameters() will not include these layers!

# ✅ Correct: use ModuleList
self.layers = nn.ModuleList([nn.Linear(10, 10) for _ in range(3)])
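A sketch that makes the difference visible: parameters held in a plain Python list are invisible to parameters(), while a ModuleList registers them properly (the two class names are just for illustration):

```python
import torch.nn as nn

class PlainList(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = [nn.Linear(10, 10) for _ in range(3)]                 # not registered

class WithModuleList(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = nn.ModuleList([nn.Linear(10, 10) for _ in range(3)])  # registered

print(len(list(PlainList().parameters())))       # 0
print(len(list(WithModuleList().parameters())))  # 6: weight + bias for each of 3 layers
```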
📉 Loss Functions
import torch.nn as nn

# classification losses
criterion = nn.CrossEntropyLoss()    # multi-class classification
criterion = nn.BCEWithLogitsLoss()   # binary classification
criterion = nn.NLLLoss()             # negative log-likelihood

# regression losses
criterion = nn.MSELoss()             # mean squared error
criterion = nn.L1Loss()              # mean absolute error
criterion = nn.SmoothL1Loss()        # Huber loss

# usage
predictions = model(x)
loss = criterion(predictions, targets)
CrossEntropyLoss in Detail
# CrossEntropyLoss = LogSoftmax + NLLLoss,
# so the model's output must NOT be passed through Softmax first!
logits = torch.randn(32, 10)            # model output (unnormalized)
labels = torch.randint(0, 10, (32,))    # class labels
criterion = nn.CrossEntropyLoss()
loss = criterion(logits, labels)

# equivalent to:
log_probs = F.log_softmax(logits, dim=1)
loss = F.nll_loss(log_probs, labels)
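A quick numerical check of this equivalence, using fresh random logits and labels like the snippet above:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

logits = torch.randn(32, 10)
labels = torch.randint(0, 10, (32,))

loss_ce = nn.CrossEntropyLoss()(logits, labels)
loss_manual = F.nll_loss(F.log_softmax(logits, dim=1), labels)
print(torch.allclose(loss_ce, loss_manual))      # True
```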
🔧 Parameter Initialization
import torch.nn as nn
import torch.nn.init as init

# Option 1: initialize a single layer
layer = nn.Linear(10, 5)
init.xavier_uniform_(layer.weight)
init.zeros_(layer.bias)

# Option 2: initialize the whole model
def init_weights(m):
    if isinstance(m, nn.Linear):
        init.xavier_uniform_(m.weight)
        init.zeros_(m.bias)
    elif isinstance(m, nn.Conv2d):
        init.kaiming_normal_(m.weight)

model.apply(init_weights)

# common initialization functions
init.zeros_(tensor)            # all zeros
init.ones_(tensor)             # all ones
init.constant_(tensor, val)    # constant value
init.normal_(tensor)           # normal distribution
init.uniform_(tensor)          # uniform distribution
init.xavier_uniform_(tensor)   # Xavier uniform
init.xavier_normal_(tensor)    # Xavier normal
init.kaiming_uniform_(tensor)  # Kaiming uniform
init.kaiming_normal_(tensor)   # Kaiming normal
🎯 Hands-On: Building a Complete CNN
import torch
import torch.nn as nn
import torch.nn.functional as F

class CNN(nn.Module):
    """A simple CNN for MNIST"""
    def __init__(self):
        super().__init__()
        # convolutional layers
        self.conv1 = nn.Conv2d(1, 32, 3, padding=1)   # 1 → 32 channels
        self.conv2 = nn.Conv2d(32, 64, 3, padding=1)  # 32 → 64 channels
        self.pool = nn.MaxPool2d(2, 2)                # 2x2 pooling
        # fully connected layers
        self.fc1 = nn.Linear(64 * 7 * 7, 128)
        self.fc2 = nn.Linear(128, 10)
        # dropout
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        # input: (batch, 1, 28, 28)
        # conv block 1
        x = self.conv1(x)           # (batch, 32, 28, 28)
        x = F.relu(x)
        x = self.pool(x)            # (batch, 32, 14, 14)
        # conv block 2
        x = self.conv2(x)           # (batch, 64, 14, 14)
        x = F.relu(x)
        x = self.pool(x)            # (batch, 64, 7, 7)
        # flatten
        x = x.view(-1, 64 * 7 * 7)  # (batch, 3136)
        # fully connected
        x = F.relu(self.fc1(x))     # (batch, 128)
        x = self.dropout(x)
        x = self.fc2(x)             # (batch, 10)
        return x

# create the model
model = CNN()

# test
x = torch.randn(32, 1, 28, 28)
y = model(x)
print(f"Output shape: {y.shape}")  # [32, 10]

# count parameters
total_params = sum(p.numel() for p in model.parameters())
print(f"Number of parameters: {total_params:,}")
🆚 nn.functional vs nn.Module
| Feature | nn.Module | nn.functional |
|---|---|---|
| Holds parameters | ✅ | ❌ |
| Needs instantiation | ✅ | ❌ |
| Saves state | ✅ | ❌ |
| Respects train/eval mode | ✅ | ❌ |

# layers with parameters → must use nn.Module
self.conv = nn.Conv2d(...)      # ✅
self.bn = nn.BatchNorm2d(...)   # ✅

# parameter-free operations → prefer nn.functional
x = F.relu(x)                   # ✅ more concise
x = F.max_pool2d(x, 2)          # ✅
x = F.softmax(x, dim=-1)        # ✅

# Dropout is special: both forms work, but mind the train/eval mode
x = F.dropout(x, training=self.training)  # must pass training manually
x = self.dropout(x)                       # handles train/eval automatically
🏋️ Exercises
# Exercise 1: build a multi-layer perceptron with nn.Sequential
# Architecture: 784 → 512 → 256 → 128 → 10
# Requirement: each hidden layer followed by ReLU and Dropout(0.3)
# Your code:

# Exercise 2: implement the same network by subclassing nn.Module
# Your code:

# Exercise 3: write a custom module with a residual connection
# Your code:
Click to show the answers
import torch
import torch.nn as nn
import torch.nn.functional as F

# Exercise 1: Sequential version
mlp_sequential = nn.Sequential(
    nn.Linear(784, 512),
    nn.ReLU(),
    nn.Dropout(0.3),
    nn.Linear(512, 256),
    nn.ReLU(),
    nn.Dropout(0.3),
    nn.Linear(256, 128),
    nn.ReLU(),
    nn.Dropout(0.3),
    nn.Linear(128, 10)
)

# Exercise 2: nn.Module version
class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(784, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 128)
        self.fc4 = nn.Linear(128, 10)
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        x = self.dropout(F.relu(self.fc1(x)))
        x = self.dropout(F.relu(self.fc2(x)))
        x = self.dropout(F.relu(self.fc3(x)))
        x = self.fc4(x)
        return x

# Exercise 3: residual block
class ResidualBlock(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.fc1 = nn.Linear(dim, dim)
        self.fc2 = nn.Linear(dim, dim)
        self.bn1 = nn.BatchNorm1d(dim)
        self.bn2 = nn.BatchNorm1d(dim)

    def forward(self, x):
        identity = x            # keep the input
        out = self.fc1(x)
        out = self.bn1(out)
        out = F.relu(out)
        out = self.fc2(out)
        out = self.bn2(out)
        out = out + identity    # residual connection
        out = F.relu(out)
        return out

# test
x = torch.randn(32, 256)
block = ResidualBlock(256)
y = block(x)
print(f"Input shape: {x.shape}, output shape: {y.shape}")
Next Steps
Now that you understand torch.nn, the next step is to learn how automatic differentiation works, which is the foundation of training neural networks!