3. 网络层与损失函数

网络层¶

In [2]:

Copied!

import torch
import torch.nn as nn
import torch
import torch.nn as nn

In [3]:

Copied!





# torch.nn.Module: base class of every neural-network module. A custom model
# subclasses it and implements forward().
class MyModel(nn.Module):
    """Minimal custom module: a single 4 -> 2 fully connected layer."""

    def __init__(self):
        super().__init__()
        # Assigning a module as an attribute registers it (and its parameters).
        self.fc = nn.Linear(4, 2)

    def forward(self, x):
        """Apply the linear layer to ``x``; the last dimension of ``x`` must be 4."""
        return self.fc(x)


model = MyModel()
x = torch.randn(3, 4)   # 3 samples, 4 features each
print(model(x).shape)   # torch.Size([3, 2])

torch.Size([3, 2])

In [4]:

Copied!





# torch.nn.Linear: fully connected layer applying the affine map y = x W^T + b;
# the basic building block of MLPs and feature projections.
fc = nn.Linear(in_features=8, out_features=4)
x = torch.randn(2, 8)          # batch_size=2, input dim=8
print(fc(x).shape)             # torch.Size([2, 4])
print(fc.weight.shape)         # torch.Size([4, 8])  stored as (out_features, in_features)
print(fc.bias.shape)           # torch.Size([4])

torch.Size([2, 4])
torch.Size([4, 8])
torch.Size([4])

In [5]:

Copied!





# torch.nn.Embedding: maps discrete integer IDs to dense vectors. It is a
# learnable lookup table and a core component of recommender models.
emb = nn.Embedding(num_embeddings=10, embedding_dim=4)  # table of 10 IDs, 4-dim vectors
ids = torch.tensor([0, 3, 7, 3])                         # input IDs; ID 3 appears twice
print(emb(ids))                # rows 1 and 3 are identical: both look up ID 3
print(emb(ids).shape)          # torch.Size([4, 4])
print(emb.weight.shape)        # torch.Size([10, 4])  the whole embedding table

tensor([[ 0.2682, -0.4815,  0.2506, -0.1872],
        [-0.4115, -1.5965, -0.8434, -1.3801],
        [-1.2828, -0.8709,  0.4438,  1.0211],
        [-0.4115, -1.5965, -0.8434, -1.3801]], grad_fn=<EmbeddingBackward0>)
torch.Size([4, 4])
torch.Size([10, 4])

In [ ]:

Copied!





# torch.nn.Identity: placeholder layer whose output is exactly its input.
# Useful for conditionally skipping a layer or as the drop-in replacement in
# ablation experiments.
layer = nn.Identity()
x = torch.randn(3, 5)
print(x)
print(layer(x))
print(torch.equal(layer(x), x))   # True

tensor([[ 0.1399,  0.0206, -0.7069, -1.1237,  0.7433],
        [-0.9603, -0.6889,  0.6085,  0.3973,  2.3722],
        [ 0.9137, -1.8052,  1.0280, -1.2020, -2.0843]])
tensor([[ 0.1399,  0.0206, -0.7069, -1.1237,  0.7433],
        [-0.9603, -0.6889,  0.6085,  0.3973,  2.3722],
        [ 0.9137, -1.8052,  1.0280, -1.2020, -2.0843]])
True

In [7]:

Copied!





# Convolution layers. With kernel_size=3, padding=1 and the default stride of 1
# the spatial size of the input is preserved; only the channel count changes.

# torch.nn.Conv1d: 1-D convolution for sequence/text features; input is (N, C_in, L).
conv1d = nn.Conv1d(in_channels=16, out_channels=32, kernel_size=3, padding=1)
x = torch.randn(2, 16, 50)        # (batch=2, channels=16, length=50)
print(conv1d(x).shape)            # torch.Size([2, 32, 50])


# torch.nn.Conv2d: 2-D convolution, the core layer for image features; input is (N, C_in, H, W).
conv2d = nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, padding=1)
x = torch.randn(2, 3, 32, 32)     # (batch=2, RGB=3, H=32, W=32)
print(conv2d(x).shape)            # torch.Size([2, 64, 32, 32])


# torch.nn.Conv3d: 3-D convolution for volumetric data such as video or medical
# scans; input is (N, C_in, D, H, W).
conv3d = nn.Conv3d(in_channels=1, out_channels=8, kernel_size=3, padding=1)
x = torch.randn(1, 1, 8, 16, 16)
print(conv3d(x).shape)            # torch.Size([1, 8, 8, 16, 16])

torch.Size([2, 32, 50])
torch.Size([2, 64, 32, 32])
torch.Size([1, 8, 8, 16, 16])

In [8]:

Copied!





# torch.nn.ConvTranspose2d: transposed convolution (sometimes called
# "deconvolution"). It produces a higher-resolution feature map from a
# lower-resolution one, e.g. in GAN generators or the up-sampling path of U-Net.
# Output size here: (H_in - 1) * stride + kernel_size = 15 * 2 + 2 = 32.
up = nn.ConvTranspose2d(in_channels=64, out_channels=32, kernel_size=2, stride=2)
x = torch.randn(2, 64, 16, 16)    # low-resolution feature map
print(up(x).shape)                # torch.Size([2, 32, 32, 32])  spatial size doubled

torch.Size([2, 32, 32, 32])

BatchNorm：对每个通道，跨 batch 维度（以及空间/序列位置）统计均值和方差后进行归一化

LayerNorm：对每个样本，在指定的特征维度上整体归一化

GroupNorm：对每个样本，将通道划分为若干组，在每组内归一化

In [9]:

Copied!





# Batch normalisation layers. A freshly built module is in training mode, so
# each call below normalises with the statistics of the current mini-batch.
# The output always has the same shape as the input.

# torch.nn.BatchNorm1d: batch norm over 1-D feature (or sequence) data; proposed
# to reduce internal covariate shift.
bn1d = nn.BatchNorm1d(num_features=16)
x = torch.randn(8, 16)            # (batch=8, features=16)
print(bn1d(x).shape)              # torch.Size([8, 16])


# torch.nn.BatchNorm2d: per-channel batch norm for 2-D (image) data.
bn2d = nn.BatchNorm2d(num_features=64)
x = torch.randn(4, 64, 8, 8)      # (N, C, H, W)
print(bn2d(x).shape)              # torch.Size([4, 64, 8, 8])


# torch.nn.BatchNorm3d: per-channel batch norm for 3-D volumetric data.
bn3d = nn.BatchNorm3d(num_features=8)
x = torch.randn(2, 8, 4, 4, 4)
print(bn3d(x).shape)              # torch.Size([2, 8, 4, 4, 4])

torch.Size([8, 16])
torch.Size([4, 64, 8, 8])
torch.Size([2, 8, 4, 4, 4])

In [ ]:

Copied!





# torch.nn.GroupNorm: splits the channels into groups and normalises within each
# group, per sample. It does not depend on the batch size, so it suits small batches.
gn = nn.GroupNorm(num_groups=4, num_channels=16)   # 4 groups of 4 channels
x = torch.randn(2, 16, 8, 8)      # (N, C, H, W)
print(gn(x).shape)                # torch.Size([2, 16, 8, 8])

# torch.nn.LayerNorm: normalises each sample independently over the trailing
# dimension(s) given by normalized_shape; the standard choice in Transformers.
ln = nn.LayerNorm(normalized_shape=16)
x = torch.randn(4, 10, 16)        # (batch, seq_len, d_model)
print(ln(x).shape)                # torch.Size([4, 10, 16])

torch.Size([2, 16, 8, 8])
torch.Size([4, 10, 16])

In [11]:

Copied!





# torch.nn.SyncBatchNorm: batch norm whose statistics are synchronised across
# devices, for consistent multi-GPU distributed training. Existing BN layers are
# usually converted with nn.SyncBatchNorm.convert_sync_batchnorm(model).
sync_bn = nn.SyncBatchNorm(num_features=16)
# No process group is initialised here, so the layer falls back to ordinary
# batch normalisation.
# NOTE(review): older PyTorch releases reject CPU input outright - confirm the
# minimum supported version.
x = torch.randn(4, 16)
print(sync_bn(x).shape)           # torch.Size([4, 16])


# torch.nn.LocalResponseNorm: local response normalisation (LRN) across
# neighbouring channels, as used in the original AlexNet.
lrn = nn.LocalResponseNorm(size=5)  # normalise over a window of 5 adjacent channels
x = torch.randn(2, 16, 8, 8)
print(lrn(x).shape)               # torch.Size([2, 16, 8, 8])

torch.Size([4, 16])
torch.Size([2, 16, 8, 8])

In [12]:

Copied!





# Element-wise activation functions, all evaluated on the same input x.

# torch.nn.ReLU: rectified linear unit, max(0, x). The most common activation;
# helps against vanishing gradients.
relu = nn.ReLU()
x = torch.tensor([-2.0, -1.0, 0.0, 1.0, 2.0])
print(relu(x))     # tensor([0., 0., 0., 1., 2.])


# torch.nn.Sigmoid: squashes any real number into (0, 1); used for binary
# classification outputs and attention/gating values.
sigmoid = nn.Sigmoid()
print(sigmoid(x))  # tensor([0.1192, 0.2689, 0.5000, 0.7311, 0.8808])


# torch.nn.Tanh: hyperbolic tangent, squashes the input into (-1, 1); common in
# RNN hidden states.
tanh = nn.Tanh()
print(tanh(x))     # tensor([-0.9640, -0.7616,  0.0000,  0.7616,  0.9640])

tensor([0., 0., 0., 1., 2.])
tensor([0.1192, 0.2689, 0.5000, 0.7311, 0.8808])
tensor([-0.9640, -0.7616,  0.0000,  0.7616,  0.9640])

In [13]:

Copied!





# torch.nn.MaxPool2d: takes the maximum inside each window, keeping the most
# salient response; the usual down-sampling layer in CNNs.
maxpool = nn.MaxPool2d(kernel_size=2, stride=2)
x = torch.randn(2, 64, 32, 32)
print(maxpool(x).shape)           # torch.Size([2, 64, 16, 16])  spatial size halved


# torch.nn.AvgPool2d: takes the mean inside each window, giving smoother
# features than max pooling.
avgpool = nn.AvgPool2d(kernel_size=2, stride=2)
print(avgpool(x).shape)           # torch.Size([2, 64, 16, 16])

torch.Size([2, 64, 16, 16])
torch.Size([2, 64, 16, 16])

In [14]:

Copied!





# torch.nn.AdaptiveMaxPool2d: adaptive max pooling. The window and stride are
# derived automatically so the output has the requested spatial size.
ada_max = nn.AdaptiveMaxPool2d(output_size=(4, 4))
x = torch.randn(2, 64, 13, 13)    # any input size works, here 13x13
print(ada_max(x).shape)           # torch.Size([2, 64, 4, 4])


# torch.nn.AdaptiveAvgPool2d: adaptive average pooling; with output_size=(1, 1)
# it is global average pooling (GAP).
ada_avg = nn.AdaptiveAvgPool2d(output_size=(1, 1))
print(ada_avg(x).shape)           # torch.Size([2, 64, 1, 1])  per-channel global mean

torch.Size([2, 64, 4, 4])
torch.Size([2, 64, 1, 1])

In [15]:

Copied!





# Dropout layers. Modules start in training mode, so dropout is active below;
# it is only disabled after switching the module to eval mode with .eval().

# torch.nn.Dropout: zeroes each element with probability p to fight
# over-fitting. Surviving elements are scaled by 1 / (1 - p), which is why the
# ones below come out as 2.
dropout = nn.Dropout(p=0.5)
x = torch.ones(2, 10)
print(dropout(x))                 # roughly half of the entries are 0, the rest 2


# torch.nn.Dropout2d: zeroes whole channels (feature maps) with probability p;
# meant for 2-D convolutional features.
dropout2d = nn.Dropout2d(p=0.3)
x = torch.ones(2, 8, 4, 4)
# Random: with 16 channels the chance that none is dropped is 0.7 ** 16 (~0.3 %),
# so this prints True almost always.
print((dropout2d(x) == 0).any())


# torch.nn.Dropout3d: zeroes whole channels of 3-D volumetric features with
# probability p.
dropout3d = nn.Dropout3d(p=0.3)
x = torch.ones(2, 8, 4, 4, 4)
print(dropout3d(x).shape)         # torch.Size([2, 8, 4, 4, 4])

tensor([[2., 2., 0., 0., 0., 0., 2., 0., 0., 0.],
        [2., 2., 2., 2., 0., 2., 2., 0., 2., 0.]])
tensor(True)
torch.Size([2, 8, 4, 4, 4])

一、为什么普通 Tensor 不行？

看下面两种写法的区别：

错误写法：

self.weight = torch.randn(dim, requires_grad=True)

虽然它能算梯度，但：

它不会出现在 model.parameters() 里。

优化器看不到它。

因此不会被更新。

————————————

二、Parameter 做了什么？

self.weight = nn.Parameter(torch.randn(dim))

它做了两件事：

1）自动设置 requires_grad=True

2）注册到模块内部参数列表中

所以：

会出现在 model.parameters()
会被 optimizer 更新
会被 state_dict 保存

三、实践中为什么“感觉没用到”？

因为你一直在用。

例如：

nn.Linear
nn.Conv2d
nn.LayerNorm

它们内部的 weight 和 bias，都是 nn.Parameter。

只是你没有手动写。

四、什么时候需要手动使用？

当你想定义“自定义可学习变量”时。

例如：

1）自定义缩放因子

2）可学习温度参数

3）自定义 attention 权重

4）门控参数

例如：

self.temperature = nn.Parameter(torch.tensor(1.0))

否则它只是普通张量，不会更新。

五、和梯度机制的关系

是否计算梯度，取决于：

requires_grad=True

是否参与优化器更新，取决于：

是否在 model.parameters() 中。

Parameter 解决的是第二个问题。

六、本质总结

Parameter 的意义不只是“开启梯度”（尽管它默认会设置 requires_grad=True）。

而是：

告诉 PyTorch：这是模型的组成部分，应当被训练。

————————————

七、一句话精确总结

nn.Parameter 的作用是：

“将一个张量声明为模型的可学习权重，使其被自动注册、保存和优化。”

它是模型结构管理工具，而不仅仅是梯度工具。

In [16]:

Copied!





# torch.nn.Parameter: wraps a tensor as a module parameter. Once assigned as a
# module attribute it shows up in model.parameters() and takes part in
# gradient updates.
class CustomLayer(nn.Module):
    """Element-wise learnable scaling: multiplies the input by a weight vector."""

    def __init__(self, dim):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(dim))  # learnable, registered parameter

    def forward(self, x):
        """Return ``x * weight``; ``x`` must broadcast against a ``(dim,)`` vector."""
        return x * self.weight


layer = CustomLayer(4)
print(list(layer.parameters()))   # [Parameter containing: tensor([...], requires_grad=True)]

[Parameter containing:
tensor([ 0.4569,  1.1525, -0.4448, -0.8808], requires_grad=True)]

In [17]:

Copied!





# torch.nn.ParameterList: holds several nn.Parameter objects in a list.
# Supports indexing, and every entry is properly registered on the module.
class MultiHeadWeights(nn.Module):
    """Container of ``n_heads`` independent ``(dim, dim)`` weight matrices."""

    def __init__(self, n_heads, dim):
        super().__init__()
        self.weights = nn.ParameterList(
            [nn.Parameter(torch.randn(dim, dim)) for _ in range(n_heads)]
        )


m = MultiHeadWeights(4, 8)
print(len(list(m.parameters())))  # 4


# torch.nn.ParameterDict: holds several nn.Parameter objects in a dict.
# Supports access by name, and every entry is properly registered on the module.
class GatedLayer(nn.Module):
    """Container of two named 8-dim parameters, ``gate`` and ``bias``."""

    def __init__(self):
        super().__init__()
        self.params = nn.ParameterDict({
            'gate': nn.Parameter(torch.randn(8)),
            'bias': nn.Parameter(torch.zeros(8))
        })


g = GatedLayer()
print(g.params['gate'].shape)     # torch.Size([8])

4
torch.Size([8])

In [18]:

Copied!





# torch.nn.Unfold: extracts sliding local windows from the input and lays them
# out as columns (im2col), the classic way to express convolution as a matrix
# product.
unfold = nn.Unfold(kernel_size=3, stride=1, padding=1)
x = torch.randn(1, 3, 8, 8)       # (N, C, H, W)
out = unfold(x)
print(out.shape)                  # torch.Size([1, 27, 64])  27 = 3 channels * 3 * 3, 64 = 8 * 8 windows


# torch.nn.Fold: recombines the columns into a feature map. Overlapping
# positions are summed, so with overlapping windows `restored` has the shape of
# x but not its values.
fold = nn.Fold(output_size=(8, 8), kernel_size=3, stride=1, padding=1)
restored = fold(out)
print(restored.shape)             # torch.Size([1, 3, 8, 8])

torch.Size([1, 27, 64])
torch.Size([1, 3, 8, 8])

In [19]:

Copied!





# torch.nn.Sequential: ordered container. Modules run in the order they are
# passed to the constructor; convenient for simple feed-forward stacks.
model = nn.Sequential(
    nn.Linear(16, 64),
    nn.ReLU(),
    nn.Dropout(0.2),
    nn.Linear(64, 10)
)
x = torch.randn(4, 16)            # (batch=4, features=16)
print(model(x).shape)             # torch.Size([4, 10])

torch.Size([4, 10])

nn.Parameter 解决的是“一个张量是否作为模型可训练权重被注册和更新”的问题；ParameterList/ParameterDict 是在此基础上，用列表或字典的形式管理多个独立参数，适用于你需要手动定义一组可学习矩阵或向量（例如多头独立权重、可学习温度、门控系数）的场景。而 ModuleList/ModuleDict 管理的不是张量，而是“完整子网络结构”（如 Linear、Conv、Attention 等模块），用于构建可动态遍历或按名称选择的网络结构，例如多层堆叠网络、MoE 专家网络、多分支条件计算图等。前者解决“权重注册问题”，后者解决“子结构注册与管理问题”；本质都是为了让模型结构被 PyTorch 正确追踪、保存、迁移和优化，但管理层级不同——Parameter 管权重，Module 管结构。

In [20]:

Copied!





# torch.nn.ModuleList: holds sub-modules in a list. Their parameters are
# registered, and the list supports indexing and iteration.
layers = nn.ModuleList([nn.Linear(8, 8) for _ in range(4)])
x = torch.randn(2, 8)
for layer in layers:              # apply the four layers one after another
    x = layer(x)
print(x.shape)                    # torch.Size([2, 8])


# torch.nn.ModuleDict: holds sub-modules in a dict for access by name; a good
# fit for networks with conditional branches.
experts = nn.ModuleDict({
    'expert_a': nn.Linear(8, 4),
    'expert_b': nn.Linear(8, 4),
})
x = torch.randn(2, 8)
print(experts['expert_a'](x).shape)  # torch.Size([2, 4])

torch.Size([2, 8])
torch.Size([2, 4])

In [21]:

Copied!





# Recurrent layers. With batch_first=True the input and `out` are
# (batch, seq_len, feature); the final states stay (num_layers, batch, hidden).

# torch.nn.RNN: basic recurrent network that updates a hidden state step by
# step; suffers from vanishing gradients on long sequences.
rnn = nn.RNN(input_size=8, hidden_size=16, num_layers=2, batch_first=True)
x = torch.randn(4, 10, 8)         # (batch=4, seq_len=10, input=8)
out, h_n = rnn(x)
print(out.shape, h_n.shape)       # torch.Size([4, 10, 16]) torch.Size([2, 4, 16])


# torch.nn.LSTM: long short-term memory. Forget/input/output gates control the
# information flow, which eases vanishing gradients on long sequences.
lstm = nn.LSTM(input_size=8, hidden_size=16, num_layers=2, batch_first=True)
out, (h_n, c_n) = lstm(x)         # the LSTM also returns the final cell state c_n
print(out.shape, h_n.shape)       # torch.Size([4, 10, 16]) torch.Size([2, 4, 16])


# torch.nn.GRU: gated recurrent unit. Fewer parameters than an LSTM, usually
# comparable quality, faster to train.
gru = nn.GRU(input_size=8, hidden_size=16, num_layers=2, batch_first=True)
out, h_n = gru(x)
print(out.shape)                  # torch.Size([4, 10, 16])

torch.Size([4, 10, 16]) torch.Size([2, 4, 16])
torch.Size([4, 10, 16]) torch.Size([2, 4, 16])
torch.Size([4, 10, 16])

In [22]:

Copied!





# torch.nn.LSTMCell: a single LSTM step. It processes one time step per call,
# for cases where the time loop has to be written by hand.
lstm_cell = nn.LSTMCell(input_size=8, hidden_size=16)
x_t = torch.randn(4, 8)           # input of one time step, (batch, input_size)
h_t = torch.zeros(4, 16)          # initial hidden state
c_t = torch.zeros(4, 16)          # initial cell state
h_t, c_t = lstm_cell(x_t, (h_t, c_t))
print(h_t.shape, c_t.shape)       # torch.Size([4, 16]) torch.Size([4, 16])


# torch.nn.GRUCell: a single GRU step, with the same per-step flexibility as
# LSTMCell.
gru_cell = nn.GRUCell(input_size=8, hidden_size=16)
h_t = torch.zeros(4, 16)          # fresh hidden state for the GRU
h_t = gru_cell(x_t, h_t)
print(h_t.shape)                  # torch.Size([4, 16])

torch.Size([4, 16]) torch.Size([4, 16])
torch.Size([4, 16])

In [23]:

Copied!





# torch.nn.MultiheadAttention: multi-head attention, the core block of the
# Transformer. Several Query-Key-Value attentions run in parallel to capture
# dependencies in different sub-spaces.
mha = nn.MultiheadAttention(embed_dim=64, num_heads=8, batch_first=True)
q = k = v = torch.randn(4, 10, 64)  # self-attention: (batch=4, seq_len=10, d_model=64)
attn_out, attn_weights = mha(q, k, v)
print(attn_out.shape)             # torch.Size([4, 10, 64])
print(attn_weights.shape)         # torch.Size([4, 10, 10])  weights averaged over the heads


# torch.nn.Transformer: full encoder-decoder Transformer, for machine
# translation and other sequence-to-sequence tasks.
transformer = nn.Transformer(
    d_model=64, nhead=8, num_encoder_layers=3, num_decoder_layers=3, batch_first=True
)
src = torch.randn(4, 10, 64)      # encoder input
tgt = torch.randn(4, 6, 64)       # decoder input
out = transformer(src, tgt)
print(out.shape)                  # torch.Size([4, 6, 64])  same shape as tgt

torch.Size([4, 10, 64])
torch.Size([4, 10, 10])
torch.Size([4, 6, 64])

损失函数¶

In [24]:

Copied!





# torch.nn.MSELoss: mean squared error between prediction and target; the
# standard regression loss.
mse = nn.MSELoss()
pred = torch.tensor([2.5, 3.0, 4.0])
target = torch.tensor([3.0, 3.0, 3.5])
print(mse(pred, target))          # tensor(0.1667) = (0.25 + 0 + 0.25) / 3


# torch.nn.BCELoss: binary cross-entropy. The input must already be
# probabilities (i.e. passed through a sigmoid). Used for binary or
# multi-label classification.
bce = nn.BCELoss()
pred_sigmoid = torch.sigmoid(torch.tensor([0.8, -0.5, 1.2]))
target_bin = torch.tensor([1.0, 0.0, 1.0])
print(bce(pred_sigmoid, target_bin))   # tensor(0.3695)


# torch.nn.NLLLoss: negative log-likelihood. The input must be log-probabilities
# (typically from LogSoftmax). Used for multi-class classification.
nll = nn.NLLLoss()
log_probs = torch.log_softmax(torch.randn(4, 5), dim=1)
labels = torch.tensor([0, 2, 1, 4])    # one class index per sample
print(nll(log_probs, labels))          # random input, value varies per run

tensor(0.1667)
tensor(0.3695)
tensor(2.3808)

In [25]:

Copied!





# torch.nn.L1Loss: mean absolute error (MAE). More robust to outliers than MSE;
# its gradient is discontinuous at zero.
l1 = nn.L1Loss()
pred = torch.tensor([2.5, 3.0, 4.0])
target = torch.tensor([3.0, 3.0, 3.5])
print(l1(pred, target))           # tensor(0.3333)


# torch.nn.SmoothL1Loss: smooth L1 loss (equals the Huber loss for the default
# beta=1). Quadratic for small errors, linear for large ones, combining stability
# and robustness; common in the regression branch of object detectors.
smooth_l1 = nn.SmoothL1Loss()
# All errors here are below beta, so the loss is 0.5 * error**2 per element:
# half the MSE (0.0833), i.e. smaller than both the L1 and the MSE value.
print(smooth_l1(pred, target))    # tensor(0.0833)

tensor(0.3333)
tensor(0.0833)

In [26]:

Copied!





# torch.nn.CrossEntropyLoss: cross-entropy that fuses LogSoftmax and NLLLoss.
# It takes raw logits directly (no manual softmax) and is the default loss for
# multi-class classification.
ce = nn.CrossEntropyLoss()
logits = torch.randn(4, 5)        # (batch=4, num_classes=5) raw logits
labels = torch.tensor([0, 2, 1, 4])
print(ce(logits, labels))

# Per-class weights help with class imbalance. With the default 'mean'
# reduction the result is normalised by the summed weights of the targets.
weights = torch.tensor([1.0, 1.0, 2.0, 1.0, 1.0])  # class index 2 counts double
ce_weighted = nn.CrossEntropyLoss(weight=weights)
print(ce_weighted(logits, labels))

tensor(1.3970)
tensor(1.4000)

In [ ]:

Copied!





# torch.nn.PoissonNLLLoss: negative log-likelihood under a Poisson distribution,
# for regression on count-valued targets (e.g. number of clicks or events).
# With log_input=True the input is log(lambda) and the per-element loss is
# exp(input) - target * input. The constant Stirling term is omitted by default
# (full=False), which is why the value can be negative.
poisson_nll = nn.PoissonNLLLoss(log_input=True)
log_rate = torch.tensor([0.5, 1.0, 2.0])         # predicted log(lambda)
target   = torch.tensor([1.0, 2.0, 5.0])         # observed counts
print(poisson_nll(log_rate, target))             # tensor(-0.2480)

tensor(-0.2480)

In [ ]:

Copied!





# torch.nn.BCEWithLogitsLoss: sigmoid and BCELoss fused into one op, which is
# numerically more stable (log-sum-exp trick). Prefer it over a manual
# Sigmoid + BCELoss.
bce_logits = nn.BCEWithLogitsLoss()
logits = torch.tensor([2.0, -1.0, 0.5, -3.0])   # raw logits, no sigmoid applied
target = torch.tensor([1.0,  0.0, 1.0,  0.0])
print(bce_logits(logits, target))               # tensor(0.2407)

tensor(0.2407)

In [ ]:

Copied!





# torch.nn.KLDivLoss: Kullback-Leibler divergence between a predicted and a
# target distribution. The input must be log-probabilities, the target
# probabilities. Common in knowledge distillation and VAEs.
# reduction='batchmean' sums the loss and divides by the batch size.
kl = nn.KLDivLoss(reduction='batchmean')
log_pred = torch.log_softmax(torch.randn(4, 5), dim=1)  # student log-probabilities
target_dist = torch.softmax(torch.randn(4, 5), dim=1)   # teacher probabilities
print(kl(log_pred, target_dist))

tensor(0.4662)

In [ ]:

Copied!





# torch.nn.CosineEmbeddingLoss: cosine-similarity loss for pairs of vectors.
# label=1 pulls the pair together (loss 1 - cos), label=-1 pushes it apart
# (loss max(0, cos - margin)). Used in metric learning, e.g. two-tower retrieval
# models or sentence similarity.
cos_emb = nn.CosineEmbeddingLoss(margin=0.0)
x1 = torch.randn(4, 8)                # user / query vectors
x2 = torch.randn(4, 8)                # item / document vectors
label = torch.tensor([1, -1, 1, -1])  # 1 = positive pair, -1 = negative pair
print(cos_emb(x1, x2, label))

tensor(0.6027)

In [ ]:

Copied!





# torch.nn.HingeEmbeddingLoss: hinge loss on distances. For label=1 the loss is
# the input itself, for label=-1 it is max(0, margin - x). Used for
# distance-based embedding learning (SVM-style metric learning).
hinge_emb = nn.HingeEmbeddingLoss(margin=1.0)
x = torch.tensor([0.8, 1.5, 0.3, 2.0])   # distances
label = torch.tensor([1, -1, 1, -1])     # 1 = similar, -1 = dissimilar
print(hinge_emb(x, label))               # tensor(0.2750) = (0.8 + 0 + 0.3 + 0) / 4

tensor(0.2750)

In [ ]:

Copied!





# torch.nn.MarginRankingLoss: pairwise ranking loss max(0, -y * (x1 - x2) + margin).
# It asks the positive score to beat the negative one by at least `margin`.
# Common in learning-to-rank and pairwise recommender training.
margin_rank = nn.MarginRankingLoss(margin=0.3)
x1 = torch.tensor([0.9, 0.6, 0.8])   # scores of the positive samples
x2 = torch.tensor([0.3, 0.7, 0.5])   # scores of the negative samples
label = torch.ones(3)                # 1 = x1 should rank above x2
# Only the second pair violates the margin: (0 + 0.4 + 0) / 3.
print(margin_rank(x1, x2, label))    # tensor(0.1333)

tensor(0.1333)

In [ ]:

Copied!





# torch.nn.TripletMarginLoss: triplet loss max(0, d(a, p) - d(a, n) + margin).
# It is zero once d(anchor, positive) + margin <= d(anchor, negative). Used in
# metric learning (face recognition, image-text matching) to pull similar
# samples together and push dissimilar ones apart.
triplet = nn.TripletMarginLoss(margin=1.0)
anchor   = torch.randn(4, 8)
positive = torch.randn(4, 8)    # same class as the anchor
negative = torch.randn(4, 8)    # different class from the anchor
print(triplet(anchor, positive, negative))

tensor(0.4683)

In [ ]:

Copied!





# torch.nn.CTCLoss: Connectionist Temporal Classification loss. Used for
# sequence tasks such as speech or handwriting recognition, where no frame-level
# alignment between input and labels is available.
# blank=0 reserves class 0 for the CTC blank, so the targets use classes 1..9.
ctc = nn.CTCLoss(blank=0, reduction='mean')
# log_probs: (T=input length, N=batch, C=number of classes incl. blank)
log_probs = torch.log_softmax(torch.randn(20, 4, 10), dim=2)
targets = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8])  # all target sequences concatenated
input_lengths  = torch.full((4,), 20, dtype=torch.long)        # input length per sample
target_lengths = torch.tensor([2, 2, 2, 2], dtype=torch.long)  # target length per sample (sums to 8)
print(ctc(log_probs, targets, input_lengths, target_lengths))

tensor(19.9956)

In [ ]:

Copied!





# torch.nn.MultiLabelSoftMarginLoss: multi-label classification loss. It applies
# a sigmoid-based cross-entropy to every class independently, so one sample can
# belong to several classes. The target is a multi-hot matrix.
ml_soft = nn.MultiLabelSoftMarginLoss()
logits = torch.randn(4, 5)        # (batch=4, num_classes=5)
target = torch.tensor([[1,0,1,0,0],[0,1,0,1,0],[1,1,0,0,0],[0,0,0,1,1]], dtype=torch.float)
print(ml_soft(logits, target))


# torch.nn.MultiLabelMarginLoss: multi-label hinge loss. A margin forces every
# positive class to score above every negative class, so it is more
# ranking-sensitive than the soft-margin variant. The target holds class
# INDICES: the first -1 ends the label list and everything after it is ignored.
ml_margin = nn.MultiLabelMarginLoss()
logits2 = torch.randn(2, 5)
target2 = torch.tensor([[0, 2, -1, -1, -1],  # sample 0: classes 0 and 2
                         [1, 3,  4, -1, -1]], dtype=torch.long)  # sample 1: classes 1, 3, 4
print(ml_margin(logits2, target2))

tensor(0.9439)
tensor(1.7488)

In [ ]:

Copied!





# torch.nn.SoftMarginLoss: smooth version of the binary hinge loss, using
# log(1 + exp(-y * x)) instead of the hard hinge. Labels must be +1 or -1.
soft_margin = nn.SoftMarginLoss()
logits = torch.tensor([1.5, -0.5, 2.0, -1.0])
labels = torch.tensor([1.0, -1.0, 1.0, -1.0])
print(soft_margin(logits, labels))       # tensor(0.2789)


# torch.nn.MultiMarginLoss: multi-class hinge loss. The true class must score
# at least `margin` above every other class; this is the multi-class linear
# SVM objective.
multi_margin = nn.MultiMarginLoss(margin=1.0)
logits = torch.randn(4, 5)        # (batch=4, num_classes=5); rebinds `logits`
labels = torch.tensor([0, 2, 1, 4])
print(multi_margin(logits, labels))

tensor(0.2789)
tensor(0.7076)