代码清单3-5 多头注意力

import torch
import torch.nn as nn

class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention.

    The input is projected to ``d_out`` features for queries, keys and
    values, split into ``num_heads`` heads of ``d_out // num_heads``
    features, attended under an upper-triangular (causal) mask, merged
    back to ``d_out`` features and passed through ``out_proj``.

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Total number of output features across all heads.
        context_length: Side length of the square causal mask buffer.
        dropout: Dropout probability applied to the attention weights.
        num_heads: Number of heads; must divide ``d_out`` evenly.
        qkv_bias: Whether the query/key/value projections use a bias.
    """

    def __init__(self, d_in, d_out,
            context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert (d_out % num_heads == 0), \
                "d_out must be divisible by num_heads"
        self.d_out = d_out
        self.num_heads = num_heads
        # Each head works on an equal slice of the d_out projected features.
        self.head_dim = d_out // num_heads
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        # Linear layer that combines the concatenated head outputs.
        self.out_proj = nn.Linear(d_out, d_out)
        self.dropout = nn.Dropout(dropout)
        # Ones strictly above the diagonal mark the positions to hide;
        # stored as a buffer rather than a parameter.
        self.register_buffer(
                "mask",
                torch.triu(torch.ones(context_length, context_length),
                    diagonal=1)
        )

    def forward(self, x):
        """Map ``x`` of shape (b, num_tokens, d_in) to (b, num_tokens, d_out)."""
        b, num_tokens, _ = x.shape

        # Projections, each of shape (b, num_tokens, d_out).
        keys = self.W_key(x)
        queries = self.W_query(x)
        values = self.W_value(x)

        # Unroll the last dimension into heads:
        # (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim)
        head_shape = (b, num_tokens, self.num_heads, self.head_dim)
        keys = keys.view(*head_shape)
        values = values.view(*head_shape)
        queries = queries.view(*head_shape)

        # Move the head axis forward:
        # (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim)
        keys = keys.transpose(1, 2)
        values = values.transpose(1, 2)
        queries = queries.transpose(1, 2)

        # Per-head dot products: (b, num_heads, num_tokens, num_tokens).
        attn_scores = queries @ keys.transpose(2, 3)

        # Slice the mask to the current sequence length BEFORE casting, so
        # only a (num_tokens, num_tokens) window is converted per call
        # instead of the full (context_length, context_length) buffer.
        mask_bool = self.mask[:num_tokens, :num_tokens].bool()
        attn_scores.masked_fill_(mask_bool, -torch.inf)

        # Scale by sqrt(head_dim) before the softmax over the key axis.
        attn_weights = torch.softmax(
                attn_scores / keys.shape[-1]**0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        # (b, num_heads, num_tokens, head_dim) -> (b, num_tokens, num_heads, head_dim)
        context_vec = (attn_weights @ values).transpose(1, 2)
        # Merge the heads; d_out == num_heads * head_dim.
        context_vec = context_vec.contiguous().view(
                b, num_tokens, self.d_out
        )
        # Final linear projection over the merged heads.
        context_vec = self.out_proj(context_vec)
        return context_vec

# Toy input: six 3-dimensional embeddings, one row per word.
inputs = torch.tensor(
    [[0.43, 0.15, 0.89], # Your     (x^1)
    [0.55, 0.87, 0.66], # journey  (x^2)
    [0.57, 0.85, 0.64], # starts
    [0.22, 0.58, 0.33], # with
    [0.77, 0.25, 0.10], # one
    [0.05, 0.80, 0.55]] # step
)
# Stack two copies of the same sequence to form a batch.
batch = torch.stack((inputs, inputs), dim=0)

# Seed before constructing the module so the weight initialisation is reproducible.
torch.manual_seed(123)
batch_size, context_length, d_in = batch.shape # (2, 6, 3)
d_out = 2
# Dropout probability 0 and two heads, so head_dim = d_out // num_heads = 1.
mha = MultiHeadAttention(d_in, d_out, context_length, 0, num_heads=2)
context_vecs = mha(batch)
print(context_vecs.shape, context_vecs)

代码清单4-2 层归一化类

层归一化:减去均值,并将结果除以方差的平方根(也就是标准差)。归一化后的层输出也包含负值,其均值为 0,方差为 1

层归一化,提高神经网络训练的稳定性和效率。层归一化的主要思想是调整神经网络层的激活(输出),使其均值为 0 且方差(单位方差)为 1。这种调整有助于加速权重的有效收敛,并确保训练过程的一致性和可靠性。

import torch
import torch.nn as nn

# Manual layer-normalisation walkthrough on a small random batch.
torch.manual_seed(123)
batch_example = torch.randn(2, 5)

# Linear layer followed by ReLU, applied to the batch.
layer = nn.Sequential(nn.Linear(5, 6), nn.ReLU())
out = layer(batch_example)
print(out)
"""
tensor([[0.2260, 0.3470, 0.0000, 0.2216, 0.0000, 0.0000],
        [0.2133, 0.2394, 0.0000, 0.5198, 0.3297, 0.0000]],
       grad_fn=<ReluBackward0>)
"""

# Mean over the last dimension (one value per row).
mean = out.mean(dim=-1, keepdim=True)
# Variance over the last dimension (torch default estimator).
var = out.var(dim=-1, keepdim=True)
print("Mean:", mean)
print("Variance:", var)
"""
Mean: tensor([[0.1324],
        [0.2170]], grad_fn=<MeanBackward1>)
Variance: tensor([[0.0231],
        [0.0398]], grad_fn=<VarBackward0>)
"""


# Normalise: subtract the mean and divide by the standard deviation,
# then recompute mean/variance of the normalised output.
out_norm = (out - mean) / torch.sqrt(var)
mean = out_norm.mean(dim=-1, keepdim=True)
var = out_norm.var(dim=-1, keepdim=True)
print("out_norm:", out_norm)
"""
out_norm: tensor([[ 0.6159,  1.4126, -0.8719,  0.5872, -0.8719, -0.8719],
        [-0.0189,  0.1121, -1.0876,  1.5173,  0.5647, -1.0876]],
       grad_fn=<DivBackward0>)
"""

# Turn off scientific notation so near-zero means print readably.
torch.set_printoptions(sci_mode=False)
print("mean:", mean)
print("variance:", var)
"""
mean: tensor([[    0.0000],
        [    0.0000]], grad_fn=<MeanBackward1>)
variance: tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)
"""

import torch
import torch.nn as nn

class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension with learnable scale/shift.

    Args:
        emb_dim: Length of the ``scale`` and ``shift`` parameter vectors.
    """

    def __init__(self, emb_dim):
        super().__init__()
        # Small constant added to the variance before the square root.
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        """Normalise ``x`` along its last dimension, then scale and shift."""
        mu = x.mean(dim=-1, keepdim=True)
        # Biased variance (divides by N).
        sigma2 = x.var(dim=-1, keepdim=True, unbiased=False)
        normalised = (x - mu) / torch.sqrt(sigma2 + self.eps)
        return self.scale * normalised + self.shift


# Apply the LayerNorm module to a random batch and inspect its statistics.
torch.manual_seed(123)
batch_example = torch.randn(2, 5)

ln = LayerNorm(emb_dim=5)
out_ln = ln(batch_example)
# Per-row mean and biased variance of the normalised output.
mean = out_ln.mean(dim=-1, keepdim=True)
var = out_ln.var(dim=-1, unbiased=False, keepdim=True)

# Turn off scientific notation so near-zero means print readably.
torch.set_printoptions(sci_mode=False)
print("Mean:", mean)
print("Variance:", var)
"""
Mean: tensor([[     0.0000],
        [    -0.0000]], grad_fn=<MeanBackward1>)
Variance: tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)
"""

代码清单4-3 GELU激活函数

import torch
import torch.nn as nn
import matplotlib.pyplot as plt

class GELU(nn.Module):
    """GELU activation computed with the tanh approximation."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Return 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))."""
        coeff = torch.sqrt(torch.tensor(2.0 / torch.pi))
        cubic = x + 0.044715 * torch.pow(x, 3)
        return 0.5 * x * (1 + torch.tanh(coeff * cubic))


# Plot GELU and ReLU side by side for comparison.
gelu, relu = GELU(), nn.ReLU()

x = torch.linspace(-3, 3, 100) # 100 sample points between -3 and 3
y_gelu, y_relu = gelu(x), relu(x)
plt.figure(figsize=(8,3))
# One subplot per activation; enumerate starts at 1 to match subplot indexing.
for i, (y, label) in enumerate(zip([y_gelu, y_relu], ["GELU", "ReLU"]), 1):
    plt.subplot(1, 2, i)
    plt.plot(x, y)
    plt.title(f"{label} activation function") # subplot title
    plt.xlabel("x") # x-axis label
    plt.ylabel(f"{label}(x)")
    plt.grid(True) # show grid lines
plt.tight_layout() # adjust the layout so labels do not overlap
plt.savefig('gelu_relu')
plt.show()


代码清单4-5 梯度消失&快捷连接(跳跃连接或残差连接)

梯度消失:指的是在训练过程中,梯度在反向传播时逐渐变小,导致早期网络层难以有效训练。

快捷连接:最初用于计算机视觉中的深度网络(特别是残差网络),目的是缓解梯度消失问题。通过跳过一个或多个层,为梯度在网络中的流动提供了一条可替代且更短的路径。这是通过将一层的输出添加到后续层的输出中实现的。

import torch
import torch.nn as nn

class ExampleDeepNeuralNetwork(nn.Module):
    """Five Linear+GELU layers with optional shortcut (residual) connections.

    Args:
        layer_sizes: Sequence of sizes; entries 0..5 are read, giving the
            input/output width of each of the five linear layers.
        use_shortcut: If true, add a layer's input to its output whenever
            the two shapes match.
    """

    def __init__(self, layer_sizes, use_shortcut):
        super().__init__()
        self.use_shortcut = use_shortcut
        # Layer i maps layer_sizes[i] -> layer_sizes[i + 1].
        self.layers = nn.ModuleList([
            nn.Sequential(
                nn.Linear(layer_sizes[i], layer_sizes[i + 1]),
                GELU(),
            )
            for i in range(5)
        ])

    def forward(self, x):
        """Run ``x`` through every layer, adding shortcuts where applicable."""
        for layer in self.layers:
            out = layer(x)
            # A shortcut is only possible when input and output shapes agree.
            can_add = self.use_shortcut and x.shape == out.shape
            x = x + out if can_add else out
        return x


def print_gradients(model, x):
    """Print the mean absolute gradient of every weight parameter of ``model``.

    Runs a forward pass on ``x``, computes the MSE loss against a target of
    ``[[0.]]``, backpropagates, and prints one line per parameter whose name
    contains ``'weight'``.
    """
    prediction = model(x)
    # The target is fixed at zero to keep the example simple.
    zero_target = torch.tensor([[0.]])

    criterion = nn.MSELoss()
    criterion(prediction, zero_target).backward()

    weights = (
        (pname, p) for pname, p in model.named_parameters()
        if 'weight' in pname
    )
    for name, param in weights:
        print(f"{name} has gradient mean of {param.grad.abs().mean().item()}")


# Five layers: four 3->3 layers followed by one 3->1 layer.
layer_sizes = [3, 3, 3, 3, 3, 1]
sample_input = torch.tensor([[1., 0., -1.]])
torch.manual_seed(123) # fix the seed used for weight initialisation so results are reproducible
# NOTE(review): the variable is named `model_without_shortcut` but the model
# is built with use_shortcut=True — confirm which configuration is intended.
model_without_shortcut = ExampleDeepNeuralNetwork(
        layer_sizes, use_shortcut=True
)
print(model_without_shortcut)
print_gradients(model_without_shortcut, sample_input)

"""
ExampleDeepNeuralNetwork(
  (layers): ModuleList(
    (0-3): 4 x Sequential(
      (0): Linear(in_features=3, out_features=3, bias=True)
      (1): GELU()
    )
    (4): Sequential(
      (0): Linear(in_features=3, out_features=1, bias=True)
      (1): GELU()
    )
  )
)


use_shortcut=False
layers.0.0.weight has gradient mean of 0.00020173584925942123
layers.1.0.weight has gradient mean of 0.00012011159560643137
layers.2.0.weight has gradient mean of 0.0007152040489017963
layers.3.0.weight has gradient mean of 0.0013988736318424344
layers.4.0.weight has gradient mean of 0.005049645435065031
梯度在从最后一层(layers.4)到第 1 层(layers.0) 的过程中逐渐变小,这种现象称为梯度消失问题。

use_shortcut=True
layers.0.0.weight has gradient mean of 0.22169792652130127
layers.1.0.weight has gradient mean of 0.20694108307361603
layers.2.0.weight has gradient mean of 0.3289699852466583
layers.3.0.weight has gradient mean of 0.2665732204914093
layers.4.0.weight has gradient mean of 1.3258541822433472
最后一层(layers.4)的梯度仍然大于其他层。然而,梯度值在逐渐接近第 1 层(layers.0) 时趋于稳定,并且没有缩小到几乎消失的程度。
"""

代码清单4-6 Transformer block

Transformer 块的核心思想是,自注意力机制在多头注意力块中用于识别和分析输入序列中元素之间的关系。前馈神经网络则在每个位置上对数据进行单独的修改。

输出是一个包含整个输入序列信息的上下文向量。这意味着,虽然序列的物理维度(长度和特征尺寸)在通过 Transformer 块时保持不变,但每个输出向量的内容都要重新编码,以整合来自整个输入序列的上下文信息。

import torch
import torch.nn as nn


class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention.

    The input is projected to ``d_out`` features for queries, keys and
    values, split into ``num_heads`` heads of ``d_out // num_heads``
    features, attended under an upper-triangular (causal) mask, merged
    back to ``d_out`` features and passed through ``out_proj``.

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Total number of output features across all heads.
        context_length: Side length of the square causal mask buffer.
        dropout: Dropout probability applied to the attention weights.
        num_heads: Number of heads; must divide ``d_out`` evenly.
        qkv_bias: Whether the query/key/value projections use a bias.
    """

    def __init__(self, d_in, d_out,
            context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert (d_out % num_heads == 0), \
                "d_out must be divisible by num_heads"
        self.d_out = d_out
        self.num_heads = num_heads
        # Each head works on an equal slice of the d_out projected features.
        self.head_dim = d_out // num_heads
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        # Linear layer that combines the concatenated head outputs.
        self.out_proj = nn.Linear(d_out, d_out)
        self.dropout = nn.Dropout(dropout)
        # Ones strictly above the diagonal mark the positions to hide;
        # stored as a buffer rather than a parameter.
        self.register_buffer(
                "mask",
                torch.triu(torch.ones(context_length, context_length),
                    diagonal=1)
        )

    def forward(self, x):
        """Map ``x`` of shape (b, num_tokens, d_in) to (b, num_tokens, d_out)."""
        b, num_tokens, _ = x.shape

        # Projections, each of shape (b, num_tokens, d_out).
        keys = self.W_key(x)
        queries = self.W_query(x)
        values = self.W_value(x)

        # Unroll the last dimension into heads:
        # (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim)
        head_shape = (b, num_tokens, self.num_heads, self.head_dim)
        keys = keys.view(*head_shape)
        values = values.view(*head_shape)
        queries = queries.view(*head_shape)

        # Move the head axis forward:
        # (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim)
        keys = keys.transpose(1, 2)
        values = values.transpose(1, 2)
        queries = queries.transpose(1, 2)

        # Per-head dot products: (b, num_heads, num_tokens, num_tokens).
        attn_scores = queries @ keys.transpose(2, 3)

        # Slice the mask to the current sequence length BEFORE casting, so
        # only a (num_tokens, num_tokens) window is converted per call
        # instead of the full (context_length, context_length) buffer.
        mask_bool = self.mask[:num_tokens, :num_tokens].bool()
        attn_scores.masked_fill_(mask_bool, -torch.inf)

        # Scale by sqrt(head_dim) before the softmax over the key axis.
        attn_weights = torch.softmax(
                attn_scores / keys.shape[-1]**0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        # (b, num_heads, num_tokens, head_dim) -> (b, num_tokens, num_heads, head_dim)
        context_vec = (attn_weights @ values).transpose(1, 2)
        # Merge the heads; d_out == num_heads * head_dim.
        context_vec = context_vec.contiguous().view(
                b, num_tokens, self.d_out
        )
        # Final linear projection over the merged heads.
        context_vec = self.out_proj(context_vec)
        return context_vec


class GELU(nn.Module):
    """GELU activation computed with the tanh approximation."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Return 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))."""
        coeff = torch.sqrt(torch.tensor(2.0 / torch.pi))
        cubic = x + 0.044715 * torch.pow(x, 3)
        return 0.5 * x * (1 + torch.tanh(coeff * cubic))


class FeedForward(nn.Module):
    """Position-wise feed-forward network: Linear -> GELU -> Linear.

    Args:
        cfg: Mapping providing ``"emb_dim"``; the hidden layer is four
            times that width.
    """

    def __init__(self, cfg):
        super().__init__()
        width = cfg["emb_dim"]
        hidden = 4 * width
        expand = nn.Linear(width, hidden)
        project = nn.Linear(hidden, width)
        self.layers = nn.Sequential(expand, GELU(), project)

    def forward(self, x):
        """Apply the expand / activate / project stack to ``x``."""
        return self.layers(x)


class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension with learnable scale/shift.

    Args:
        emb_dim: Length of the ``scale`` and ``shift`` parameter vectors.
    """

    def __init__(self, emb_dim):
        super().__init__()
        # Small constant added to the variance before the square root.
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        """Normalise ``x`` along its last dimension, then scale and shift."""
        mu = x.mean(dim=-1, keepdim=True)
        # Biased variance (divides by N).
        sigma2 = x.var(dim=-1, keepdim=True, unbiased=False)
        normalised = (x - mu) / torch.sqrt(sigma2 + self.eps)
        return self.scale * normalised + self.shift


class TransformerBlock(nn.Module):
    """Pre-LayerNorm transformer block: attention and feed-forward sub-layers.

    Each sub-layer is applied as ``x + dropout(sublayer(norm(x)))``.

    Args:
        cfg: Mapping providing ``"emb_dim"``, ``"context_length"``,
            ``"n_heads"``, ``"drop_rate"`` and ``"qkv_bias"``.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        self.att = MultiHeadAttention(
            d_in=emb_dim,
            d_out=emb_dim,
            context_length=cfg["context_length"],
            num_heads=cfg["n_heads"],
            dropout=cfg["drop_rate"],
            qkv_bias=cfg["qkv_bias"],
        )
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(emb_dim)
        self.norm2 = LayerNorm(emb_dim)
        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        """Apply both residual sub-layers to ``x`` and return the result."""
        # Attention sub-layer: normalise first (Pre-LayerNorm), apply dropout
        # after the component, then add the block input back via the shortcut.
        attn_branch = self.drop_shortcut(self.att(self.norm1(x)))
        x = attn_branch + x

        # Feed-forward sub-layer follows the same norm -> component ->
        # dropout -> shortcut pattern.
        ff_branch = self.drop_shortcut(self.ff(self.norm2(x)))
        return ff_branch + x


# Configuration dictionary read by the model classes in this listing.
GPT_CONFIG_124M = {
    "vocab_size": 50257,
    "context_length": 1024,
    "emb_dim": 768,
    "n_heads": 12,
    "n_layers": 12,
    "drop_rate": 0.1,
    "qkv_bias": False
}

# Seed before creating the input and the block so the run is reproducible.
torch.manual_seed(123)
x = torch.rand(2, 4, 768)
block = TransformerBlock(GPT_CONFIG_124M)
output = block(x)

# The block is expected to preserve the input shape.
print("Input:", x.shape)
print("Output:", output.shape)
"""
Input: torch.Size([2, 4, 768])
Output: torch.Size([2, 4, 768])
"""

代码清单4-7 实现GPT模型

import torch
import torch.nn as nn
import tiktoken

class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention.

    The input is projected to ``d_out`` features for queries, keys and
    values, split into ``num_heads`` heads of ``d_out // num_heads``
    features, attended under an upper-triangular (causal) mask, merged
    back to ``d_out`` features and passed through ``out_proj``.

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Total number of output features across all heads.
        context_length: Side length of the square causal mask buffer.
        dropout: Dropout probability applied to the attention weights.
        num_heads: Number of heads; must divide ``d_out`` evenly.
        qkv_bias: Whether the query/key/value projections use a bias.
    """

    def __init__(self, d_in, d_out,
            context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert (d_out % num_heads == 0), \
                "d_out must be divisible by num_heads"
        self.d_out = d_out
        self.num_heads = num_heads
        # Each head works on an equal slice of the d_out projected features.
        self.head_dim = d_out // num_heads
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        # Linear layer that combines the concatenated head outputs.
        self.out_proj = nn.Linear(d_out, d_out)
        self.dropout = nn.Dropout(dropout)
        # Ones strictly above the diagonal mark the positions to hide;
        # stored as a buffer rather than a parameter.
        self.register_buffer(
                "mask",
                torch.triu(torch.ones(context_length, context_length),
                    diagonal=1)
        )

    def forward(self, x):
        """Map ``x`` of shape (b, num_tokens, d_in) to (b, num_tokens, d_out)."""
        b, num_tokens, _ = x.shape

        # Projections, each of shape (b, num_tokens, d_out).
        keys = self.W_key(x)
        queries = self.W_query(x)
        values = self.W_value(x)

        # Unroll the last dimension into heads:
        # (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim)
        head_shape = (b, num_tokens, self.num_heads, self.head_dim)
        keys = keys.view(*head_shape)
        values = values.view(*head_shape)
        queries = queries.view(*head_shape)

        # Move the head axis forward:
        # (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim)
        keys = keys.transpose(1, 2)
        values = values.transpose(1, 2)
        queries = queries.transpose(1, 2)

        # Per-head dot products: (b, num_heads, num_tokens, num_tokens).
        attn_scores = queries @ keys.transpose(2, 3)

        # Slice the mask to the current sequence length BEFORE casting, so
        # only a (num_tokens, num_tokens) window is converted per call
        # instead of the full (context_length, context_length) buffer.
        mask_bool = self.mask[:num_tokens, :num_tokens].bool()
        attn_scores.masked_fill_(mask_bool, -torch.inf)

        # Scale by sqrt(head_dim) before the softmax over the key axis.
        attn_weights = torch.softmax(
                attn_scores / keys.shape[-1]**0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        # (b, num_heads, num_tokens, head_dim) -> (b, num_tokens, num_heads, head_dim)
        context_vec = (attn_weights @ values).transpose(1, 2)
        # Merge the heads; d_out == num_heads * head_dim.
        context_vec = context_vec.contiguous().view(
                b, num_tokens, self.d_out
        )
        # Final linear projection over the merged heads.
        context_vec = self.out_proj(context_vec)
        return context_vec


class GELU(nn.Module):
    """GELU activation computed with the tanh approximation."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Return 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))."""
        coeff = torch.sqrt(torch.tensor(2.0 / torch.pi))
        cubic = x + 0.044715 * torch.pow(x, 3)
        return 0.5 * x * (1 + torch.tanh(coeff * cubic))


class FeedForward(nn.Module):
    """Position-wise feed-forward network: Linear -> GELU -> Linear.

    Args:
        cfg: Mapping providing ``"emb_dim"``; the hidden layer is four
            times that width.
    """

    def __init__(self, cfg):
        super().__init__()
        width = cfg["emb_dim"]
        hidden = 4 * width
        expand = nn.Linear(width, hidden)
        project = nn.Linear(hidden, width)
        self.layers = nn.Sequential(expand, GELU(), project)

    def forward(self, x):
        """Apply the expand / activate / project stack to ``x``."""
        return self.layers(x)


class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension with learnable scale/shift.

    Args:
        emb_dim: Length of the ``scale`` and ``shift`` parameter vectors.
    """

    def __init__(self, emb_dim):
        super().__init__()
        # Small constant added to the variance before the square root.
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        """Normalise ``x`` along its last dimension, then scale and shift."""
        mu = x.mean(dim=-1, keepdim=True)
        # Biased variance (divides by N).
        sigma2 = x.var(dim=-1, keepdim=True, unbiased=False)
        normalised = (x - mu) / torch.sqrt(sigma2 + self.eps)
        return self.scale * normalised + self.shift


class TransformerBlock(nn.Module):
    """Pre-LayerNorm transformer block: attention and feed-forward sub-layers.

    Each sub-layer is applied as ``x + dropout(sublayer(norm(x)))``.

    Args:
        cfg: Mapping providing ``"emb_dim"``, ``"context_length"``,
            ``"n_heads"``, ``"drop_rate"`` and ``"qkv_bias"``.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        self.att = MultiHeadAttention(
            d_in=emb_dim,
            d_out=emb_dim,
            context_length=cfg["context_length"],
            num_heads=cfg["n_heads"],
            dropout=cfg["drop_rate"],
            qkv_bias=cfg["qkv_bias"],
        )
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(emb_dim)
        self.norm2 = LayerNorm(emb_dim)
        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        """Apply both residual sub-layers to ``x`` and return the result."""
        # Attention sub-layer: normalise first (Pre-LayerNorm), apply dropout
        # after the component, then add the block input back via the shortcut.
        attn_branch = self.drop_shortcut(self.att(self.norm1(x)))
        x = attn_branch + x

        # Feed-forward sub-layer follows the same norm -> component ->
        # dropout -> shortcut pattern.
        ff_branch = self.drop_shortcut(self.ff(self.norm2(x)))
        return ff_branch + x

class GPTModel(nn.Module):
    """GPT-style decoder: embeddings, a stack of transformer blocks, output head.

    Args:
        cfg: Mapping providing ``"vocab_size"``, ``"context_length"``,
            ``"emb_dim"``, ``"drop_rate"`` and ``"n_layers"`` (plus whatever
            ``TransformerBlock`` reads from it).
    """

    def __init__(self, cfg):
        super().__init__()
        vocab_size, emb_dim = cfg["vocab_size"], cfg["emb_dim"]
        # Token and position embeddings turn token indices into dense
        # vectors and add positional information.
        self.tok_emb = nn.Embedding(vocab_size, emb_dim)
        self.pos_emb = nn.Embedding(cfg["context_length"], emb_dim)
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        blocks = [TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        self.trf_blocks = nn.Sequential(*blocks)

        # Final normalisation applied to the transformer stack's output.
        self.final_norm = LayerNorm(emb_dim)
        # Bias-free linear head producing one logit per vocabulary entry.
        self.out_head = nn.Linear(emb_dim, vocab_size, bias=False)

    def forward(self, in_idx):
        """Return logits of shape (batch, seq_len, vocab_size) for token ids ``in_idx``."""
        _, seq_len = in_idx.shape
        positions = torch.arange(seq_len, device=in_idx.device)
        # Position embeddings broadcast over the batch dimension.
        x = self.tok_emb(in_idx) + self.pos_emb(positions)
        hidden = self.trf_blocks(self.drop_emb(x))
        # The logits are the unnormalised scores for the next token.
        return self.out_head(self.final_norm(hidden))
        

# Configuration dictionary read by the model classes in this listing.
GPT_CONFIG_124M = {
    "vocab_size": 50257,
    "context_length": 1024,
    "emb_dim": 768,
    "n_heads": 12,
    "n_layers": 12,
    "drop_rate": 0.1,
    "qkv_bias": False
}


# Encode two texts with the GPT-2 tokenizer and stack them into one batch.
tokenizer = tiktoken.get_encoding("gpt2")
batch = []
txt1 = "Every effort moves you"
txt2 = "Every day holds a"
batch.append(torch.tensor(tokenizer.encode(txt1)))
batch.append(torch.tensor(tokenizer.encode(txt2)))
batch = torch.stack(batch, dim=0)

# Seed before building the model so the weight initialisation is reproducible.
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
out = model(batch)
print("Input:", batch)
print("Output:", out.shape, out)


# print(model)
# Total number of elements across all parameter tensors of the model.
total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters: {total_params:,}")
# Weight tying: compare the token-embedding and output-head weight shapes.
print("Token embedding layer shape:", model.tok_emb.weight.shape)
print("Output layer shape:", model.out_head.weight.shape)
# Parameter count with the output head's parameters subtracted.
total_params_gpt2 = (
    total_params - sum(p.numel() 
    for p in model.out_head.parameters())
)
print(f"Number of trainable parameters "
      f"considering weight tying: {total_params_gpt2:,}")
# Memory needed to store the parameters.
total_size_bytes = total_params * 4 # assumes each parameter is a 4-byte 32-bit float
total_size_mb = total_size_bytes / 1024 / 1024
print(f"Total size of the model: {total_size_mb:.2f} MB")

"""
Input: tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])
Output: torch.Size([2, 4, 50257]) tensor([[[ 0.3613,  0.4222, -0.0711,  ...,  0.3483,  0.4661, -0.2838],
         [-0.1792, -0.5660, -0.9485,  ...,  0.0477,  0.5181, -0.3168],
         [ 0.7120,  0.0332,  0.1085,  ...,  0.1018, -0.4327, -0.2553],
         [-1.0076,  0.3418, -0.1190,  ...,  0.7195,  0.4023,  0.0532]],

        [[-0.2564,  0.0900,  0.0335,  ...,  0.2659,  0.4454, -0.6806],
         [ 0.1230,  0.3653, -0.2074,  ...,  0.7705,  0.2710,  0.2246],
         [ 1.0558,  1.0318, -0.2800,  ...,  0.6936,  0.3205, -0.3178],
         [-0.1565,  0.3926,  0.3288,  ...,  1.2630, -0.1858,  0.0388]]],
       grad_fn=<UnsafeViewBackward0>)
Total number of parameters: 163,009,536
Token embedding layer shape: torch.Size([50257, 768])
Output layer shape: torch.Size([50257, 768])
Number of trainable parameters considering weight tying: 124,412,160
Total size of the model: 621.83 MB
"""

代码清单4-8 生成文本

  • 层归一化:可以确保每个层的输出具有一致的均值和方差,从而稳定训练过程。
    • 减去均值,再除以方差的平方根(也就是标准差);
    • 归一化后的层输出包含负值,其均值为 0,方差为 1
    • 现代大语言模型中常用的一个LayerNorm变体是RMSNorm(Root Mean Square Layer Normalization),因为它的计算效率更高。RMSNorm通过仅使用输入的均方根进行归一化简化了归一化过程,无须在平方之前减去均值。这意味着在计算放缩之前,它不会对数据进行中心化操作。
  • 快捷连接(跳跃连接/残差连接):是通过将一层的输出直接传递到更深层来跳过一个或多个层的连接,它能帮助缓解在训练深度神经网络(如大语言模型)时遇到的梯度消失问题。
  • Transformer块:融合了掩码多头注意力模块和使用GELU激活函数的全连接前馈神经网络
    • 高斯误差线性单元GELU激活函数:结合了经典的ReLU激活函数与正态分布的累积分布函数的特性,能够有效建模层输出,在深度学习模型中实现随机正则化和非线性。
  • GPT模型:是具有许多重复Transformer块的大语言模型,这些Transformer块有数百万到数十亿个参数。
import torch
import torch.nn as nn
import tiktoken

class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention.

    The input is projected to ``d_out`` features for queries, keys and
    values, split into ``num_heads`` heads of ``d_out // num_heads``
    features, attended under an upper-triangular (causal) mask, merged
    back to ``d_out`` features and passed through ``out_proj``.

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Total number of output features across all heads.
        context_length: Side length of the square causal mask buffer.
        dropout: Dropout probability applied to the attention weights.
        num_heads: Number of heads; must divide ``d_out`` evenly.
        qkv_bias: Whether the query/key/value projections use a bias.
    """

    def __init__(self, d_in, d_out,
            context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert (d_out % num_heads == 0), \
                "d_out must be divisible by num_heads"
        self.d_out = d_out
        self.num_heads = num_heads
        # Each head works on an equal slice of the d_out projected features.
        self.head_dim = d_out // num_heads
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        # Linear layer that combines the concatenated head outputs.
        self.out_proj = nn.Linear(d_out, d_out)
        self.dropout = nn.Dropout(dropout)
        # Ones strictly above the diagonal mark the positions to hide;
        # stored as a buffer rather than a parameter.
        self.register_buffer(
                "mask",
                torch.triu(torch.ones(context_length, context_length),
                    diagonal=1)
        )

    def forward(self, x):
        """Map ``x`` of shape (b, num_tokens, d_in) to (b, num_tokens, d_out)."""
        b, num_tokens, _ = x.shape

        # Projections, each of shape (b, num_tokens, d_out).
        keys = self.W_key(x)
        queries = self.W_query(x)
        values = self.W_value(x)

        # Unroll the last dimension into heads:
        # (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim)
        head_shape = (b, num_tokens, self.num_heads, self.head_dim)
        keys = keys.view(*head_shape)
        values = values.view(*head_shape)
        queries = queries.view(*head_shape)

        # Move the head axis forward:
        # (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim)
        keys = keys.transpose(1, 2)
        values = values.transpose(1, 2)
        queries = queries.transpose(1, 2)

        # Per-head dot products: (b, num_heads, num_tokens, num_tokens).
        attn_scores = queries @ keys.transpose(2, 3)

        # Slice the mask to the current sequence length BEFORE casting, so
        # only a (num_tokens, num_tokens) window is converted per call
        # instead of the full (context_length, context_length) buffer.
        mask_bool = self.mask[:num_tokens, :num_tokens].bool()
        attn_scores.masked_fill_(mask_bool, -torch.inf)

        # Scale by sqrt(head_dim) before the softmax over the key axis.
        attn_weights = torch.softmax(
                attn_scores / keys.shape[-1]**0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        # (b, num_heads, num_tokens, head_dim) -> (b, num_tokens, num_heads, head_dim)
        context_vec = (attn_weights @ values).transpose(1, 2)
        # Merge the heads; d_out == num_heads * head_dim.
        context_vec = context_vec.contiguous().view(
                b, num_tokens, self.d_out
        )
        # Final linear projection over the merged heads.
        context_vec = self.out_proj(context_vec)
        return context_vec


class GELU(nn.Module):
    """GELU activation computed with the tanh approximation."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Return 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))."""
        coeff = torch.sqrt(torch.tensor(2.0 / torch.pi))
        cubic = x + 0.044715 * torch.pow(x, 3)
        return 0.5 * x * (1 + torch.tanh(coeff * cubic))


class FeedForward(nn.Module):
    """Position-wise feed-forward network: Linear -> GELU -> Linear.

    Args:
        cfg: Mapping providing ``"emb_dim"``; the hidden layer is four
            times that width.
    """

    def __init__(self, cfg):
        super().__init__()
        width = cfg["emb_dim"]
        hidden = 4 * width
        expand = nn.Linear(width, hidden)
        project = nn.Linear(hidden, width)
        self.layers = nn.Sequential(expand, GELU(), project)

    def forward(self, x):
        """Apply the expand / activate / project stack to ``x``."""
        return self.layers(x)


class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension with learnable scale/shift.

    Args:
        emb_dim: Length of the ``scale`` and ``shift`` parameter vectors.
    """

    def __init__(self, emb_dim):
        super().__init__()
        # Small constant added to the variance before the square root.
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        """Normalise ``x`` along its last dimension, then scale and shift."""
        mu = x.mean(dim=-1, keepdim=True)
        # Biased variance (divides by N).
        sigma2 = x.var(dim=-1, keepdim=True, unbiased=False)
        normalised = (x - mu) / torch.sqrt(sigma2 + self.eps)
        return self.scale * normalised + self.shift


class TransformerBlock(nn.Module):
    """Pre-LayerNorm transformer block: attention and feed-forward sub-layers.

    Each sub-layer is applied as ``x + dropout(sublayer(norm(x)))``.

    Args:
        cfg: Mapping providing ``"emb_dim"``, ``"context_length"``,
            ``"n_heads"``, ``"drop_rate"`` and ``"qkv_bias"``.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        self.att = MultiHeadAttention(
            d_in=emb_dim,
            d_out=emb_dim,
            context_length=cfg["context_length"],
            num_heads=cfg["n_heads"],
            dropout=cfg["drop_rate"],
            qkv_bias=cfg["qkv_bias"],
        )
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(emb_dim)
        self.norm2 = LayerNorm(emb_dim)
        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        """Apply both residual sub-layers to ``x`` and return the result."""
        # Attention sub-layer: normalise first (Pre-LayerNorm), apply dropout
        # after the component, then add the block input back via the shortcut.
        attn_branch = self.drop_shortcut(self.att(self.norm1(x)))
        x = attn_branch + x

        # Feed-forward sub-layer follows the same norm -> component ->
        # dropout -> shortcut pattern.
        ff_branch = self.drop_shortcut(self.ff(self.norm2(x)))
        return ff_branch + x

class GPTModel(nn.Module):
    """GPT-style decoder: embeddings, a stack of transformer blocks, output head.

    Args:
        cfg: Mapping providing ``"vocab_size"``, ``"context_length"``,
            ``"emb_dim"``, ``"drop_rate"`` and ``"n_layers"`` (plus whatever
            ``TransformerBlock`` reads from it).
    """

    def __init__(self, cfg):
        super().__init__()
        vocab_size, emb_dim = cfg["vocab_size"], cfg["emb_dim"]
        # Token and position embeddings turn token indices into dense
        # vectors and add positional information.
        self.tok_emb = nn.Embedding(vocab_size, emb_dim)
        self.pos_emb = nn.Embedding(cfg["context_length"], emb_dim)
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        blocks = [TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        self.trf_blocks = nn.Sequential(*blocks)

        # Final normalisation applied to the transformer stack's output.
        self.final_norm = LayerNorm(emb_dim)
        # Bias-free linear head producing one logit per vocabulary entry.
        self.out_head = nn.Linear(emb_dim, vocab_size, bias=False)

    def forward(self, in_idx):
        """Return logits of shape (batch, seq_len, vocab_size) for token ids ``in_idx``."""
        _, seq_len = in_idx.shape
        positions = torch.arange(seq_len, device=in_idx.device)
        # Position embeddings broadcast over the batch dimension.
        x = self.tok_emb(in_idx) + self.pos_emb(positions)
        hidden = self.trf_blocks(self.drop_emb(x))
        # The logits are the unnormalised scores for the next token.
        return self.out_head(self.final_norm(hidden))
        
# 入参 idx 是当前文本的索引数组,其形状为(batch, n_tokens)
def generate_text_simple(model, idx, 
            max_new_tokens, context_size):
    """Greedily extend ``idx`` by ``max_new_tokens`` token ids.

    Args:
        model: Callable mapping a (batch, n_tokens) index tensor to logits
            of shape (batch, n_tokens, vocab_size).
        idx: Index tensor of shape (batch, n_tokens) holding the current text.
        max_new_tokens: Number of tokens to append.
        context_size: Maximum number of trailing tokens fed to ``model``.

    Returns:
        Tensor of shape (batch, n_tokens + max_new_tokens).
    """
    sequence = idx
    for _ in range(max_new_tokens):
        # Keep only the last `context_size` tokens, e.g. with a window of 5
        # and 10 tokens so far, only the final 5 are used as input.
        window = sequence[:, -context_size:]
        with torch.no_grad():
            all_logits = model(window)
        # Only the last position matters:
        # (batch, n_tokens, vocab_size) -> (batch, vocab_size)
        last_logits = all_logits[:, -1, :]
        next_probs = torch.softmax(last_logits, dim=-1)
        # Most probable token per row, kept as shape (batch, 1).
        next_id = torch.argmax(next_probs, dim=-1, keepdim=True)
        # Append it, growing the sequence to (batch, n_tokens + 1).
        sequence = torch.cat((sequence, next_id), dim=1)
    return sequence


# Configuration dictionary read by the model classes in this listing.
GPT_CONFIG_124M = {
    "vocab_size": 50257,
    "context_length": 1024,
    "emb_dim": 768,
    "n_heads": 12,
    "n_layers": 12,
    "drop_rate": 0.1,
    "qkv_bias": False
}


# Encode the prompt with the GPT-2 tokenizer.
start_context = "Hello, I am"
tokenizer = tiktoken.get_encoding("gpt2")
encoded = tokenizer.encode(start_context)
print("encoded:", encoded)
encoded_tensor = torch.tensor(encoded).unsqueeze(0) # add the batch dimension
print("encoded_tensor:", encoded_tensor.shape, encoded_tensor)

# Seed before building the model so the weight initialisation is reproducible.
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)

model.eval() # switch to .eval() mode to disable training-only random components such as dropout
out = generate_text_simple(
        model=model,
        idx=encoded_tensor,
        max_new_tokens=6,
        context_size=GPT_CONFIG_124M["context_length"]
)
print("Output:", len(out[0]), out)
# Drop the batch dimension and decode the ids back into text.
decoded_text = tokenizer.decode(out.squeeze(0).tolist())
print(decoded_text)

"""
encoded: [15496, 11, 314, 716]
encoded_tensor: torch.Size([1, 4]) tensor([[15496,    11,   314,   716]])
Output: 10 tensor([[15496,    11,   314,   716, 27018, 24086, 47843, 30961, 42348,  7267]])
Hello, I am Featureiman Byeswickattribute argue
"""

章节5.1.2 负平均对数概率&交叉熵损失

训练大语言模型的目标是最大化正确词元的可能性,这涉及增大其相对于其他词元的概率。通过这种方式,可以确保大语言模型始终选择目标词元(实质上是句子中的下一个单词)作为它生成的下一个词元。

反向传播

如何最大化与目标词元对应的 softmax 概率值呢?大致思路是,更新模型权重,以便模型为我们想要生成的相应词元 ID 输出更高的值。权重更新是通过一种称为反向传播的过程完成的,这是训练深度神经网络的标准技术。

反向传播需要一个损失函数,它会计算模型的预测输出(在这里是与目标词元 ID 对应的概率)与实际期望输出之间的差异。这个损失函数衡量的是模型的预测与目标值之间的偏差。

交叉熵损失

在机器学习和深度学习中,交叉熵损失是一种常用的度量方式,用于衡量两个概率分布之间的差异——通常是标签(在这里是数据集中的词元)的真实分布和模型生成的预测分布(例如,由大语言模型生成的词元概率)之间的差异。

在机器学习的背景下,特别是在像 PyTorch 这样的框架中,交叉熵函数可以对离散的结果进行度量,类似于给定模型生成的词元概率时目标词元的负平均对数概率。因此,在实践中,“交叉熵”和“负平均对数概率”这两个术语是相关的,且经常可以互换使用。

困惑度

困惑度通常与交叉熵损失一起用来评估模型在诸如语言建模等任务中的性能。它可以提供一种更易解释的方式来理解模型在预测序列中的下一个词元时的不确定性。

困惑度可以衡量模型预测的概率分布与数据集中实际词汇分布的匹配程度。与损失类似,较低的困惑度表明模型的预测更接近实际分布。

困惑度可以通过 perplexity = torch.exp(loss)计算得出,在先前计算的损失上应用该公式会得到 tensor(48725.8203)。

困惑度通常被认为比原始损失值更易于解释,因为它表示模型在每一步中对于有效词汇量的不确定性。在给定的示例中,这意味着模型不确定在词汇表的 48 725 个词元中应该生成哪个来作为下一个词元。

# Seed the RNG before building the model so the initial weights are reproducible.
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
model.eval() # Switch to eval mode: disables random, training-only components such as dropout
tokenizer = tiktoken.get_encoding("gpt2")

# Two example sequences; each target row is the input row shifted left by one
# token with the next token appended.
inputs = torch.tensor([[16833, 3626, 6100],   # ["every effort moves",
                       [40,    1107, 588]])   #  "I really like"]
targets = torch.tensor([[3626, 6100, 345 ],   # [" effort moves you",
                        [1107, 588, 11311]])  # " really like chocolate"])

# 1. Logits (no gradient tracking needed for this inspection)
with torch.no_grad():
    logits = model(inputs)
# 2. Probabilities over the last (vocabulary) dimension
probas = torch.softmax(logits, dim=-1)
print(probas.shape)
# Most likely token ID at every position
token_ids = torch.argmax(probas, dim=-1, keepdim=True)
#print("Token IDs:", token_ids)
print(f"Targets batch 1: {token_ids_to_text(targets[0], tokenizer)}")
print(f"Outputs batch 1: {token_ids_to_text(token_ids[0].flatten(), tokenizer)}")

# 3. Probability the model assigns to each target token
text_idx = 0
target_probas_1 = probas[text_idx, [0,1,2], targets[text_idx]]
print("Text 1:", target_probas_1)

text_idx = 1
target_probas_2 = probas[text_idx, [0,1,2], targets[text_idx]]
print("Text 2:", target_probas_2)

# 4. Log probabilities of all six target tokens
log_probas = torch.log(torch.cat((target_probas_1, target_probas_2)))
print(log_probas)
# 5. Average log probability
avg_log_probas = torch.mean(log_probas)
print(avg_log_probas)
# 6. Negative average log probability
"""
负平均对数概率就是需要计算的损失。
目标是通过在训练过程中更新模型的权重,使平均对数概率尽可能接近 0。
然而,在深度 学习中,通常的做法不是将平均对数概率升至 0,而是将负平均对数概率降至 0。
"""
# Flip the sign of the average log probability to obtain the loss value.
net_avg_log_probas = avg_log_probas * -1
print(net_avg_log_probas)

# Cross-entropy loss: the same quantity computed by PyTorch in one call.
# Merge the batch and token dimensions so every row of logits pairs with
# one target ID.
logits_flat = logits.flatten(0, 1)
targets_flat = targets.flatten()
print("Flattened logits:", logits_flat.shape)
print("Flattened targets:", targets_flat.shape)
loss = torch.nn.functional.cross_entropy(logits_flat, targets_flat)
print(loss)

"""
tensor([[16833,  3626,  6100,   345]])
tensor([[   40,  1107,   588, 11311]])
torch.Size([2, 3, 50257])

Targets batch 1:  effort moves you
Outputs batch 1:  Armed heNetflix
Text 1: tensor([7.4540e-05, 3.1061e-05, 1.1563e-05])
Text 2: tensor([1.0337e-05, 5.6776e-05, 4.7559e-06])
tensor([ -9.5042, -10.3796, -11.3677, -11.4798,  -9.7764, -12.2561])
tensor(-10.7940)
tensor(10.7940)
Flattened logits: torch.Size([6, 50257])
Flattened targets: torch.Size([6])
tensor(10.7940)
"""

代码清单5-2 计算训练集和验证集的损失

import torch
import torch.nn as nn
import tiktoken
from torch.utils.data import Dataset, DataLoader

class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention.

    Queries, keys and values are each produced by one linear layer of width
    ``d_out`` and then viewed as ``num_heads`` heads of ``head_dim`` features.

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Combined output width of all heads; must be divisible by
            ``num_heads``.
        context_length: Side length of the causal mask built at construction.
        dropout: Dropout probability applied to the attention weights.
        num_heads: Number of attention heads.
        qkv_bias: Whether the query/key/value projections have a bias.
    """

    def __init__(self, d_in, d_out,
                 context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert (d_out % num_heads == 0), \
            "d_out must be divisible by num_heads"
        self.d_out = d_out
        self.num_heads = num_heads
        # Every head gets an equal slice of the projected features.
        self.head_dim = d_out // num_heads

        # Layers are created in the order query, key, value, output so that a
        # seeded construction always draws the same initial weights.
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        # Linear layer that mixes the concatenated head outputs.
        self.out_proj = nn.Linear(d_out, d_out)
        self.dropout = nn.Dropout(dropout)

        # Ones strictly above the diagonal mark the future positions.
        causal_mask = torch.triu(
            torch.ones(context_length, context_length), diagonal=1
        )
        self.register_buffer("mask", causal_mask)

    def _split_heads(self, projected, batch, seq_len):
        """Reshape (batch, seq_len, d_out) to (batch, num_heads, seq_len, head_dim)."""
        per_head = projected.view(batch, seq_len, self.num_heads, self.head_dim)
        return per_head.transpose(1, 2)

    def forward(self, x):
        """Return context vectors of shape (batch, seq_len, d_out) for a 3-D input."""
        batch, seq_len, _ = x.shape

        keys = self._split_heads(self.W_key(x), batch, seq_len)
        queries = self._split_heads(self.W_query(x), batch, seq_len)
        values = self._split_heads(self.W_value(x), batch, seq_len)

        # Per-head dot products: (batch, num_heads, seq_len, seq_len).
        scores = queries @ keys.transpose(2, 3)
        # Crop the stored mask to the current sequence length and blank out
        # the future positions in place.
        future = self.mask.bool()[:seq_len, :seq_len]
        scores.masked_fill_(future, -torch.inf)

        # Scale by sqrt(head_dim) before normalising.
        scale = keys.shape[-1] ** 0.5
        weights = self.dropout(torch.softmax(scores / scale, dim=-1))

        # Back to (batch, seq_len, num_heads, head_dim), then merge the heads
        # (d_out == num_heads * head_dim).
        per_head_ctx = (weights @ values).transpose(1, 2)
        merged = per_head_ctx.contiguous().view(batch, seq_len, self.d_out)
        return self.out_proj(merged)


class GELU(nn.Module):
    """GELU activation evaluated with the tanh approximation."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Apply 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))) element-wise."""
        coeff = torch.sqrt(torch.tensor(2.0 / torch.pi))
        cubic = x + 0.044715 * torch.pow(x, 3)
        gate = 1 + torch.tanh(coeff * cubic)
        return 0.5 * x * gate


class FeedForward(nn.Module):
    """Position-wise feed-forward network: expand to 4x width, GELU, project back.

    Args:
        cfg: Mapping that provides the embedding width under ``"emb_dim"``.
    """

    def __init__(self, cfg):
        super().__init__()
        width = cfg["emb_dim"]
        hidden = 4 * width
        # Creation order (expand, activation, contract) fixes both the
        # Sequential indices and the seeded initial weights.
        expand = nn.Linear(width, hidden)
        activation = GELU()
        contract = nn.Linear(hidden, width)
        self.layers = nn.Sequential(expand, activation, contract)

    def forward(self, x):
        """Run ``x`` through the three stacked layers."""
        return self.layers(x)


class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension with learnable scale and shift.

    Args:
        emb_dim: Size of the last dimension being normalised.
    """

    def __init__(self, emb_dim):
        super().__init__()
        # Small constant added to the variance to avoid division by zero.
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        """Normalise ``x`` to zero mean / unit (biased) variance, then scale and shift."""
        centered = x - x.mean(dim=-1, keepdim=True)
        variance = x.var(dim=-1, keepdim=True, unbiased=False)
        normalised = centered / torch.sqrt(variance + self.eps)
        return self.scale * normalised + self.shift


class TransformerBlock(nn.Module):
    """Pre-LayerNorm transformer block: attention and feed-forward sub-layers.

    Each sub-layer is normalise -> transform -> dropout, followed by a
    shortcut connection that adds the sub-layer input back to its output.

    Args:
        cfg: Mapping with ``"emb_dim"``, ``"context_length"``, ``"n_heads"``,
            ``"drop_rate"`` and ``"qkv_bias"``.
    """

    def __init__(self, cfg):
        super().__init__()
        width = cfg["emb_dim"]
        self.att = MultiHeadAttention(
            d_in=width,
            d_out=width,
            context_length=cfg["context_length"],
            dropout=cfg["drop_rate"],
            num_heads=cfg["n_heads"],
            qkv_bias=cfg["qkv_bias"],
        )
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(width)
        self.norm2 = LayerNorm(width)
        # One dropout module is shared by both sub-layers.
        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        """Apply the attention sub-layer, then the feed-forward sub-layer."""
        # Attention sub-layer. LayerNorm comes before the component and
        # dropout after it (the Pre-LayerNorm arrangement); dropout
        # regularises the model.
        residual = x
        x = self.drop_shortcut(self.att(self.norm1(x)))
        # Shortcut connection: add the sub-layer input back, which helps
        # gradients flow through deep stacks during training.
        x = x + residual

        # Feed-forward sub-layer, same pattern.
        residual = x
        x = self.drop_shortcut(self.ff(self.norm2(x)))
        x = x + residual
        return x

class GPTModel(nn.Module):
    """GPT-style decoder that maps token IDs to next-token logits.

    Args:
        cfg: Mapping with ``"vocab_size"``, ``"emb_dim"``,
            ``"context_length"``, ``"drop_rate"`` and ``"n_layers"``, plus the
            keys consumed by ``TransformerBlock``.
    """

    def __init__(self, cfg):
        super().__init__()
        width = cfg["emb_dim"]
        vocab = cfg["vocab_size"]

        # Token and position embeddings turn token indices into dense vectors
        # and add positional information.
        self.tok_emb = nn.Embedding(vocab, width)
        self.pos_emb = nn.Embedding(cfg["context_length"], width)
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        blocks = [TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        self.trf_blocks = nn.Sequential(*blocks)

        # Final LayerNorm standardises the output of the transformer stack.
        self.final_norm = LayerNorm(width)
        # Bias-free output head projecting onto the vocabulary, one score
        # (logit) per token.
        self.out_head = nn.Linear(width, vocab, bias=False)

    def forward(self, in_idx):
        """Return logits of shape (batch, seq_len, vocab_size) for 2-D ``in_idx``."""
        _, seq_len = in_idx.shape
        # Position indices are created on the same device as the input IDs.
        positions = torch.arange(seq_len, device=in_idx.device)
        hidden = self.tok_emb(in_idx) + self.pos_emb(positions)
        hidden = self.drop_emb(hidden)
        hidden = self.trf_blocks(hidden)
        hidden = self.final_norm(hidden)
        # Unnormalised next-token scores over the vocabulary.
        return self.out_head(hidden)
        
def generate_text_simple(model, idx,
                         max_new_tokens, context_size):
    """Append ``max_new_tokens`` tokens to ``idx``, always picking the most likely one.

    Args:
        model: Callable mapping (batch, n_tokens) IDs to
            (batch, n_tokens, vocab_size) logits.
        idx: Token IDs of the current text, shape (batch, n_tokens).
        max_new_tokens: Number of tokens to append.
        context_size: Maximum number of trailing tokens fed to the model.

    Returns:
        Tensor of shape (batch, n_tokens + max_new_tokens).
    """
    for _ in range(max_new_tokens):
        # Keep only the last `context_size` tokens: if the model supports 5
        # tokens and the text has 10, only the final 5 are used as input.
        window = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(window)
        # Only the last position matters:
        # (batch, n_tokens, vocab_size) -> (batch, vocab_size).
        last_step = logits[:, -1, :]
        probas = torch.softmax(last_step, dim=-1)
        # Highest-probability token, kept as shape (batch, 1).
        idx_next = probas.argmax(dim=-1, keepdim=True)
        # Grow the running sequence to (batch, n_tokens + 1).
        idx = torch.cat([idx, idx_next], dim=1)
    return idx


def text_to_token_ids(text, tokenizer):
    """Encode ``text`` (allowing ``<|endoftext|>``) into a (1, n_tokens) tensor."""
    ids = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    # Leading batch dimension so the result can be fed straight to a model.
    return torch.tensor(ids).unsqueeze(dim=0)

def token_ids_to_text(token_ids, tokenizer):
    """Decode a tensor of token IDs to text, dropping a size-1 batch dimension."""
    id_list = token_ids.squeeze(0).tolist()
    return tokenizer.decode(id_list)


class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token chunks for next-token prediction.

    The text is tokenised once; windows of ``max_length`` tokens are taken
    every ``stride`` tokens, and each target is its input shifted right by
    one token.

    Args:
        txt: Raw text to tokenise.
        tokenizer: Object whose ``encode(text, allowed_special=...)`` returns
            a list of token IDs (e.g. a tiktoken encoding).
        max_length: Number of tokens per chunk.
        stride: Step between the starts of consecutive chunks.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []
        # Allow the end-of-text marker, consistent with text_to_token_ids;
        # without this, tiktoken raises ValueError when the corpus contains
        # "<|endoftext|>" (e.g. as a document separator).
        token_ids = tokenizer.encode(txt, allowed_special={'<|endoftext|>'})
        # Stop early enough that every target chunk still has its final token.
        for start in range(0, len(token_ids) - max_length, stride):
            stop = start + max_length
            self.input_ids.append(torch.tensor(token_ids[start:stop]))
            self.target_ids.append(torch.tensor(token_ids[start + 1:stop + 1]))

    def __len__(self):
        """Return the number of (input, target) pairs."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``idx``-th (input, target) pair of 1-D tensors."""
        return self.input_ids[idx], self.target_ids[idx]


def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    """Build a DataLoader over GPT-2-tokenised sliding windows of ``txt``.

    Args:
        txt: Raw text to tokenise and chunk.
        batch_size: Number of (input, target) pairs per batch.
        max_length: Tokens per chunk.
        stride: Step between consecutive chunk starts.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        A ``DataLoader`` wrapping a ``GPTDatasetV1``.
    """
    dataset = GPTDatasetV1(txt, tiktoken.get_encoding("gpt2"), max_length, stride)
    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    return DataLoader(dataset, **loader_options)

def calc_loss_batch(input_batch, target_batch, model, device):
    """Return the cross-entropy loss of ``model`` on one batch.

    Both batches are moved to ``device``; the logits are flattened over their
    first two dimensions and the targets fully flattened so each row of
    logits pairs with one target ID.
    """
    inputs = input_batch.to(device)
    targets = target_batch.to(device)
    flat_logits = model(inputs).flatten(0, 1)
    return torch.nn.functional.cross_entropy(flat_logits, targets.flatten())

def calc_loss_loader(data_loader, model, device, num_batches=None):
    """Return the mean per-batch loss over the first ``num_batches`` batches.

    Args:
        data_loader: Iterable of (input_batch, target_batch) pairs that
            supports ``len()``.
        model: Model forwarded to ``calc_loss_batch``.
        device: Device forwarded to ``calc_loss_batch``.
        num_batches: Upper bound on batches to evaluate; ``None`` means all.
            Values above the loader length are clamped to it.

    Returns:
        The average loss as a float, or ``nan`` when the loader is empty.
    """
    loader_len = len(data_loader)
    if loader_len == 0:
        return float("nan")
    num_batches = loader_len if num_batches is None else min(num_batches, loader_len)

    running = 0
    for batch_idx, (input_batch, target_batch) in enumerate(data_loader):
        if batch_idx >= num_batches:
            break
        batch_loss = calc_loss_batch(input_batch, target_batch, model, device)
        running += batch_loss.item()
    return running / num_batches

# Model hyperparameters; the keys are read by GPTModel and TransformerBlock.
GPT_CONFIG_124M = {
    "vocab_size": 50257,
    "context_length": 256,
    "emb_dim": 768,
    "n_heads": 12,
    "n_layers": 12,
    "drop_rate": 0.1,
    "qkv_bias": False
}


# Seed before constructing the model so the initial weights are reproducible.
torch.manual_seed(123)

model = GPTModel(GPT_CONFIG_124M)
model.eval() # Switch to eval mode: disables random, training-only components such as dropout
tokenizer = tiktoken.get_encoding("gpt2")


# Load the raw training text and report its size in characters and tokens.
file_path = "the-verdict.txt"
with open(file_path, "r", encoding="utf-8") as file:
    text_data = file.read()
total_characters = len(text_data)
total_tokens = len(tokenizer.encode(text_data))
print("Characters:", total_characters)
print("Tokens:", total_tokens)

# 90/10 train/validation split, made on characters (not tokens).
train_ratio = 0.9
split_idx = int(train_ratio * len(text_data))
train_data = text_data[:split_idx]
val_data = text_data[split_idx:]

# stride == max_length, so consecutive training chunks do not overlap.
train_loader = create_dataloader_v1(
        train_data,
        batch_size=2,
        max_length=GPT_CONFIG_124M["context_length"],
        stride=GPT_CONFIG_124M["context_length"],
        drop_last=True,
        shuffle=True,
        num_workers=0
)

# Validation keeps a possibly smaller last batch and a fixed order.
val_loader = create_dataloader_v1(
        val_data,
        batch_size=2,
        max_length=GPT_CONFIG_124M["context_length"],
        stride=GPT_CONFIG_124M["context_length"],
        drop_last=False,
        shuffle=False,
        num_workers=0
)

# Print the shape of every batch as a sanity check.
print("Train loader:")
for x, y in train_loader:
    print(x.shape, y.shape)
print("\nValidation loader:")
for x, y in val_loader:
    print(x.shape, y.shape)

# Use the GPU when available, then measure the loss of the untrained model.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
with torch.no_grad():
    train_loss = calc_loss_loader(train_loader, model, device)
    val_loss = calc_loss_loader(val_loader, model, device)
print("Training loss:", train_loss)
print("Validation loss:", val_loss)

"""
Characters: 20479
Tokens: 5145
Train loader:
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])

Validation loader:
torch.Size([2, 256]) torch.Size([2, 256])

Training loss: 10.987581782870823
Validation loss: 10.981117248535156
"""

代码清单5-3 预训练

步骤:从遍历每个训练轮次开始,处理批次,重置梯度,计算损失和新梯度,更新权重,最后以监控步骤(包括打印损失、生成文本样本等操作)结束。

在每次循环中,计算每个训练集批次的损失以确定损失梯度,然后使用这些梯度来更新模型权重,以使训练集损失最小化

import torch
import torch.nn as nn
import tiktoken
from torch.utils.data import Dataset, DataLoader

class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention.

    Queries, keys and values are each produced by one linear layer of width
    ``d_out`` and then viewed as ``num_heads`` heads of ``head_dim`` features.

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Combined output width of all heads; must be divisible by
            ``num_heads``.
        context_length: Side length of the causal mask built at construction.
        dropout: Dropout probability applied to the attention weights.
        num_heads: Number of attention heads.
        qkv_bias: Whether the query/key/value projections have a bias.
    """

    def __init__(self, d_in, d_out,
                 context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert (d_out % num_heads == 0), \
            "d_out must be divisible by num_heads"
        self.d_out = d_out
        self.num_heads = num_heads
        # Every head gets an equal slice of the projected features.
        self.head_dim = d_out // num_heads

        # Layers are created in the order query, key, value, output so that a
        # seeded construction always draws the same initial weights.
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        # Linear layer that mixes the concatenated head outputs.
        self.out_proj = nn.Linear(d_out, d_out)
        self.dropout = nn.Dropout(dropout)

        # Ones strictly above the diagonal mark the future positions.
        causal_mask = torch.triu(
            torch.ones(context_length, context_length), diagonal=1
        )
        self.register_buffer("mask", causal_mask)

    def _split_heads(self, projected, batch, seq_len):
        """Reshape (batch, seq_len, d_out) to (batch, num_heads, seq_len, head_dim)."""
        per_head = projected.view(batch, seq_len, self.num_heads, self.head_dim)
        return per_head.transpose(1, 2)

    def forward(self, x):
        """Return context vectors of shape (batch, seq_len, d_out) for a 3-D input."""
        batch, seq_len, _ = x.shape

        keys = self._split_heads(self.W_key(x), batch, seq_len)
        queries = self._split_heads(self.W_query(x), batch, seq_len)
        values = self._split_heads(self.W_value(x), batch, seq_len)

        # Per-head dot products: (batch, num_heads, seq_len, seq_len).
        scores = queries @ keys.transpose(2, 3)
        # Crop the stored mask to the current sequence length and blank out
        # the future positions in place.
        future = self.mask.bool()[:seq_len, :seq_len]
        scores.masked_fill_(future, -torch.inf)

        # Scale by sqrt(head_dim) before normalising.
        scale = keys.shape[-1] ** 0.5
        weights = self.dropout(torch.softmax(scores / scale, dim=-1))

        # Back to (batch, seq_len, num_heads, head_dim), then merge the heads
        # (d_out == num_heads * head_dim).
        per_head_ctx = (weights @ values).transpose(1, 2)
        merged = per_head_ctx.contiguous().view(batch, seq_len, self.d_out)
        return self.out_proj(merged)


class GELU(nn.Module):
    """GELU activation evaluated with the tanh approximation."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Apply 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))) element-wise."""
        coeff = torch.sqrt(torch.tensor(2.0 / torch.pi))
        cubic = x + 0.044715 * torch.pow(x, 3)
        gate = 1 + torch.tanh(coeff * cubic)
        return 0.5 * x * gate


class FeedForward(nn.Module):
    """Position-wise feed-forward network: expand to 4x width, GELU, project back.

    Args:
        cfg: Mapping that provides the embedding width under ``"emb_dim"``.
    """

    def __init__(self, cfg):
        super().__init__()
        width = cfg["emb_dim"]
        hidden = 4 * width
        # Creation order (expand, activation, contract) fixes both the
        # Sequential indices and the seeded initial weights.
        expand = nn.Linear(width, hidden)
        activation = GELU()
        contract = nn.Linear(hidden, width)
        self.layers = nn.Sequential(expand, activation, contract)

    def forward(self, x):
        """Run ``x`` through the three stacked layers."""
        return self.layers(x)


class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension with learnable scale and shift.

    Args:
        emb_dim: Size of the last dimension being normalised.
    """

    def __init__(self, emb_dim):
        super().__init__()
        # Small constant added to the variance to avoid division by zero.
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        """Normalise ``x`` to zero mean / unit (biased) variance, then scale and shift."""
        centered = x - x.mean(dim=-1, keepdim=True)
        variance = x.var(dim=-1, keepdim=True, unbiased=False)
        normalised = centered / torch.sqrt(variance + self.eps)
        return self.scale * normalised + self.shift


class TransformerBlock(nn.Module):
    """Pre-LayerNorm transformer block: attention and feed-forward sub-layers.

    Each sub-layer is normalise -> transform -> dropout, followed by a
    shortcut connection that adds the sub-layer input back to its output.

    Args:
        cfg: Mapping with ``"emb_dim"``, ``"context_length"``, ``"n_heads"``,
            ``"drop_rate"`` and ``"qkv_bias"``.
    """

    def __init__(self, cfg):
        super().__init__()
        width = cfg["emb_dim"]
        self.att = MultiHeadAttention(
            d_in=width,
            d_out=width,
            context_length=cfg["context_length"],
            dropout=cfg["drop_rate"],
            num_heads=cfg["n_heads"],
            qkv_bias=cfg["qkv_bias"],
        )
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(width)
        self.norm2 = LayerNorm(width)
        # One dropout module is shared by both sub-layers.
        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        """Apply the attention sub-layer, then the feed-forward sub-layer."""
        # Attention sub-layer. LayerNorm comes before the component and
        # dropout after it (the Pre-LayerNorm arrangement); dropout
        # regularises the model.
        residual = x
        x = self.drop_shortcut(self.att(self.norm1(x)))
        # Shortcut connection: add the sub-layer input back, which helps
        # gradients flow through deep stacks during training.
        x = x + residual

        # Feed-forward sub-layer, same pattern.
        residual = x
        x = self.drop_shortcut(self.ff(self.norm2(x)))
        x = x + residual
        return x

class GPTModel(nn.Module):
    """GPT-style decoder that maps token IDs to next-token logits.

    Args:
        cfg: Mapping with ``"vocab_size"``, ``"emb_dim"``,
            ``"context_length"``, ``"drop_rate"`` and ``"n_layers"``, plus the
            keys consumed by ``TransformerBlock``.
    """

    def __init__(self, cfg):
        super().__init__()
        width = cfg["emb_dim"]
        vocab = cfg["vocab_size"]

        # Token and position embeddings turn token indices into dense vectors
        # and add positional information.
        self.tok_emb = nn.Embedding(vocab, width)
        self.pos_emb = nn.Embedding(cfg["context_length"], width)
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        blocks = [TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        self.trf_blocks = nn.Sequential(*blocks)

        # Final LayerNorm standardises the output of the transformer stack.
        self.final_norm = LayerNorm(width)
        # Bias-free output head projecting onto the vocabulary, one score
        # (logit) per token.
        self.out_head = nn.Linear(width, vocab, bias=False)

    def forward(self, in_idx):
        """Return logits of shape (batch, seq_len, vocab_size) for 2-D ``in_idx``."""
        _, seq_len = in_idx.shape
        # Position indices are created on the same device as the input IDs.
        positions = torch.arange(seq_len, device=in_idx.device)
        hidden = self.tok_emb(in_idx) + self.pos_emb(positions)
        hidden = self.drop_emb(hidden)
        hidden = self.trf_blocks(hidden)
        hidden = self.final_norm(hidden)
        # Unnormalised next-token scores over the vocabulary.
        return self.out_head(hidden)
        
def generate_text_simple(model, idx,
                         max_new_tokens, context_size):
    """Append ``max_new_tokens`` tokens to ``idx``, always picking the most likely one.

    Args:
        model: Callable mapping (batch, n_tokens) IDs to
            (batch, n_tokens, vocab_size) logits.
        idx: Token IDs of the current text, shape (batch, n_tokens).
        max_new_tokens: Number of tokens to append.
        context_size: Maximum number of trailing tokens fed to the model.

    Returns:
        Tensor of shape (batch, n_tokens + max_new_tokens).
    """
    for _ in range(max_new_tokens):
        # Keep only the last `context_size` tokens: if the model supports 5
        # tokens and the text has 10, only the final 5 are used as input.
        window = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(window)
        # Only the last position matters:
        # (batch, n_tokens, vocab_size) -> (batch, vocab_size).
        last_step = logits[:, -1, :]
        probas = torch.softmax(last_step, dim=-1)
        # Highest-probability token, kept as shape (batch, 1).
        idx_next = probas.argmax(dim=-1, keepdim=True)
        # Grow the running sequence to (batch, n_tokens + 1).
        idx = torch.cat([idx, idx_next], dim=1)
    return idx


def text_to_token_ids(text, tokenizer):
    """Encode ``text`` (allowing ``<|endoftext|>``) into a (1, n_tokens) tensor."""
    ids = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    # Leading batch dimension so the result can be fed straight to a model.
    return torch.tensor(ids).unsqueeze(dim=0)

def token_ids_to_text(token_ids, tokenizer):
    """Decode a tensor of token IDs to text, dropping a size-1 batch dimension."""
    id_list = token_ids.squeeze(0).tolist()
    return tokenizer.decode(id_list)


class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token chunks for next-token prediction.

    The text is tokenised once; windows of ``max_length`` tokens are taken
    every ``stride`` tokens, and each target is its input shifted right by
    one token.

    Args:
        txt: Raw text to tokenise.
        tokenizer: Object whose ``encode(text, allowed_special=...)`` returns
            a list of token IDs (e.g. a tiktoken encoding).
        max_length: Number of tokens per chunk.
        stride: Step between the starts of consecutive chunks.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []
        # Allow the end-of-text marker, consistent with text_to_token_ids;
        # without this, tiktoken raises ValueError when the corpus contains
        # "<|endoftext|>" (e.g. as a document separator).
        token_ids = tokenizer.encode(txt, allowed_special={'<|endoftext|>'})
        # Stop early enough that every target chunk still has its final token.
        for start in range(0, len(token_ids) - max_length, stride):
            stop = start + max_length
            self.input_ids.append(torch.tensor(token_ids[start:stop]))
            self.target_ids.append(torch.tensor(token_ids[start + 1:stop + 1]))

    def __len__(self):
        """Return the number of (input, target) pairs."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``idx``-th (input, target) pair of 1-D tensors."""
        return self.input_ids[idx], self.target_ids[idx]


def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    """Build a DataLoader over GPT-2-tokenised sliding windows of ``txt``.

    Args:
        txt: Raw text to tokenise and chunk.
        batch_size: Number of (input, target) pairs per batch.
        max_length: Tokens per chunk.
        stride: Step between consecutive chunk starts.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        A ``DataLoader`` wrapping a ``GPTDatasetV1``.
    """
    dataset = GPTDatasetV1(txt, tiktoken.get_encoding("gpt2"), max_length, stride)
    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    return DataLoader(dataset, **loader_options)

def calc_loss_batch(input_batch, target_batch, model, device):
    """Return the cross-entropy loss of ``model`` on one batch.

    Both batches are moved to ``device``; the logits are flattened over their
    first two dimensions and the targets fully flattened so each row of
    logits pairs with one target ID.
    """
    inputs = input_batch.to(device)
    targets = target_batch.to(device)
    flat_logits = model(inputs).flatten(0, 1)
    return torch.nn.functional.cross_entropy(flat_logits, targets.flatten())

def calc_loss_loader(data_loader, model, device, num_batches=None):
    """Return the mean per-batch loss over the first ``num_batches`` batches.

    Args:
        data_loader: Iterable of (input_batch, target_batch) pairs that
            supports ``len()``.
        model: Model forwarded to ``calc_loss_batch``.
        device: Device forwarded to ``calc_loss_batch``.
        num_batches: Upper bound on batches to evaluate; ``None`` means all.
            Values above the loader length are clamped to it.

    Returns:
        The average loss as a float, or ``nan`` when the loader is empty.
    """
    loader_len = len(data_loader)
    if loader_len == 0:
        return float("nan")
    num_batches = loader_len if num_batches is None else min(num_batches, loader_len)

    running = 0
    for batch_idx, (input_batch, target_batch) in enumerate(data_loader):
        if batch_idx >= num_batches:
            break
        batch_loss = calc_loss_batch(input_batch, target_batch, model, device)
        running += batch_loss.item()
    return running / num_batches

def train_model_simple(model, train_loader, val_loader,
                       optimizer, device, num_epochs,
                       eval_freq, eval_iter, start_context, tokenizer):
    """Train ``model`` for ``num_epochs`` epochs with periodic evaluation.

    Every ``eval_freq`` optimisation steps the train/validation losses are
    measured on up to ``eval_iter`` batches and printed; after each epoch a
    text sample continuing ``start_context`` is printed.

    Returns:
        ``(train_losses, val_losses, track_tokens_seen)``: three parallel
        lists with one entry per evaluation.
    """
    train_losses = []
    val_losses = []
    track_tokens_seen = []
    tokens_seen = 0
    global_step = -1  # becomes 0 on the first batch, so step 0 is evaluated

    for epoch in range(num_epochs):
        model.train()  # training mode
        for input_batch, target_batch in train_loader:
            # 1. Clear the gradients left over from the previous batch.
            optimizer.zero_grad()
            # 2. Loss of the current batch.
            batch_loss = calc_loss_batch(
                input_batch, target_batch, model, device
            )
            # 3. Backpropagate to obtain the loss gradients.
            batch_loss.backward()
            # 4. Update the weights using those gradients.
            optimizer.step()
            tokens_seen += input_batch.numel()
            global_step += 1

            if global_step % eval_freq != 0:
                continue

            # Optional evaluation step to track training progress.
            train_loss, val_loss = evaluate_model(
                model, train_loader, val_loader, device, eval_iter
            )
            train_losses.append(train_loss)
            val_losses.append(val_loss)
            track_tokens_seen.append(tokens_seen)
            print(f"Ep {epoch+1} (Step {global_step:06d}): "
                  f"Train loss {train_loss:.3f}, "
                  f"Val loss {val_loss:.3f}"
            )

        # Optional: print a generated sample after every epoch.
        generate_and_print_sample(model, tokenizer, device, start_context)

    return train_losses, val_losses, track_tokens_seen


def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    """Return ``(train_loss, val_loss)`` measured on up to ``eval_iter`` batches each.

    The model is put in eval mode (dropout off, for stable results) and
    gradient tracking is disabled while measuring; training mode is switched
    back on before returning.
    """
    model.eval()
    with torch.no_grad():  # gradients are not needed here; saves computation
        losses = [
            calc_loss_loader(loader, model, device, num_batches=eval_iter)
            for loader in (train_loader, val_loader)
        ]
    model.train()  # re-enable dropout for the training loop
    train_loss, val_loss = losses
    return train_loss, val_loss

def generate_and_print_sample(model, tokenizer, device, start_context):
    """Print a 50-token continuation of ``start_context`` on a single line.

    The model is switched to eval mode for generation and back to training
    mode afterwards.
    """
    model.eval()
    prompt_ids = text_to_token_ids(start_context, tokenizer).to(device)
    # The number of rows in the position-embedding table is the context size.
    max_context = model.pos_emb.weight.shape[0]
    with torch.no_grad():
        generated = generate_text_simple(
            model=model, idx=prompt_ids,
            max_new_tokens=50, context_size=max_context
        )
    sample = token_ids_to_text(generated, tokenizer)
    # Newlines are replaced by spaces to keep the log compact.
    print(sample.replace("\n", " "))
    model.train()


import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses):
    """Plot training/validation loss against epochs, with a second x-axis in tokens.

    The figure is saved as "pretrain" and then shown.
    """
    fig, loss_axis = plt.subplots(figsize=(5, 3))
    curves = (
        (train_losses, {"label": "Training loss"}),
        (val_losses, {"linestyle": "-.", "label": "Validation loss"}),
    )
    for losses, line_options in curves:
        loss_axis.plot(epochs_seen, losses, **line_options)
    loss_axis.set_xlabel("Epochs")
    loss_axis.set_ylabel("Loss")
    # Epoch ticks at integer positions only.
    loss_axis.xaxis.set_major_locator(MaxNLocator(integer=True))
    loss_axis.legend()

    # Second x-axis sharing the same y-axis.
    token_axis = loss_axis.twiny()
    # Fully transparent plot, drawn only so the token ticks line up.
    token_axis.plot(tokens_seen, train_losses, alpha=0)
    token_axis.set_xlabel("Tokens seen")

    fig.tight_layout()
    plt.savefig("pretrain")
    plt.show()


# Model hyperparameters; the keys are read by GPTModel and TransformerBlock.
GPT_CONFIG_124M = {
    "vocab_size": 50257,
    "context_length": 256,
    "emb_dim": 768,
    "n_heads": 12,
    "n_layers": 12,
    "drop_rate": 0.1,
    "qkv_bias": False
}


# Load the raw training text.
file_path = "the-verdict.txt"
with open(file_path, "r", encoding="utf-8") as file:
    text_data = file.read()

# 90/10 train/validation split, made on characters (not tokens).
train_ratio = 0.9
split_idx = int(train_ratio * len(text_data))
train_data = text_data[:split_idx]
val_data = text_data[split_idx:]

# stride == max_length, so consecutive training chunks do not overlap.
train_loader = create_dataloader_v1(
        train_data,
        batch_size=2,
        max_length=GPT_CONFIG_124M["context_length"],
        stride=GPT_CONFIG_124M["context_length"],
        drop_last=True,
        shuffle=True,
        num_workers=0
)

# Validation keeps a possibly smaller last batch and a fixed order.
val_loader = create_dataloader_v1(
        val_data,
        batch_size=2,
        max_length=GPT_CONFIG_124M["context_length"],
        stride=GPT_CONFIG_124M["context_length"],
        drop_last=False,
        shuffle=False,
        num_workers=0
)


# Seed before constructing the model so the initial weights are reproducible.
torch.manual_seed(123)

tokenizer = tiktoken.get_encoding("gpt2")
model = GPTModel(GPT_CONFIG_124M)
# Use the GPU when available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

"""
Adam 优化器是训练深度神经网络的一种常见选择。
AdamW 是 Adam 的一个变体,它改进了权重衰减方法,旨在通过对较大的
权重进行惩罚来最小化模型复杂性并防止过拟合。
这种调整使得 AdamW 能够实现更有效的正则化和更好的泛化能力。
"""
optimizer = torch.optim.AdamW(
        model.parameters(), # .parameters() returns all of the model's trainable weight parameters
        lr=0.0004,
        weight_decay=0.1
)

# Train for 10 epochs, evaluating every 5 steps on up to 5 batches per loader.
num_epochs = 10
train_losses, val_losses, tokens_seen = train_model_simple(
        model, train_loader, val_loader, optimizer, device,
        num_epochs=num_epochs, eval_freq=5, eval_iter=5,
        start_context="Every effort moves you", tokenizer=tokenizer
)


# Spread the recorded evaluations evenly over [0, num_epochs] for the x-axis.
epochs_tensor = torch.linspace(0, num_epochs, len(train_losses))
plot_losses(epochs_tensor, tokens_seen, train_losses, val_losses)


"""
Ep 1 (Step 000000): Train loss 9.818, Val loss 9.929
Ep 1 (Step 000005): Train loss 8.065, Val loss 8.336
Every effort moves you,,,,,,,,,,,,.                                     
Ep 2 (Step 000010): Train loss 6.622, Val loss 7.052
Ep 2 (Step 000015): Train loss 6.047, Val loss 6.600
Every effort moves you, and,, and,,,,,,, and,.                                   
Ep 3 (Step 000020): Train loss 5.590, Val loss 6.476
Ep 3 (Step 000025): Train loss 5.539, Val loss 6.403
Every effort moves you, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and
Ep 4 (Step 000030): Train loss 5.168, Val loss 6.384
Ep 4 (Step 000035): Train loss 4.980, Val loss 6.377
Every effort moves you a a so a a a of the picture. Gisburn. Gisburn, and a, and a.            "I the picture of the of the picture.    
Ep 5 (Step 000040): Train loss 4.349, Val loss 6.256
Every effort moves you, I had been a--as of the--as of the of the of the, I had been--and it's had been, in the of the of the picture, in the picture. "I had been the picture of the of
Ep 6 (Step 000045): Train loss 4.027, Val loss 6.212
Ep 6 (Step 000050): Train loss 3.534, Val loss 6.149
Every effort moves you know the                                                
Ep 7 (Step 000055): Train loss 3.549, Val loss 6.167
Ep 7 (Step 000060): Train loss 2.736, Val loss 6.133
Every effort moves you know the fact, and I felt of the picture, and I felt.             "I he was his pictures-c.             
Ep 8 (Step 000065): Train loss 2.292, Val loss 6.147
Ep 8 (Step 000070): Train loss 1.953, Val loss 6.210
Every effort moves you know," was not that the picture.  "I had the last word. Gisburn's an!     "Oh, I was _rose at my elbow and I had the donkey. "There were days when I
Ep 9 (Step 000075): Train loss 1.576, Val loss 6.218
Ep 9 (Step 000080): Train loss 1.239, Val loss 6.236
Every effort moves you know," was not that my hostess was "interesting": on that Mrs. "Yes--and by me to me to have to see a smile behind his pictures--as he had been his painting, a _j--because he had been his
Ep 10 (Step 000085): Train loss 0.953, Val loss 6.301
Every effort moves you?"  "Yes--quite insensible to the irony. She wanted him vindicated--and by me!"     "I didn't face that he had married her--the quality of the a fashionable painter--and by holding
"""

  • 在训练开始阶段,训练集损失和验证集损失急剧下降,这表明模型正在学习。
  • 然而,在第二轮之后,训练集损失继续下降,验证集损失则停滞不前。这表明模型仍在学习,但在第二轮之后开始对训练集过拟合,经常逐字记忆训练集中的段落。(验证集损失远大于训练集损失,表明模型对训练数据过拟合。)

章节5.3.1 温度缩放

温度缩放指的是将 logits 除以一个大于 0 的数。

# Example logits for a 9-token vocabulary.
next_token_logits = torch.tensor(
         [4.51, 0.89, -1.90, 6.75, 1.63, -1.62, -1.89, 6.28, 1.79]
)
probas = torch.softmax(next_token_logits, dim=0)
# Greedy choice: always the highest-probability token.
next_token_id = torch.argmax(probas).item()
print(next_token_id)

# Probabilistic choice: sample one token ID in proportion to its probability
# (seeded so the draw is reproducible).
torch.manual_seed(123)
next_token_id = torch.multinomial(probas, num_samples=1).item() 
print(next_token_id)

def print_sampled_tokens(probas):
    """Draw 1,000 seeded samples from ``probas`` and print how often each ID occurred.

    Resets the global torch seed to 123 before sampling. IDs above the
    largest sampled ID are not listed.
    """
    torch.manual_seed(123)
    draws = [
        torch.multinomial(probas, num_samples=1).item()
        for _ in range(1_000)
    ]
    sampled_ids = torch.bincount(torch.tensor(draws))
    print("sampled_ids:", sampled_ids)
    for token_id, freq in enumerate(sampled_ids):
        print(f"{freq} x {token_id}")

print_sampled_tokens(probas)

def softmax_with_temperature(logits, temperature):
    """Return the softmax over dim 0 of ``logits`` divided by ``temperature``.

    Temperature scaling means dividing the logits by a number greater than 0.
    """
    return torch.softmax(logits / temperature, dim=0)


# Toy vocabulary: one entry per position of next_token_logits.
vocab = {
    "closer": 0,
    "every": 1,
    "effort": 2,
    "forward": 3,
    "inches": 4,
    "moves": 5,
    "pizza": 6,
    "toward": 7,
    "you": 8,
}
# Reverse lookup: token ID -> token string.
inverse_vocab = {v: k for k, v in vocab.items()}
# Compare the unscaled distribution (1) with a low (0.1) and a high (5) temperature.
temperatures = [1, 0.1, 5]
scaled_probas = [softmax_with_temperature(next_token_logits, T) for T in temperatures]
x = torch.arange(len(vocab))
bar_width = 0.15

import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(5,3))
# One group of bars per token; each temperature is offset by one bar width.
for i, T in enumerate(temperatures):
    rects = ax.bar(x + i * bar_width, scaled_probas[i], 
        bar_width, label=f'Temperature = {T}')
ax.set_ylabel('Probability')
ax.set_xticks(x)
ax.set_xticklabels(vocab.keys(), rotation=90)
ax.legend()
plt.tight_layout()
plt.savefig("argmax_multinomial")
plt.show()

  • 温度为 1 表示词汇表中每个词元的未缩放概率分数。
  • 将温度降低到 0.1 会使分布更加集中,因此最可能的词元(这里是 forward)将具有更高的概率分数,接近于 argmax 函数的行为。
  • 同样,将温度提高到 5 会使分布更加均匀,使得其他词元更容易被选中。这可以为生成的文本增加更多变化,但也更容易生成无意义的文本。

章节5.3.2 Top-k采样

通过与概率采样和温度缩放相结合,Top-k 采样可以改善文本生成结果。在 Top-k 采样中,可以将采样的词元限制在前 k 个最可能的词元上,并通过掩码概率分数的方式来排除其他词元。

Top-k 方法用负无穷值(-inf)替换所有未选择的 logits,因此在计算 softmax 值时,非前 k 词元的概率分数为 0,剩余的概率总和为 1。(在实现因果注意力模块中使用过这种掩码技巧。)

import torch

# Toy logits for a 9-entry vocabulary.
next_token_logits = torch.tensor(
    [4.51, 0.89, -1.90, 6.75, 1.63, -1.62, -1.89, 6.28, 1.79]
)
top_k = 3
# The k largest logits (in descending order) together with their positions.
top_logits, top_pos = next_token_logits.topk(top_k)
print("Top logits: ", top_logits)
print("Top positions: ", top_pos)
# Everything strictly below the smallest kept logit is replaced by -inf;
# all other entries keep their original logit value.
below_cutoff = next_token_logits < top_logits[-1]
new_logits = next_token_logits.masked_fill(below_cutoff, float('-inf'))
print(new_logits)
# After softmax the masked entries get probability 0 and the rest sum to 1.
topk_probas = new_logits.softmax(dim=0)
print(topk_probas)

"""
Top logits:  tensor([6.7500, 6.2800, 4.5100])
Top positions:  tensor([3, 7, 0])
tensor([4.5100,   -inf,   -inf, 6.7500,   -inf,   -inf,   -inf, 6.2800,   -inf])
tensor([0.0615, 0.0000, 0.0000, 0.5775, 0.0000, 0.0000, 0.0000, 0.3610, 0.0000])
"""

代码清单5-4 结合温度缩放和top-k采样的文本生成



def generate(model, idx, max_new_tokens, context_size,
        temperature=0.0, top_k=None, eos_id=None):
    """Autoregressively extend ``idx`` with optional top-k filtering and temperature sampling.

    Args:
        model: Callable mapping token ids of shape (batch, n_tokens) to logits
            of shape (batch, n_tokens, vocab_size).
        idx: Tensor of token ids, shape (batch, n_tokens).
        max_new_tokens: Upper bound on the number of tokens to append.
        context_size: Only the last ``context_size`` tokens are fed to the model.
        temperature: Values > 0 enable multinomial sampling from
            ``softmax(logits / temperature)``; 0 (the default) means greedy argmax.
        top_k: If given, only the ``top_k`` largest logits of each row remain
            eligible; values larger than the vocabulary are clamped.
        eos_id: If given, generation stops early once every row's newly chosen
            token equals it; that token is not appended.

    Returns:
        ``idx`` with up to ``max_new_tokens`` additional columns.
    """
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
        logits = logits[:, -1, :]  # keep only the last time step

        # Top-k filtering of the logits.
        if top_k is not None:
            k = min(top_k, logits.size(-1))
            top_logits, _ = torch.topk(logits, k)
            # Keep the threshold as a (batch, 1) column so it is compared row by
            # row; a 1-D (batch,) threshold only broadcasts for batch size 1.
            min_val = top_logits[:, -1:]
            logits = logits.masked_fill(logits < min_val, float('-inf'))

        # Temperature scaling followed by sampling.
        if temperature > 0.0:
            logits = logits / temperature

            # Numerical-stability tip (not in the book) to get equivalent
            # results on the mps device: subtract the row-wise max before softmax.
            logits = logits - logits.max(dim=-1, keepdim=True).values

            probas = torch.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probas, num_samples=1)
        else:
            # Temperature scaling disabled: greedy decoding picks the next token.
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)

        # Stop early on the end-of-sequence token. Reducing with .all() keeps
        # the check well defined when the batch holds more than one row.
        if eos_id is not None and bool((idx_next == eos_id).all()):
            break
        idx = torch.cat((idx, idx_next), dim=1)
    return idx



# Switch the model to evaluation mode before generating text.
model.eval()
# Generate up to 15 new tokens with top-k filtering (k=25) and temperature 1.4.
token_ids = generate(
    model=model,
    idx=text_to_token_ids("Every effort moves you", tokenizer).to(device),
    max_new_tokens=15,
    context_size=GPT_CONFIG_124M["context_length"],
    top_k=25,
    temperature=1.4
)

章节5.5 从OpenAI加载GPT-2模型权重

import torch
import torch.nn as nn
import tiktoken
from torch.utils.data import Dataset, DataLoader
import numpy as np

import json
import os
import tensorflow as tf
def download_and_load_gpt2(model_size, models_dir):
    """Load GPT-2 hyperparameters and weights from a local TensorFlow checkpoint.

    The download step is currently commented out, so the checkpoint files must
    already exist under ``<models_dir>/<model_size>``.

    Args:
        model_size: One of "124M", "355M", "774M", "1558M".
        models_dir: Directory that contains one sub-directory per model size.

    Returns:
        Tuple ``(settings, params)``: the parsed ``hparams.json`` dict and the
        nested parameter dict built by ``load_gpt2_params_from_tf_ckpt``.

    Raises:
        ValueError: If ``model_size`` is not one of the allowed sizes.
        FileNotFoundError: If no checkpoint is found in the model directory.
    """
    # Validate model size
    allowed_sizes = ("124M", "355M", "774M", "1558M")
    if model_size not in allowed_sizes:
        raise ValueError(f"Model size not in {allowed_sizes}")

    # Define paths (base_url and filenames are only used by the disabled
    # download step below; they are kept so it can be re-enabled as-is)
    model_dir = os.path.join(models_dir, model_size)
    base_url = "https://openaipublic.blob.core.windows.net/gpt-2/models"
    filenames = [
        "checkpoint", "encoder.json", "hparams.json",
        "model.ckpt.data-00000-of-00001", "model.ckpt.index",
        "model.ckpt.meta", "vocab.bpe"
    ]

    # Download files
    # os.makedirs(model_dir, exist_ok=True)
    # for filename in filenames:
    #     file_url = os.path.join(base_url, model_size, filename)
    #     file_path = os.path.join(model_dir, filename)
    #     download_file(file_url, file_path)

    # Load settings and params
    tf_ckpt_path = tf.train.latest_checkpoint(model_dir)
    if tf_ckpt_path is None:
        # Fail with a clear message instead of passing None on to the loader.
        raise FileNotFoundError(
            f"No TensorFlow checkpoint found in {model_dir!r}"
        )
    # Use a context manager so the file handle is closed deterministically.
    with open(os.path.join(model_dir, "hparams.json"), encoding="utf-8") as f:
        settings = json.load(f)
    params = load_gpt2_params_from_tf_ckpt(tf_ckpt_path, settings)

    return settings, params


def load_gpt2_params_from_tf_ckpt(ckpt_path, settings):
    """Read every variable of a TensorFlow checkpoint into a nested dict.

    Variable names are split on "/" and the first component is dropped.
    Variables whose next component starts with "h" are stored in
    ``params["blocks"][<layer number>]``; all others go into ``params`` itself.
    Intermediate name components become nested dictionaries and the last
    component is the key holding the array.

    Args:
        ckpt_path: Checkpoint path passed to ``tf.train.list_variables`` and
            ``tf.train.load_variable``.
        settings: Dict providing ``"n_layer"``, the number of per-layer entries
            to create under ``"blocks"``.

    Returns:
        Dict with a ``"blocks"`` list (one dict per layer) plus the remaining
        variables.
    """
    n_layers = settings["n_layer"]
    params = {"blocks": [dict() for _ in range(n_layers)]}

    for var_name, _shape in tf.train.list_variables(ckpt_path):
        # Load the variable and drop singleton dimensions.
        array = np.squeeze(tf.train.load_variable(ckpt_path, var_name))

        # Skip the leading 'model/' component of the name.
        parts = var_name.split("/")[1:]
        head, leaf = parts[0], parts[-1]

        # Per-layer variables ("h<N>/...") live in their own block dictionary.
        if head.startswith("h"):
            node = params["blocks"][int(head[1:])]
        else:
            node = params

        # Walk (creating as needed) the nested dictionaries between head and leaf.
        for key in parts[1:-1]:
            node = node.setdefault(key, {})

        node[leaf] = array

    return params



class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention.

    The input is projected to queries, keys and values of width ``d_out``,
    split into ``num_heads`` heads of width ``d_out // num_heads``, attended
    with a causal mask, merged back and passed through an output projection.

    Args:
        d_in: Size of the last input dimension.
        d_out: Total output width; must be divisible by ``num_heads``.
        context_length: Side length of the square causal mask buffer.
        dropout: Dropout probability applied to the attention weights.
        num_heads: Number of attention heads.
        qkv_bias: Whether the query/key/value projections have a bias term.
    """

    def __init__(self, d_in, d_out,
            context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert (d_out % num_heads == 0), \
                "d_out must be divisible by num_heads"
        self.d_out = d_out
        self.num_heads = num_heads
        # Width of one head, so that the concatenated heads give d_out.
        self.head_dim = d_out // num_heads

        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        # Linear layer that combines the outputs of all heads.
        self.out_proj = nn.Linear(d_out, d_out)
        self.dropout = nn.Dropout(dropout)

        # Ones strictly above the diagonal mark the positions to hide.
        future_positions = torch.triu(
            torch.ones(context_length, context_length), diagonal=1)
        self.register_buffer("mask", future_positions)

    def forward(self, x):
        """Apply causal multi-head attention.

        Args:
            x: Tensor of shape (batch, num_tokens, d_in).

        Returns:
            Tensor of shape (batch, num_tokens, d_out).
        """
        batch, seq_len, _ = x.shape

        # (batch, seq_len, d_out) projections.
        k_proj = self.W_key(x)
        q_proj = self.W_query(x)
        v_proj = self.W_value(x)

        # (batch, seq_len, d_out) -> (batch, seq_len, num_heads, head_dim)
        # -> (batch, num_heads, seq_len, head_dim)
        head_shape = (batch, seq_len, self.num_heads, self.head_dim)
        k = k_proj.view(*head_shape).transpose(1, 2)
        v = v_proj.view(*head_shape).transpose(1, 2)
        q = q_proj.view(*head_shape).transpose(1, 2)

        # Per-head dot products: (batch, num_heads, seq_len, seq_len).
        scores = q @ k.transpose(2, 3)
        causal = self.mask.bool()[:seq_len, :seq_len]
        scores = scores.masked_fill(causal, float("-inf"))

        # Scale by sqrt(head_dim) before the softmax, then apply dropout.
        weights = torch.softmax(scores / self.head_dim ** 0.5, dim=-1)
        weights = self.dropout(weights)

        # (batch, num_heads, seq_len, head_dim) -> (batch, seq_len, num_heads, head_dim)
        per_head = (weights @ v).transpose(1, 2)
        # Merge the heads: d_out = num_heads * head_dim.
        merged = per_head.reshape(batch, seq_len, self.d_out)

        # Final linear projection.
        return self.out_proj(merged)


class GELU(nn.Module):
    """GELU activation computed with the tanh approximation."""

    def forward(self, x):
        """Return ``0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))``."""
        coeff = torch.sqrt(torch.tensor(2.0 / torch.pi))
        cubic = x + 0.044715 * torch.pow(x, 3)
        gate = 1 + torch.tanh(coeff * cubic)
        return 0.5 * x * gate


class FeedForward(nn.Module):
    """Position-wise feed-forward network: Linear -> GELU -> Linear.

    The hidden layer is four times as wide as ``cfg["emb_dim"]``; the output
    has the same width as the input.
    """

    def __init__(self, cfg):
        super().__init__()
        width = cfg["emb_dim"]
        hidden = 4 * width
        expand = nn.Linear(width, hidden)
        activation = GELU()
        project = nn.Linear(hidden, width)
        self.layers = nn.Sequential(expand, activation, project)

    def forward(self, x):
        """Run ``x`` through the three layers in sequence."""
        return self.layers(x)


class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with learnable scale and shift.

    Args:
        emb_dim: Size of the last dimension of the inputs.
    """

    def __init__(self, emb_dim):
        super().__init__()
        # Small constant added to the variance before the square root.
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        """Normalize ``x`` along its last dimension, then scale and shift it."""
        centered = x - x.mean(dim=-1, keepdim=True)
        # Biased variance (divides by N rather than N - 1).
        variance = x.var(dim=-1, keepdim=True, unbiased=False)
        normalized = centered / torch.sqrt(variance + self.eps)
        return self.shift + self.scale * normalized


class TransformerBlock(nn.Module):
    """Transformer block with Pre-LayerNorm and shortcut connections.

    Layer normalization is applied before each of the two components
    (attention and feed-forward) and dropout after them, to regularize the
    model and prevent overfitting. Each component is followed by a shortcut
    connection that adds the component's input to its output, which helps
    gradients flow through the network during training.

    Args:
        cfg: Dict with "emb_dim", "context_length", "n_heads", "drop_rate"
            and "qkv_bias".
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        drop_rate = cfg["drop_rate"]
        self.att = MultiHeadAttention(
            d_in=emb_dim,
            d_out=emb_dim,
            context_length=cfg["context_length"],
            num_heads=cfg["n_heads"],
            dropout=drop_rate,
            qkv_bias=cfg["qkv_bias"],
        )
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(emb_dim)
        self.norm2 = LayerNorm(emb_dim)
        self.drop_shortcut = nn.Dropout(drop_rate)

    def forward(self, x):
        """Apply the attention and feed-forward sub-blocks to ``x``."""
        # Attention sub-block: norm -> attention -> dropout -> add input back.
        attn_out = self.drop_shortcut(self.att(self.norm1(x)))
        x = attn_out + x

        # Feed-forward sub-block: norm -> feed-forward -> dropout -> add input back.
        ff_out = self.drop_shortcut(self.ff(self.norm2(x)))
        return ff_out + x

class GPTModel(nn.Module):
    """GPT-style model: embeddings, a stack of transformer blocks, and a linear output head.

    Args:
        cfg: Dict with "vocab_size", "emb_dim", "context_length", "drop_rate"
            and "n_layers", plus whatever ``TransformerBlock`` reads from it.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        vocab_size = cfg["vocab_size"]

        # Token and position embeddings turn token indices into dense vectors
        # and add position information.
        self.tok_emb = nn.Embedding(vocab_size, emb_dim)
        self.pos_emb = nn.Embedding(cfg["context_length"], emb_dim)
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        blocks = (TransformerBlock(cfg) for _ in range(cfg["n_layers"]))
        self.trf_blocks = nn.Sequential(*blocks)

        # Normalizes the output of the transformer blocks.
        self.final_norm = LayerNorm(emb_dim)
        # Bias-free linear head projecting onto the vocabulary, producing one
        # logit per vocabulary entry.
        self.out_head = nn.Linear(emb_dim, vocab_size, bias=False)

    def forward(self, in_idx):
        """Compute next-token logits.

        Args:
            in_idx: Tensor of token indices, shape (batch, seq_len).

        Returns:
            Logits of shape (batch, seq_len, vocab_size); unnormalized scores
            for the next token at every position.
        """
        _, seq_len = in_idx.shape
        positions = torch.arange(seq_len, device=in_idx.device)

        hidden = self.tok_emb(in_idx) + self.pos_emb(positions)
        hidden = self.drop_emb(hidden)
        hidden = self.trf_blocks(hidden)
        hidden = self.final_norm(hidden)
        return self.out_head(hidden)
        
def generate_text_simple(model, idx,
            max_new_tokens, context_size):
    """Greedily append ``max_new_tokens`` tokens to ``idx``.

    Args:
        model: Callable mapping token ids (batch, n_tokens) to logits
            (batch, n_tokens, vocab_size).
        idx: Index tensor of the current text, shape (batch, n_tokens).
        max_new_tokens: Number of tokens to append.
        context_size: Maximum number of trailing tokens fed to the model.

    Returns:
        Tensor of shape (batch, n_tokens + max_new_tokens).
    """
    generated = idx
    for _ in range(max_new_tokens):
        # Crop to the supported context: if the model only supports 5 tokens
        # but the text has 10, only the last 5 are used as input.
        window = generated[:, -context_size:]
        with torch.no_grad():
            all_logits = model(window)
        # Only the last position matters: (batch, n_tokens, vocab_size)
        # -> (batch, vocab_size).
        last_logits = all_logits[:, -1, :]
        probs = torch.softmax(last_logits, dim=-1)           # (batch, vocab_size)
        next_ids = probs.argmax(dim=-1, keepdim=True)        # (batch, 1)
        # Append the chosen index; the sequence grows by one column.
        generated = torch.cat((generated, next_ids), dim=1)
    return generated


def generate(model, idx, max_new_tokens, context_size,
        temperature=0.0, top_k=None, eos_id=None):
    """Autoregressively extend ``idx`` with optional top-k filtering and temperature sampling.

    Args:
        model: Callable mapping token ids of shape (batch, n_tokens) to logits
            of shape (batch, n_tokens, vocab_size).
        idx: Tensor of token ids, shape (batch, n_tokens).
        max_new_tokens: Upper bound on the number of tokens to append.
        context_size: Only the last ``context_size`` tokens are fed to the model.
        temperature: Values > 0 enable multinomial sampling from
            ``softmax(logits / temperature)``; 0 (the default) means greedy argmax.
        top_k: If given, only the ``top_k`` largest logits of each row remain
            eligible; values larger than the vocabulary are clamped.
        eos_id: If given, generation stops early once every row's newly chosen
            token equals it; that token is not appended.

    Returns:
        ``idx`` with up to ``max_new_tokens`` additional columns.
    """
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
        logits = logits[:, -1, :]  # keep only the last time step

        # Top-k filtering of the logits.
        if top_k is not None:
            k = min(top_k, logits.size(-1))
            top_logits, _ = torch.topk(logits, k)
            # Keep the threshold as a (batch, 1) column so it is compared row by
            # row; a 1-D (batch,) threshold only broadcasts for batch size 1.
            min_val = top_logits[:, -1:]
            logits = logits.masked_fill(logits < min_val, float('-inf'))

        # Temperature scaling followed by sampling.
        if temperature > 0.0:
            logits = logits / temperature

            # Numerical-stability tip (not in the book) to get equivalent
            # results on the mps device: subtract the row-wise max before softmax.
            logits = logits - logits.max(dim=-1, keepdim=True).values

            probas = torch.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probas, num_samples=1)
        else:
            # Temperature scaling disabled: greedy decoding picks the next token.
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)

        # Stop early on the end-of-sequence token. Reducing with .all() keeps
        # the check well defined when the batch holds more than one row.
        if eos_id is not None and bool((idx_next == eos_id).all()):
            break
        idx = torch.cat((idx, idx_next), dim=1)
    return idx

def text_to_token_ids(text, tokenizer):
    """Encode ``text`` and return the token ids as a tensor with a leading batch dimension.

    ``<|endoftext|>`` is allowed as a special token during encoding.
    """
    token_list = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    # unsqueeze adds the batch dimension in front.
    return torch.unsqueeze(torch.tensor(token_list), 0)

def token_ids_to_text(token_ids, tokenizer):
    """Decode a tensor of token ids back to text after removing the batch dimension."""
    id_list = token_ids.squeeze(0).tolist()
    return tokenizer.decode(id_list)


class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token chunks.

    The text is tokenized once; every window of ``max_length`` tokens becomes
    an input, and the same window shifted right by one token is its target.
    Windows start every ``stride`` tokens.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
        # The last start leaves room for the one-token shift of the target.
        window_starts = range(0, len(token_ids) - max_length, stride)
        self.input_ids = [
            torch.tensor(token_ids[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Number of windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tensor pair at position ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]


def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    """Build a ``DataLoader`` over ``GPTDatasetV1`` windows of ``txt``.

    The text is tokenized with the tiktoken "gpt2" encoding.

    Args:
        txt: Raw text to tokenize.
        batch_size, shuffle, drop_last, num_workers: Forwarded to ``DataLoader``.
        max_length, stride: Window length and step forwarded to ``GPTDatasetV1``.

    Returns:
        The configured ``DataLoader``.
    """
    gpt2_tokenizer = tiktoken.get_encoding("gpt2")
    windows = GPTDatasetV1(txt, gpt2_tokenizer, max_length, stride)
    return DataLoader(
        windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

# 交叉熵损失
def calc_loss_batch(input_batch, target_batch, model, device):
    """Return the cross-entropy loss of ``model`` on one batch.

    Both batches are moved to ``device``. The logits are flattened over their
    first two dimensions and the targets completely, so every position
    contributes one classification term.
    """
    inputs = input_batch.to(device)
    targets = target_batch.to(device)
    logits = model(inputs)
    flat_logits = logits.flatten(0, 1)
    flat_targets = targets.flatten()
    return torch.nn.functional.cross_entropy(flat_logits, flat_targets)

def calc_loss_loader(data_loader, model, device, num_batches=None):
    """Average ``calc_loss_batch`` over the first ``num_batches`` batches of a loader.

    Args:
        data_loader: Iterable of ``(input_batch, target_batch)`` pairs with a length.
        model: Model passed through to ``calc_loss_batch``.
        device: Device passed through to ``calc_loss_batch``.
        num_batches: Number of batches to use; ``None`` means all, and larger
            values are capped at ``len(data_loader)``.

    Returns:
        Mean loss as a float, or NaN when the loader is empty.
    """
    n_available = len(data_loader)
    if n_available == 0:
        return float("nan")
    if num_batches is None:
        num_batches = n_available
    else:
        num_batches = min(num_batches, n_available)

    total_loss = 0
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i >= num_batches:
            break
        batch_loss = calc_loss_batch(input_batch, target_batch, model, device)
        total_loss += batch_loss.item()
    return total_loss / num_batches

def train_model_simple(model, train_loader, val_loader,
        optimizer, device, num_epochs,
        eval_freq, eval_iter, start_context, tokenizer):
    """Train ``model`` for ``num_epochs`` epochs with periodic evaluation.

    Every ``eval_freq`` optimizer steps the train/validation losses are
    evaluated on ``eval_iter`` batches, recorded and printed. After each epoch
    a text sample continuing ``start_context`` is generated and printed.

    Returns:
        Tuple ``(train_losses, val_losses, track_tokens_seen)`` with one entry
        per evaluation.
    """
    train_losses, val_losses, track_tokens_seen = [], [], []
    tokens_seen = 0
    global_step = -1

    for epoch in range(num_epochs):
        model.train()  # training mode
        for input_batch, target_batch in train_loader:
            # 1. Reset the gradients left over from the previous batch.
            optimizer.zero_grad()
            # 2. Loss of the current batch.
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            # 3. Backpropagate to obtain the loss gradients.
            loss.backward()
            # 4. Update the weights with those gradients.
            optimizer.step()

            tokens_seen += input_batch.numel()
            global_step += 1

            # Optional evaluation step to track training progress.
            if global_step % eval_freq != 0:
                continue
            train_loss, val_loss = evaluate_model(
                model, train_loader, val_loader, device, eval_iter)
            train_losses.append(train_loss)
            val_losses.append(val_loss)
            track_tokens_seen.append(tokens_seen)
            print(f"Ep {epoch+1} (Step {global_step:06d}): "
                  f"Train loss {train_loss:.3f}, "
                  f"Val loss {val_loss:.3f}"
            )

        # Optional: print a generated sample after every epoch.
        generate_and_print_sample(model, tokenizer, device, start_context)

    return train_losses, val_losses, track_tokens_seen


def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    """Return ``(train_loss, val_loss)`` computed on ``eval_iter`` batches each.

    The model is switched to evaluation mode (dropout disabled, for stable and
    reproducible results) and gradient tracking is turned off while the losses
    are computed; afterwards the model is put back into training mode.
    """
    model.eval()
    with torch.no_grad():  # no gradients needed here; saves computation
        losses = [
            calc_loss_loader(loader, model, device, num_batches=eval_iter)
            for loader in (train_loader, val_loader)
        ]
    model.train()  # back to training mode, dropout enabled again
    train_loss, val_loss = losses
    return train_loss, val_loss

def generate_and_print_sample(model, tokenizer, device, start_context):
    """Greedily generate 50 tokens after ``start_context`` and print them on one line.

    The model is put into evaluation mode for the generation (turning off
    random components such as dropout) and back into training mode afterwards.
    The context size is the number of rows of the position-embedding weight.
    """
    model.eval()
    context_size = model.pos_emb.weight.size(0)
    prompt_ids = text_to_token_ids(start_context, tokenizer).to(device)
    with torch.no_grad():
        generated = generate_text_simple(
            model=model,
            idx=prompt_ids,
            max_new_tokens=50,
            context_size=context_size,
        )
    sample = token_ids_to_text(generated, tokenizer)
    # Compact print format: newlines become spaces.
    print(sample.replace("\n", " "))
    model.train()


import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses):
    """Plot training and validation loss and save the figure as "pretrain".

    The bottom x-axis shows epochs; a second x-axis on top shows tokens seen.

    Args:
        epochs_seen: x values for the bottom axis.
        tokens_seen: x values for the top axis.
        train_losses: Training losses, one per x value.
        val_losses: Validation losses, one per x value.
    """
    figure, loss_axis = plt.subplots(figsize=(5, 3))
    loss_axis.plot(epochs_seen, train_losses, label="Training loss")
    loss_axis.plot(epochs_seen, val_losses, linestyle="-.", label="Validation loss")
    loss_axis.set_xlabel("Epochs")
    loss_axis.set_ylabel("Loss")
    # Integer tick labels only on the epoch axis.
    loss_axis.xaxis.set_major_locator(MaxNLocator(integer=True))
    loss_axis.legend(loc="upper right")

    # Second x-axis that shares the same y-axis.
    token_axis = loss_axis.twiny()
    # Invisible plot, drawn only to align the tick marks.
    token_axis.plot(tokens_seen, train_losses, alpha=0)
    token_axis.set_xlabel("Tokens seen")

    figure.tight_layout()
    plt.savefig("pretrain")


def assign(left, right):
    """Return ``right`` as a trainable parameter after checking it matches ``left``'s shape.

    Args:
        left: Existing parameter; only its ``shape`` is read.
        right: Array-like with the new values.

    Returns:
        ``torch.nn.Parameter`` built from ``right``.

    Raises:
        ValueError: If the two shapes differ.
    """
    if left.shape != right.shape:
        # Both halves must be f-strings, otherwise "{right.shape}" is emitted
        # literally instead of the actual shape.
        raise ValueError(f"Shape mismatch. Left: {left.shape}, "
                         f"Right: {right.shape}"
        )
    # Return a trainable PyTorch parameter.
    return torch.nn.Parameter(torch.tensor(right))

def load_weights_into_gpt(gpt, params):
    """Copy the weights in ``params`` into the matching parameters of ``gpt``.

    Every assignment goes through ``assign``, which checks that the shapes
    match. Linear-layer weight matrices from ``params`` are transposed before
    being assigned.

    Args:
        gpt: Model exposing ``pos_emb``, ``tok_emb``, ``trf_blocks``,
            ``final_norm`` and ``out_head``.
        params: Nested dict with "wpe", "wte", "g", "b" and a "blocks" list
            whose entries hold "attn", "mlp", "ln_1" and "ln_2".
    """
    # Position and token embedding weights.
    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params['wpe'])
    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params['wte'])

    # One entry of params["blocks"] per transformer block.
    for b, block_params in enumerate(params["blocks"]):
        block = gpt.trf_blocks[b]
        att = block.att
        attn_params = block_params["attn"]

        # np.split divides the combined attention weights into three equal
        # parts: query, key and value.
        q_w, k_w, v_w = np.split(attn_params["c_attn"]["w"], 3, axis=-1)
        att.W_query.weight = assign(att.W_query.weight, q_w.T)
        att.W_key.weight = assign(att.W_key.weight, k_w.T)
        att.W_value.weight = assign(att.W_value.weight, v_w.T)

        # Same three-way split for the combined bias.
        q_b, k_b, v_b = np.split(attn_params["c_attn"]["b"], 3, axis=-1)
        att.W_query.bias = assign(att.W_query.bias, q_b)
        att.W_key.bias = assign(att.W_key.bias, k_b)
        att.W_value.bias = assign(att.W_value.bias, v_b)

        # Attention output projection.
        att.out_proj.weight = assign(
            att.out_proj.weight, attn_params["c_proj"]["w"].T)
        att.out_proj.bias = assign(
            att.out_proj.bias, attn_params["c_proj"]["b"])

        # Feed-forward layers: index 0 and 2 of the Sequential are the two
        # Linear layers (index 1 is the GELU).
        mlp_params = block_params["mlp"]
        fc_in, fc_out = block.ff.layers[0], block.ff.layers[2]
        fc_in.weight = assign(fc_in.weight, mlp_params["c_fc"]["w"].T)
        fc_in.bias = assign(fc_in.bias, mlp_params["c_fc"]["b"])
        fc_out.weight = assign(fc_out.weight, mlp_params["c_proj"]["w"].T)
        fc_out.bias = assign(fc_out.bias, mlp_params["c_proj"]["b"])

        # Layer norms: "g" is the scale, "b" the shift.
        block.norm1.scale = assign(block.norm1.scale, block_params["ln_1"]["g"])
        block.norm1.shift = assign(block.norm1.shift, block_params["ln_1"]["b"])
        block.norm2.scale = assign(block.norm2.scale, block_params["ln_2"]["g"])
        block.norm2.shift = assign(block.norm2.shift, block_params["ln_2"]["b"])

    # Final layer norm.
    gpt.final_norm.scale = assign(gpt.final_norm.scale, params["g"])
    gpt.final_norm.shift = assign(gpt.final_norm.shift, params["b"])
    # The output head reuses the token-embedding matrix.
    gpt.out_head.weight = assign(gpt.out_head.weight, params["wte"])


# Base configuration used for the model trained from scratch.
GPT_CONFIG_124M = {
    "vocab_size": 50257,
    "context_length": 256,
    "emb_dim": 768,
    "n_heads": 12,
    "n_layers": 12,
    "drop_rate": 0.1,
    "qkv_bias": False
}

# Size-specific settings for each GPT-2 variant.
model_configs = {
        "gpt2-small-124M": {"emb_dim":768, "n_layers":12, "n_heads":12},
        "gpt2-medium-355M": {"emb_dim":1024, "n_layers":24, "n_heads":16},
        "gpt2-large-774M": {"emb_dim":1280, "n_layers":36, "n_heads":20},
        "gpt2-xl-1558M": {"emb_dim":1600, "n_layers":48, "n_heads":25},
}
model_name = "gpt2-small-124M"
# Start from the base config, then override the size-specific fields.
NEW_CONFIG = GPT_CONFIG_124M.copy()
NEW_CONFIG.update(model_configs[model_name])
# Context length 1024 and q/k/v biases are presumably needed to match the
# pretrained checkpoint: load_weights_into_gpt checks shapes and assigns
# q/k/v biases.
NEW_CONFIG.update({"context_length": 1024})
NEW_CONFIG.update({"qkv_bias": True})

# from gpt_download import download_and_load_gpt2
# Load the hyperparameters and weights from the "gpt2/124M" directory.
settings, params = download_and_load_gpt2(
    model_size="124M", models_dir="gpt2"
)

# Build the model with the adjusted config and copy the loaded weights in.
gpt = GPTModel(NEW_CONFIG)
load_weights_into_gpt(gpt, params)
# Use the GPU when available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
gpt.to(device)
gpt.eval()

tokenizer = tiktoken.get_encoding("gpt2")

# Fixed seed so the sampled continuation is repeatable.
torch.manual_seed(123)
token_ids = generate(
        model=gpt,
        idx=text_to_token_ids("Every effort moves you", tokenizer).to(device),
        max_new_tokens=25,
        context_size=NEW_CONFIG["context_length"],
        top_k=50,
        temperature=1.0
)
print("Output:\n", token_ids_to_text(token_ids, tokenizer))

"""
Output:
 Every effort moves you as far as the eye can see. (That's because you're only going to see it at those locations where you know
"""

Chapter06-P172 只对最后一个输出词元特别感兴趣

  • 注意力机制,建立了每个输入词元与其他输入词元之间的关系
  • 因果注意力掩码,限制了一个词元的关注范围,只能关注当前及之前的位置,从而确保每个词元只受自己和之前词元的影响。
  • 序列中的最后一个词元,累积了最多的信息,因为它是唯一一个可以访问之前所有数据的词元,是唯一一个计算前面所有词元注意力分数的词元。

章节6.5 修改模型以进行分类的微调,添加分类头

模型设置

  • 冻结模型:将所有层设为不可训练
    • for param in model.parameters():
      param.requires_grad = False

  • 添加分类层

    • model.out_head = torch.nn.Linear(in_features=BASE_CONFIG["emb_dim"], 
      out_features=num_classes)
      
  • 将输出层、最后一个Transformer块和连接该块到输出层的最终归一化层,设置为可训练
    • for param in model.trf_blocks[-1].parameters():
      param.requires_grad = True
    • for param in model.final_norm.parameters():
      param.requires_grad = True
      

# Name of the GPT-2 variant to load; it selects the entry in model_configs
# and, via its last "-"-separated part, the checkpoint size.
CHOOSE_MODEL = "gpt2-small-124M"
INPUT_PROMPT = "Every effort moves"
# Settings shared by all model sizes; dropout is disabled (0.0).
BASE_CONFIG = {
        "vocab_size": 50257,
        "context_length": 1024,
        "drop_rate": 0.0,
        "qkv_bias": True
}
# Size-specific settings for each GPT-2 variant.
model_configs = {
        "gpt2-small-124M": {"emb_dim":768, "n_layers":12, "n_heads":12},
        "gpt2-medium-355M": {"emb_dim":1024, "n_layers":24, "n_heads":16},
        "gpt2-large-774M": {"emb_dim":1280, "n_layers":36, "n_heads":20},
        "gpt2-xl-1558M": {"emb_dim":1600, "n_layers":48, "n_heads":25},
}
BASE_CONFIG.update(model_configs[CHOOSE_MODEL])

# "gpt2-small-124M" -> "124M"
model_size = CHOOSE_MODEL.split("-")[-1]
settings, params = download_and_load_gpt2(
        model_size=model_size, models_dir="gpt2"
)
# Build the model and copy the loaded weights into it.
model = GPTModel(BASE_CONFIG)
load_weights_into_gpt(model, params)


# To prepare the model for classification fine-tuning, first freeze it,
# i.e. make all layers non-trainable.
for param in model.parameters():
    param.requires_grad = False

torch.manual_seed(123)
num_classes = 2
# The new model.out_head output layer has requires_grad set to True by default,
# so at this point it is the only layer that would be updated during training.
model.out_head = torch.nn.Linear(
        in_features=BASE_CONFIG["emb_dim"],
        out_features=num_classes
)
# Experiments showed that fine-tuning additional layers noticeably improves
# predictive performance, so the last Transformer block and the final layer
# norm that connects it to the output layer are made trainable as well.
for param in model.trf_blocks[-1].parameters():
    param.requires_grad = True
for param in model.final_norm.parameters():
    param.requires_grad = True

# Smoke test of the new classification head on a single example.
# The inputs below are moved to `device`, so the model (including the freshly
# created out_head) has to live on the same device; without this the forward
# pass fails whenever `device` is not the CPU.
model.to(device)

inputs = tokenizer.encode("Do you have time")
inputs = torch.tensor(inputs).unsqueeze(0).to(device)  # add the batch dimension
print("Inputs:", inputs.shape, inputs)
with torch.no_grad():
    outputs = model(inputs)
print("Outputs:", outputs, outputs.shape)
# Only the output at the last token position is used for classification.
print("Last output token:", outputs[:,-1:])

probas = torch.softmax(outputs[:,-1,:], dim=-1)
label = torch.argmax(probas)
print("Class label:", label.item())

章节6.7 在有监督数据上微调模型

模型微调

  1. 重置损失梯度 optimizer.zero_grad()
  2. 计算损失 torch.nn.functional.cross_entropy(logits, target_batch)
  3. 反向传播计算损失梯度 loss.backward()
  4. 使用损失梯度更新模型权重 optimizer.step()

    import torch
    from torch.utils.data import Dataset, DataLoader
    import tiktoken
    import pandas as pd
    
    from gpt_download import download_and_load_gpt2
    from chapter05 import GPTModel, load_weights_into_gpt, text_to_token_ids, token_ids_to_text, generate_text_simple
    
    class SpamDataset(Dataset):
        """Dataset of tokenized texts and labels read from a CSV file.

        The CSV must provide a "Text" and a "Label" column. Every text is
        tokenized, optionally truncated to ``max_length`` and then padded with
        ``pad_token_id`` so that all sequences have length ``self.max_length``.

        Args:
            csv_file: Path passed to ``pandas.read_csv``.
            tokenizer: Object with an ``encode(text)`` method returning a list of ids.
            max_length: Target sequence length; ``None`` uses the longest
                encoded text in this file.
            pad_token_id: Token id used for padding.
        """

        def __init__(self, csv_file, tokenizer, max_length=None,
                pad_token_id=50256):
            self.data = pd.read_csv(csv_file)

            # Tokenize every text.
            self.encoded_texts = [
                tokenizer.encode(text) for text in self.data["Text"]
            ]

            if max_length is None:
                self.max_length = self._longest_encoded_length()
            else:
                self.max_length = max_length
                # Truncate sequences that are longer than max_length.
                self.encoded_texts = [
                    tokens[:self.max_length] for tokens in self.encoded_texts
                ]

            # Pad every sequence up to self.max_length.
            padded = []
            for tokens in self.encoded_texts:
                n_missing = self.max_length - len(tokens)
                padded.append(tokens + [pad_token_id] * n_missing)
            self.encoded_texts = padded

        def __getitem__(self, index):
            """Return ``(token_ids, label)`` as two ``torch.long`` tensors."""
            token_ids = self.encoded_texts[index]
            label = self.data.iloc[index]["Label"]
            return (
                torch.tensor(token_ids, dtype=torch.long),
                torch.tensor(label, dtype=torch.long),
            )

        def __len__(self):
            """Number of rows in the CSV file."""
            return len(self.data)

        def _longest_encoded_length(self):
            """Length of the longest encoded text (0 when there are none)."""
            return max((len(tokens) for tokens in self.encoded_texts), default=0)
    
    def calc_accuracy_loader(data_loader, model, device, num_batches=None):
        """Return the classification accuracy of ``model`` over a data loader.

        The predicted class is the argmax of the logits at the last token
        position. The model is switched to evaluation mode and left in it.

        Args:
            data_loader: Iterable of ``(input_batch, target_batch)`` pairs with a length.
            model: Model returning logits of shape (batch, n_tokens, n_classes).
            device: Device the batches are moved to.
            num_batches: Number of batches to use; ``None`` means all, and
                larger values are capped at ``len(data_loader)``.

        Returns:
            Fraction of correct predictions, or NaN when no example was
            processed (empty loader or ``num_batches == 0``), mirroring what
            ``calc_loss_loader`` returns for an empty loader.
        """
        model.eval()
        correct_predictions, num_examples = 0, 0

        if num_batches is None:
            num_batches = len(data_loader)
        else:
            num_batches = min(num_batches, len(data_loader))

        for i, (input_batch, target_batch) in enumerate(data_loader):
            if i >= num_batches:
                break
            input_batch = input_batch.to(device)
            target_batch = target_batch.to(device)

            with torch.no_grad():
                # Logits of the last output token only.
                logits = model(input_batch)[:, -1, :]
            predicted_labels = torch.argmax(logits, dim=-1)
            num_examples += predicted_labels.shape[0]
            correct_predictions += (
                (predicted_labels == target_batch).sum().item()
            )

        # Avoid a ZeroDivisionError when nothing was evaluated.
        if num_examples == 0:
            return float("nan")
        return correct_predictions / num_examples
    
    def calc_loss_batch(input_batch, target_batch, model, device):
        """Return the cross-entropy loss of one batch for classification.

        Both batches are moved to ``device``; only the logits at the last token
        position are compared with the targets.
        """
        inputs = input_batch.to(device)
        targets = target_batch.to(device)
        # Logits of the last output token only.
        last_logits = model(inputs)[:, -1, :]
        return torch.nn.functional.cross_entropy(last_logits, targets)
    
    def calc_loss_loader(data_loader, model, device, num_batches=None):
        """Average ``calc_loss_batch`` over the first ``num_batches`` batches of a loader.

        Args:
            data_loader: Iterable of ``(input_batch, target_batch)`` pairs with a length.
            model: Model passed through to ``calc_loss_batch``.
            device: Device passed through to ``calc_loss_batch``.
            num_batches: Number of batches to use; ``None`` means all, and
                larger values are capped at ``len(data_loader)``.

        Returns:
            Mean loss as a float, or NaN when the loader is empty.
        """
        n_available = len(data_loader)
        if n_available == 0:
            return float("nan")
        if num_batches is None:
            num_batches = n_available
        else:
            num_batches = min(num_batches, n_available)

        total_loss = 0.
        for i, (input_batch, target_batch) in enumerate(data_loader):
            if i >= num_batches:
                break
            batch_loss = calc_loss_batch(input_batch, target_batch, model, device)
            total_loss += batch_loss.item()
        return total_loss / num_batches
    
    def train_classifier_simple(
            model, train_loader, val_loader, optimizer, device,
            num_epochs, eval_freq, eval_iter):
        """Fine-tune ``model`` as a classifier for ``num_epochs`` epochs.

        Every ``eval_freq`` optimizer steps the train/validation losses are
        evaluated on ``eval_iter`` batches, recorded and printed. After each
        epoch the train/validation accuracies are computed on ``eval_iter``
        batches, printed and recorded.

        Returns:
            Tuple ``(train_losses, val_losses, train_accs, val_accs, examples_seen)``.
        """
        train_losses, val_losses = [], []
        train_accs, val_accs = [], []
        examples_seen = 0
        global_step = -1

        for epoch in range(num_epochs):  # main training loop
            model.train()  # training mode
            for input_batch, target_batch in train_loader:
                optimizer.zero_grad()  # reset gradients of the previous batch
                loss = calc_loss_batch(input_batch, target_batch, model, device)
                loss.backward()  # backpropagate to obtain the loss gradients
                optimizer.step()  # update the weights with those gradients
                examples_seen += input_batch.shape[0]
                global_step += 1

                # Optional evaluation step.
                if global_step % eval_freq != 0:
                    continue
                train_loss, val_loss = evaluate_model(
                    model, train_loader, val_loader, device, eval_iter)
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                print(f"Ep {epoch+1} (Step {global_step:06d}): "
                        f"Train loss {train_loss:.3f}, "
                        f"Val loss {val_loss:.3f}"
                )

            # Accuracy after every epoch.
            train_accuracy = calc_accuracy_loader(
                train_loader, model, device, num_batches=eval_iter)
            val_accuracy = calc_accuracy_loader(
                val_loader, model, device, num_batches=eval_iter)
            print(f"Training accuracy: {train_accuracy*100:.2f}% | ", end="")
            print(f"Validation accuracy: {val_accuracy*100:.2f}%")
            train_accs.append(train_accuracy)
            val_accs.append(val_accuracy)

        return train_losses, val_losses, train_accs, val_accs, examples_seen
    
    def evaluate_model(model, train_loader, val_loader, device, eval_iter):
        """Return ``(train_loss, val_loss)`` measured on up to ``eval_iter`` batches each.

        The model is switched to evaluation mode for the measurement, gradient
        tracking is disabled while the losses are computed, and the model is put
        back into training mode before returning.
        """
        model.eval()
        with torch.no_grad():
            # Training loader first, validation loader second (same order as the result).
            losses = tuple(
                calc_loss_loader(loader, model, device, num_batches=eval_iter)
                for loader in (train_loader, val_loader)
            )
        model.train()
        return losses
    
    
    # tiktoken encoding named "gpt2"; shared by all three dataset splits below.
    tokenizer = tiktoken.get_encoding("gpt2")
    
    # The training split is created with max_length=None; the max_length it ends
    # up with is printed and then passed explicitly to the validation and test
    # splits (presumably so every split is padded to the same length -- confirm
    # against SpamDataset).
    train_dataset = SpamDataset(
            csv_file="train.csv",
            max_length=None,
            tokenizer=tokenizer
    )
    print(train_dataset.max_length)
    val_dataset = SpamDataset(
            csv_file="validation.csv",
            max_length=train_dataset.max_length,
            tokenizer=tokenizer
    )
    test_dataset = SpamDataset(
            csv_file="test.csv",
            max_length=train_dataset.max_length,
            tokenizer=tokenizer
    )
    print(f"train {len(train_dataset)}, validation {len(val_dataset)}, test {len(test_dataset)}")
    
    num_workers = 0
    batch_size = 8
    torch.manual_seed(123)
    # Only the training loader shuffles and drops the last incomplete batch;
    # the validation and test loaders keep every example.
    train_loader = DataLoader(
            dataset=train_dataset,
            batch_size=batch_size,
            shuffle=True,
            num_workers=num_workers,
            drop_last=True,
    )
    val_loader = DataLoader(
            dataset=val_dataset,
            batch_size=batch_size,
            num_workers=num_workers,
            drop_last=False,
    )
    test_loader = DataLoader(
            dataset=test_dataset,
            batch_size=batch_size,
            num_workers=num_workers,
            drop_last=False,
    )
    
    
    # Name of the pretrained checkpoint; must be a key of model_configs below.
    CHOOSE_MODEL = "gpt2-small-124M"
    INPUT_PROMPT = "Every effort moves"  # not referenced in the code that follows
    # Settings shared by every model size.
    BASE_CONFIG = {
            "vocab_size": 50257,
            "context_length": 1024,
            "drop_rate": 0.0,
            "qkv_bias": True
    }
    # Size-specific settings, merged into BASE_CONFIG for the chosen model.
    model_configs = {
            "gpt2-small-124M": {"emb_dim":768, "n_layers":12, "n_heads":12},
            "gpt2-medium-355M": {"emb_dim":1024, "n_layers":24, "n_heads":16},
            "gpt2-large-774M": {"emb_dim":1280, "n_layers":36, "n_heads":20},
            "gpt2-xl-1558M": {"emb_dim":1600, "n_layers":48, "n_heads":25},
    }
    BASE_CONFIG.update(model_configs[CHOOSE_MODEL])
    
    # The text after the last "-" of the model name (here "124M") is the size
    # identifier handed to the download helper.
    model_size = CHOOSE_MODEL.split("-")[-1]
    settings, params = download_and_load_gpt2(
            model_size=model_size, models_dir="gpt2"
    )
    model = GPTModel(BASE_CONFIG)
    load_weights_into_gpt(model, params)
    
    
    # To prepare the model for classification fine-tuning, first freeze it,
    # i.e. mark every layer as non-trainable.
    for param in model.parameters():
        param.requires_grad = False
    
    torch.manual_seed(123)
    num_classes = 2
    # The new model.out_head output layer has requires_grad=True by default,
    # so at this point it is the only layer that would be updated by training.
    model.out_head = torch.nn.Linear(
            in_features=BASE_CONFIG["emb_dim"],
            out_features=num_classes
    )
    # Experiments showed that fine-tuning additional layers noticeably improves
    # predictive performance, so the last Transformer block and the final
    # layer-norm module (which connects that block to the output layer) are
    # made trainable as well.
    for param in model.trf_blocks[-1].parameters():
        param.requires_grad = True
    for param in model.final_norm.parameters():
        param.requires_grad = True
    
    
    # Use the GPU when one is available, otherwise fall back to the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    
    
    import time
    # Time the whole fine-tuning run.
    start_time = time.time()
    torch.manual_seed(123)
    # All parameters are handed to the optimizer, including those whose
    # requires_grad was set to False earlier.
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.1)
    num_epochs = 5
    # Evaluate the losses every 50 steps, using at most 5 batches per loader.
    train_losses, val_losses, train_accs, val_accs, examples_seen = \
            train_classifier_simple(
                    model, train_loader, val_loader, optimizer, device,
                    num_epochs=num_epochs, eval_freq=50, eval_iter=5
            )
    end_time = time.time()
    execution_time_minutes = (end_time -start_time) / 60
    print(f"Training completed in {execution_time_minutes:.2f} minutes.")
    
    import matplotlib.pyplot as plt
    def plot_values(
            epochs_seen, examples_seen, train_values, val_values,
            label="loss"):
        """Plot training vs. validation curves, save the figure and show it.

        The bottom x-axis is labelled in epochs; a second x-axis on top is
        labelled in examples seen.  The figure is written to
        ``finetune-classify-<label>-plot`` before being displayed.
        """
        figure, epoch_axis = plt.subplots(figsize=(5, 3))

        # Training and validation curves against the number of epochs.
        curve_specs = (
            (train_values, {"label": f"Training {label}"}),
            (val_values, {"linestyle": "-.", "label": f"Validation {label}"}),
        )
        for values, line_options in curve_specs:
            epoch_axis.plot(epochs_seen, values, **line_options)
        epoch_axis.set_xlabel("Epochs")
        epoch_axis.set_ylabel(label.capitalize())
        epoch_axis.legend()

        # Second x-axis for the examples seen; the fully transparent curve is
        # drawn only so that this axis gets matching ticks.
        examples_axis = epoch_axis.twiny()
        examples_axis.plot(examples_seen, train_values, alpha=0)
        examples_axis.set_xlabel("Examples seen")

        figure.tight_layout()  # adjust the layout to make room
        plt.savefig(f"finetune-classify-{label}-plot")
        plt.show()
    
    # Plot the loss curves of the classification fine-tuning run.
    # The x values are spread evenly over [0, num_epochs] and [0, examples_seen],
    # one point per recorded loss.
    epochs_tensor = torch.linspace(0, num_epochs, len(train_losses))
    examples_seen_tensor = torch.linspace(0, examples_seen, len(train_losses))
    plot_values(epochs_tensor, examples_seen_tensor, train_losses, val_losses)
    
    # Plot the classification accuracy curves.
    epochs_tensor = torch.linspace(0, num_epochs, len(train_accs))
    examples_seen_tensor = torch.linspace(0, examples_seen, len(train_accs))
    plot_values(
            epochs_tensor, examples_seen_tensor, train_accs, val_accs, 
            label="accuracy"
    )
    
    # Compute the accuracy over the complete training, validation and test sets
    # (no num_batches limit is passed here).
    train_accuracy = calc_accuracy_loader(train_loader, model, device)
    val_accuracy = calc_accuracy_loader(val_loader, model, device)
    test_accuracy = calc_accuracy_loader(test_loader, model, device)
    print(f"Training accuracy: {train_accuracy*100:.2f}%")
    print(f"Validation accuracy: {val_accuracy*100:.2f}%")
    print(f"Test accuracy: {test_accuracy*100:.2f}%")
    
    # Save the fine-tuned model weights.
    torch.save(model.state_dict(), "review_classifier.pth")
    

    损失曲线

    准确率曲线

    输出

    train 1045, validation 149, test 300
    Ep 1 (Step 000000): Train loss 2.154, Val loss 2.393
    Ep 1 (Step 000050): Train loss 0.617, Val loss 0.637
    Ep 1 (Step 000100): Train loss 0.523, Val loss 0.558
    Training accuracy: 70.00% | Validation accuracy: 72.50%
    Ep 2 (Step 000150): Train loss 0.560, Val loss 0.488
    Ep 2 (Step 000200): Train loss 0.419, Val loss 0.396
    Ep 2 (Step 000250): Train loss 0.408, Val loss 0.353
    Training accuracy: 82.50% | Validation accuracy: 85.00%
    Ep 3 (Step 000300): Train loss 0.333, Val loss 0.321
    Ep 3 (Step 000350): Train loss 0.339, Val loss 0.306
    Training accuracy: 90.00% | Validation accuracy: 90.00%
    Ep 4 (Step 000400): Train loss 0.135, Val loss 0.198
    Ep 4 (Step 000450): Train loss 0.152, Val loss 0.131
    Ep 4 (Step 000500): Train loss 0.222, Val loss 0.136
    Training accuracy: 100.00% | Validation accuracy: 97.50%
    Ep 5 (Step 000550): Train loss 0.207, Val loss 0.143
    Ep 5 (Step 000600): Train loss 0.083, Val loss 0.073
    Training accuracy: 100.00% | Validation accuracy: 97.50%
    Training completed in 0.15 minutes.
    
    
    Training accuracy: 97.21%
    Validation accuracy: 97.32%
    Test accuracy: 95.67%

    通常,验证集的准确率会比测试集的准确率稍高,因为模型开发过程中往往会调整超参数以提升在验证集上的性能,这可能导致模型在测试集上并不完全适用。这种情况很常见,

    但可以通过调整模型设置(比如:增加 dropout 率 drop_rate 或优化器配置中的权重衰减参数 weight_decay)来尽量缩小这种差距。

    代码清单6-12 使用微调后的新模型对新的文本进行分类

    • 微调大语言模型有不同的策略:分类微调、指令微调
    • 分类微调:通过添加一个小型分类层来替换大语言模型的输出层
      • 在将文本消息分类为“垃圾消息”或“非垃圾消息”的例子中,新的分类层只有两个输出节点。
      • 之前,我们使用的输出节点数量与词汇表中的唯一词元数量相等(50257 个)。
      • 修改模型
      1. 将所有层设为不可训练(param.requires_grad = False)
      2. 添加分类层,替换大语言模型的输出层
      3. 将最后一个transformer块、最终归一化层、输出层设为可训练(param.requires_grad = True)
      • 微调模型
      1. 重置损失梯度 optimizer.zero_grad()
      2. 计算损失 torch.nn.functional.cross_entropy(logits, target_batch)
      3. 反向传播计算损失梯度 loss.backward()
      4. 使用损失梯度更新模型权重 optimizer.step()

    • 分类模型的评估:包括数据集的损失、计算分类准确率(正确预测的比例或百分比)。
    • 分类模型的微调:使用与大语言模型预训练相同的交叉熵损失函数
      • logits = model(input_batch)[:, -1, :]

      • loss = torch.nn.functional.cross_entropy(logits, target_batch)

    
    def classify_review(
            text, model, tokenizer, device, max_length=None,
            pad_token_id = 50256):
        """Classify ``text`` as ``"spam"`` or ``"not spam"``.

        Args:
            text: The message to classify.
            model: Classifier whose output has shape (batch, tokens, classes);
                the logits of the last token are used.  ``model.pos_emb`` is
                expected to be an ``nn.Embedding`` with one row per supported
                position.
            tokenizer: Object with an ``encode(text)`` method returning token ids.
            device: Device on which the input tensor is created.
            max_length: Length the token sequence is truncated/padded to.  It is
                capped at the model's context length.  With ``None`` the
                sequence is only truncated to the context length and not padded.
            pad_token_id: Token id used for padding.

        Returns:
            ``"spam"`` if the predicted class index is 1, otherwise ``"not spam"``.
        """
        model.eval()

        # Prepare the model input.
        input_ids = tokenizer.encode(text)
        # An embedding weight has shape (num_positions, emb_dim); the number of
        # supported positions is therefore dimension 0 (dimension 1 would be the
        # embedding size).
        supported_context_length = model.pos_emb.weight.shape[0]

        if max_length is None:
            # No padding target given: only enforce the context window.
            input_ids = input_ids[:supported_context_length]
        else:
            # Never truncate or pad beyond what the model can process.
            target_length = min(max_length, supported_context_length)
            input_ids = input_ids[:target_length]  # truncate overly long sequences
            input_ids = input_ids + [pad_token_id] * (
                target_length - len(input_ids)
            )

        input_tensor = torch.tensor(
                input_ids, device=device
        ).unsqueeze(0)  # add the batch dimension
        with torch.no_grad():  # no gradients are needed for inference
            logits = model(input_tensor)[:, -1, :]  # logits of the last output token
        predicted_label = torch.argmax(logits, dim=-1).item()
        return "spam" if predicted_label == 1 else "not spam"
    
    
    
    # Rebuild the same architecture that was fine-tuned, so the saved state dict
    # can be loaded into it afterwards.
    CHOOSE_MODEL = "gpt2-small-124M"
    BASE_CONFIG = {
            "vocab_size": 50257,
            "context_length": 1024,
            "drop_rate": 0.0,
            "qkv_bias": True
    }
    model_configs = {
            "gpt2-small-124M": {"emb_dim":768, "n_layers":12, "n_heads":12},
            "gpt2-medium-355M": {"emb_dim":1024, "n_layers":24, "n_heads":16},
            "gpt2-large-774M": {"emb_dim":1280, "n_layers":36, "n_heads":20},
            "gpt2-xl-1558M": {"emb_dim":1600, "n_layers":48, "n_heads":25},
    }
    BASE_CONFIG.update(model_configs[CHOOSE_MODEL])
    
    model_size = CHOOSE_MODEL.split("-")[-1]
    # NOTE(review): params is downloaded here but never loaded into the model in
    # this snippet; the weights come from the saved state dict loaded further down.
    settings, params = download_and_load_gpt2(
            model_size=model_size, models_dir="gpt2"
    )
    model = GPTModel(BASE_CONFIG)
    
    torch.manual_seed(123)
    num_classes = 2
    # Replace the output head with a linear layer that has num_classes outputs,
    # matching the head that was attached before fine-tuning.
    model.out_head = torch.nn.Linear(
            in_features=BASE_CONFIG["emb_dim"],
            out_features=num_classes
    )
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    
    
    # After fine-tuning, the model was saved with:
    # torch.save(model.state_dict(), "review_classifier.pth")
    
    # Load the fine-tuned weights into the freshly built model.
    model_state_dict = torch.load("review_classifier.pth", map_location=device)
    model.load_state_dict(model_state_dict)
    
    # Compute the accuracy over the complete training, validation and test sets.
    train_accuracy = calc_accuracy_loader(train_loader, model, device)
    val_accuracy = calc_accuracy_loader(val_loader, model, device)
    test_accuracy = calc_accuracy_loader(test_loader, model, device)
    print(f"Training accuracy: {train_accuracy*100:.2f}%")
    print(f"Validation accuracy: {val_accuracy*100:.2f}%")
    print(f"Test accuracy: {test_accuracy*100:.2f}%")
    
    # Classify two hand-written messages, using the training set's max_length
    # as the sequence length.
    text_1 = (
            "You are a winner you have been specially"
            " selected to receive $1000 cash or a $2000 award."
    )
    print(classify_review(text_1, model, tokenizer, device, max_length=train_dataset.max_length))
    text_2 = (
        "Hey, just wanted to check if we're still on"
        " for dinner tonight? Let me know!"
    )
    print(classify_review(text_2, model, tokenizer, device, max_length=train_dataset.max_length))
    
    

    输出

    Training accuracy: 97.21%
    Validation accuracy: 97.32%
    Test accuracy: 95.67%
    spam
    not spam

    章节7.1 指令微调

    微调大语言模型的两种主要方式:

    1. 分类微调(classification finetuning):用于文本分类的微调
    2. 指令微调(instruction finetuning):微调大语言模型以遵循人类指令

    指令微调的步骤:

    1. 准备数据集
    2. 模型配置和微调
    3. 评估大语言模型

    import torch
    from torch.utils.data import Dataset, DataLoader
    import tiktoken
    from functools import partial
    
    import time
    import json
    import re
    from tqdm import tqdm
    
    from gpt_download import download_and_load_gpt2
    from previous_chapters_for_ch07 import GPTModel, load_weights_into_gpt, calc_loss_loader, train_model_simple, plot_losses, generate, text_to_token_ids, token_ids_to_text
    
    """
    示例输入entry:
    {
        'instruction': 'Identify the correct spelling of the following word.', 
        'input': 'Ocassion', 
        'output': "The correct spelling is 'Occasion.'"
    }
    示例返回:
    Below is an instruction that describes a task. Write a response that appropriately completes the request.
    
    ### Instruction:
    Identify the correct spelling of the following word.
    
    ### Input:
    Ocassion
    
    ### Response:
    The correct spelling is 'Occasion.'
    """
    # Build formatted data from the prompt template:
    # render an entry in the instruction/response prompt format.
    def format_input(entry):
        """Return the prompt text for ``entry`` (without the response part).

        ``entry`` must provide the keys ``"instruction"`` and ``"input"``.  The
        ``### Input:`` section is emitted only when ``entry["input"]`` is truthy.
        """
        prompt_parts = [
            "Below is an instruction that describes a task. "
            "Write a response that appropriately completes the request.",
            f"\n\n### Instruction:\n{entry['instruction']}",
        ]
        if entry["input"]:
            prompt_parts.append(f"\n\n### Input:\n{entry['input']}")
        return "".join(prompt_parts)
    
    class InstructionDataset(Dataset):
        """Dataset of pre-tokenized instruction/response texts.

        Each entry is rendered with ``format_input`` and extended by a
        ``### Response:`` section containing ``entry['output']``; the full text
        is tokenized once, at construction time.
        """

        def __init__(self, data, tokenizer):
            self.data = data
            # One list of token ids per entry, in the same order as ``data``.
            self.encoded_texts = [
                tokenizer.encode(
                    format_input(record)
                    + f"\n\n### Response:\n{record['output']}"
                )
                for record in data
            ]

        def __getitem__(self, index):
            """Return the token ids of the entry at ``index``."""
            return self.encoded_texts[index]

        def __len__(self):
            """Return the number of entries."""
            return len(self.data)
    
    # 获取 <|endoftext|> 的 token id
    # 使用该 token id作为填充词元,将所有输入填充到相似的长度
    # 输出: [50256]
    """
    tokenizer = tiktoken.get_encoding("gpt2")
    print(tokenizer.encode("<|endoftext|>", allowed_special={"<|endoftext|>"}))
    """
    
    def custom_collate_fn(
            batch,
            pad_token_id=50256,
            ignore_index=-100,
            allowed_max_length=None,
            device="cpu"
    ):
        """Collate token-id sequences into padded input/target tensors.

        Every sequence gets one ``pad_token_id`` appended and is then padded to
        the longest sequence in the batch (plus one).  Inputs are the padded
        sequence without its last token; targets are the padded sequence
        shifted left by one position.  In the targets, every occurrence of
        ``pad_token_id`` except the first is replaced by ``ignore_index``.

        Args:
            batch: Iterable of token-id sequences (they are not modified).
            pad_token_id: Token id used for padding and as end-of-text marker.
            ignore_index: Value written over the surplus padding in the targets.
            allowed_max_length: If given, inputs AND targets are truncated to
                this many tokens.
            device: Device the stacked tensors are moved to.

        Returns:
            ``(inputs_tensor, targets_tensor)``, two tensors of identical shape
            ``(len(batch), sequence_length)``.
        """
        batch_max_length = max(len(item)+1 for item in batch)
        inputs_lst, targets_lst = [], []

        for item in batch:
            # Copy, so that the caller's sequence is never mutated.
            new_item = list(item) + [pad_token_id]

            # Pad the sequence to batch_max_length.
            padded = new_item + [pad_token_id] * (batch_max_length - len(new_item))
            inputs = torch.tensor(padded[:-1])  # drop the last token for the inputs
            targets = torch.tensor(padded[1:])  # shift left by one for the targets

            # Replace all padding tokens in the targets except the first one
            # with ignore_index; the first one is kept as the end-of-text target.
            # squeeze(-1) keeps a 1-D index tensor even for a single match.
            pad_positions = torch.nonzero(targets == pad_token_id).squeeze(-1)
            if pad_positions.numel() > 1:
                targets[pad_positions[1:]] = ignore_index

            # Truncate to the maximum allowed sequence length.  Both tensors
            # must be cut, otherwise their shapes no longer match.
            if allowed_max_length is not None:
                inputs = inputs[:allowed_max_length]
                targets = targets[:allowed_max_length]

            inputs_lst.append(inputs)
            targets_lst.append(targets)

        inputs_tensor = torch.stack(inputs_lst).to(device)
        targets_tensor = torch.stack(targets_lst).to(device)
        return inputs_tensor, targets_tensor
    
    # Load the instruction data and split it sequentially (no shuffling) into
    # training, test and validation parts.
    file_path = "instruction-data.json"
    with open(file_path, "r") as file:
        data = json.load(file)
    train_portion = int(len(data) * 0.85) # 85% of the data is used for training
    test_portion = int(len(data) * 0.1) # 10% of the data is used for testing
    val_portion = len(data) - train_portion - test_portion # the remaining ~5% is used for validation
    train_data = data[:train_portion]
    test_data = data[train_portion:train_portion+test_portion]
    val_data = data[train_portion + test_portion:]
    
    
    """
    inputs_1 = [0,1,2,3,4]
    inputs_2 = [5,6]
    inputs_3 = [7,8,9]
    batch = (inputs_1, inputs_2, inputs_3)
    inputs, targets = custom_collate_fn(batch)
    print(inputs)
    print(targets)
    """
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # if torch.backends.mps.is_available():
    #     device = torch.device("mps")
    print("Device:", device)
    
    # Pre-bind the device and the maximum sequence length, so that the
    # DataLoader can call the collate function with the batch alone.
    customized_collate_fn = partial(
            custom_collate_fn,
            device=device,
            allowed_max_length=1024
    )
    
    num_workers = 0 # can be increased if the operating system supports parallel Python processes
    batch_size = 8
    
    tokenizer = tiktoken.get_encoding("gpt2")
    torch.manual_seed(123)
    
    # Only the training loader shuffles and drops the last incomplete batch.
    train_dataset = InstructionDataset(train_data, tokenizer)
    train_loader = DataLoader(
            train_dataset,
            batch_size = batch_size,
            collate_fn = customized_collate_fn,
            shuffle = True,
            drop_last = True,
            num_workers = num_workers
    )
    
    val_dataset = InstructionDataset(val_data, tokenizer)
    val_loader = DataLoader(
            val_dataset,
            batch_size = batch_size,
            collate_fn = customized_collate_fn,
            shuffle = False,
            drop_last = False,
            num_workers = num_workers
    )
    
    test_dataset = InstructionDataset(test_data, tokenizer)
    test_loader = DataLoader(
            test_dataset,
            batch_size = batch_size,
            collate_fn = customized_collate_fn,
            shuffle = False,
            drop_last = False,
            num_workers = num_workers
    )
    
    """
    print("Train loader:")
    for inputs, targets in train_loader:
        print(inputs.shape, targets.shape)
        print("Inputs:\n", inputs[1], "\nTargets:\n", targets[1])
        print("Inputs:\n", tokenizer.decode(inputs[0].tolist()), "\nTargets:\n", tokenizer.decode(targets[0].tolist()))
        break
    """
    
    # Settings shared by every model size.
    BASE_CONFIG = {
            "vocab_size": 50257,
            "context_length": 1024,
            "drop_rate": 0.0,
            "qkv_bias": True
    }
    # Size-specific settings, merged into BASE_CONFIG for the chosen model.
    model_configs = {
            "gpt2-small-124M": {"emb_dim":768, "n_layers":12, "n_heads":12},
            "gpt2-medium-355M": {"emb_dim":1024, "n_layers":24, "n_heads":16},
            "gpt2-large-774M": {"emb_dim":1280, "n_layers":36, "n_heads":20},
            "gpt2-xl-1558M": {"emb_dim":1600, "n_layers":48, "n_heads":25},
    }
    CHOOSE_MODEL = "gpt2-medium-355M"
    BASE_CONFIG.update(model_configs[CHOOSE_MODEL])
    
    # The text after the last "-" of the model name (here "355M") is the size
    # identifier handed to the download helper.
    model_size = CHOOSE_MODEL.split("-")[-1]
    settings, params = download_and_load_gpt2(
            model_size=model_size, models_dir="gpt2"
    )
    model = GPTModel(BASE_CONFIG)
    load_weights_into_gpt(model, params)
    model.eval()
    
    """
    torch.manual_seed(123)
    # {
    #     "instruction": "Convert the active sentence to passive: 'The chef cooks the meal every day.'",
    #     "input": "",
    #     "output": "The meal is cooked by the chef every day."
    # },
    input_text = format_input(val_data[0])
    print("===input_text:", input_text)
    
    token_ids = generate(
            model = model,
            idx=text_to_token_ids(input_text, tokenizer),
            max_new_tokens=35,
            context_size=BASE_CONFIG["context_length"],
            eos_id=50256,
    )
    generated_text = token_ids_to_text(token_ids, tokenizer)
    print("===generated_text:", generated_text)
    response_text = generated_text[len(input_text):].strip()
    print("===response_text:", response_text)
    """
    
    model.to(device)
    torch.manual_seed(123)
    # Losses before fine-tuning, measured on at most 5 batches per loader.
    with torch.no_grad():
        train_loss = calc_loss_loader(
                train_loader, model, device, num_batches=5
        )
        val_loss = calc_loss_loader(
                val_loader, model, device, num_batches=5
        )
    print("Training loss:", train_loss)
    print("Validation loss:", val_loss)
    
    # Time the whole fine-tuning run.
    start_time = time.time()
    torch.manual_seed(123)
    optimizer = torch.optim.AdamW(
            model.parameters(), lr=0.00005, weight_decay=0.1
    )
    num_epochs = 2
    
    # The formatted first validation entry is passed as start_context.
    train_losses, val_losses, tokens_seen = train_model_simple(
            model, train_loader, val_loader, optimizer, device,
            num_epochs=num_epochs, eval_freq=5, eval_iter=5,
            start_context=format_input(val_data[0]), tokenizer=tokenizer)
    end_time = time.time()
    execution_time_minutes = (end_time - start_time) / 60
    print(f"Training completed in {execution_time_minutes:.2f} minutes.")
    
    # Save the model to the file gpt2-medium-355M-sft.pth
    file_name = f"{re.sub(r'[ ()]', '', CHOOSE_MODEL) }-sft.pth" # strip spaces and parentheses from the file name
    torch.save(model.state_dict(), file_name)
    print(f"Model saved as {file_name}")
    
    # Load the model weights back from the file.
    model.load_state_dict(torch.load(file_name, map_location=device))
    
    
    """
    epochs_tensor = torch.linspace(0, num_epochs, len(train_losses))
    plot_losses(epochs_tensor, tokens_seen, train_losses, val_losses, "finetuning-instruction")
    """
    
    """
    torch.manual_seed(123)
    for entry in test_data[:3]:
        input_text = format_input(entry)
        token_ids = generate(
                model=model,
                idx = text_to_token_ids(input_text, tokenizer).to(device),
                max_new_tokens=256,
                context_size=BASE_CONFIG["context_length"],
                eos_id=50256
        )
        generated_text = token_ids_to_text(token_ids, tokenizer)
        response_text = (
                generated_text[len(input_text):].replace("### Response:", "")
                .strip()
        )
        print("input_text:", input_text)
        print("\ngenerated_text:", generated_text)
        print(f"\nCorrect response:\n>> {entry['output']}")
        print(f"\nModel response:\n>> {response_text.strip()}")
        print("-" * 20)
    """
    
    """
    for i, entry in tqdm(enumerate(test_data), total=len(test_data)):
        input_text = format_input(entry)
        token_ids = generate(
                model=model,
                idx=text_to_token_ids(input_text, tokenizer).to(device),
                max_new_tokens=256,
                context_size=BASE_CONFIG['context_length'],
                eos_id=50256
        )
        generated_text = token_ids_to_text(token_ids, tokenizer)
        response_text = (
                generated_text[len(input_text):]
                .replace("### Response:", "")
                .strip()
        )
        test_data[i]["model_response"] = response_text
    
    with open("instruction-data-with-response.json", "w") as file:
        json.dump(test_data, file, indent=4)
    """

    章节7.3 交叉熵函数cross_entropy的ignore_index=-100

    import torch
    
    # Example 1: each dimension of the logits corresponds to one candidate
    # token of the model's vocabulary.
    logits_1 = torch.tensor(
        [[-1.0, 1.0],   # prediction for the first token
        [-0.5, 1.5]]     # prediction for the second token
    )
    targets_1 = torch.tensor([0, 1]) # indices of the correct tokens to generate
    loss_1 = torch.nn.functional.cross_entropy(logits_1, targets_1)
    print(loss_1) # prints: tensor(1.1269)
    
    # Example 2: adding one more token changes the computed loss.
    logits_2 = torch.tensor(
        [[-1.0, 1.0],
        [-0.5, 1.5],
        [-0.5, 1.5]]    # prediction for the new, third token
    )
    targets_2 = torch.tensor([0, 1, 1])
    loss_2 = torch.nn.functional.cross_entropy(logits_2, targets_2)
    print(loss_2) # prints: tensor(0.7936)
    
    
    # Example 3: replace the third target token id with -100.
    targets_3 = torch.tensor([0, 1, -100])
    loss_3 = torch.nn.functional.cross_entropy(logits_2, targets_3)
    # The note below says: the result equals the loss of example 1, i.e. the
    # cross-entropy function skips the third target (-100), because PyTorch's
    # default is cross_entropy(..., ignore_index=-100).
    """
    与示例1计算的损失相同。即,交叉损失函数忽略了 target_3 向量中第三项 -100 所对应的损失。
    在PyTorch中,交叉熵函数的默认设置为 cross_entropy(..., ignore_index=-100) , 忽略标记为 -100 的目标。
    """
    print(loss_3) # prints: tensor(1.1269)
    
    

    在目标中保留结束符词元 ID 50256(<|endoftext|>),因为它有助于大语言模型学习生成结束符词元,从而在适当的时候结束回复。

    除了掩码填充词元,实践中通常还会掩码与指令相关的目标词元。通过掩码与指令对应的目标词元,交叉熵损失可以仅针对生成的回复目标词元进行计算。因此,模型的训练更专注于生成准确的回复,而非记住指令,这样可以帮助减少过拟合

    截至目前,研究人员对在指令微调过程中是否应掩码指令部分的损失仍存在分歧。例如,Shi 等人在 2024 年发表的论文“Instruction Tuning With Loss Over Instructions”中指出,不掩码指令可以提升大语言模型的性能。

    不使用参数量为 1.24 亿的最小的 GPT 模型,而是加载参数量为 3.55 亿的中等规模的 GPT 模型。这是因为参数量为 1.24 亿的模型容量过于有限,无法通过指令微调获得令人满意的结果。具体来说,较小的模型在学习高质量的指令遵循任务时,缺乏执行该任务所需的复杂模式和细微行为的能力

    章节7.8 使用更强大的模型评估微调后的大语言模型

    import urllib.request
    import json
    import requests
    from tqdm import tqdm
    
    
    # CUDA_VISIBLE_DEVICES=7 ollama serve
    # CUDA_VISIBLE_DEVICES=7 ollama run llama3:8b
    
    def query_model(
            prompt,
            model="llama3:8b",
            url="http://localhost:11434/api/chat"
    ):
        """Send ``prompt`` as a single user message to a chat endpoint.

        The reply is read line by line; every non-empty line is parsed as JSON
        and the ``["message"]["content"]`` pieces are concatenated.  Lines
        without a ``"message"`` key are skipped.

        Args:
            prompt: Text of the user message.
            model: Model name placed in the request payload.
            url: Address of the chat endpoint.

        Returns:
            The concatenated reply text (``""`` if no content was received).

        Raises:
            urllib.error.URLError: If the request fails.
            json.JSONDecodeError: If a non-empty reply line is not valid JSON.
        """
        # Build the request payload as a dictionary.
        data = {
                "model": model,
                "messages": [
                    {"role": "user", "content": prompt}
                ],
                "options": {
                    "seed": 123, # fixed seed for deterministic replies
                    "temperature": 0,
                    "num_ctx": 2048
                }
        }

        # Serialize the dictionary to JSON and encode it as bytes.
        payload = json.dumps(data).encode("utf-8")
        # Create the request object with the POST method.
        request = urllib.request.Request(
                url,
                data=payload,
                method="POST"
        )
        request.add_header("Content-Type", "application/json")

        # Send the request and collect the reply.  Reaching the end of the
        # stream is the normal way out of the loop, so nothing is printed.
        pieces = []
        with urllib.request.urlopen(request) as response:
            for raw_line in response:
                line = raw_line.decode("utf-8").strip()
                if not line:
                    continue
                response_json = json.loads(line)
                if "message" in response_json:
                    pieces.append(response_json["message"]["content"])
        return "".join(pieces)
    
    
    
    def query_model2(
        prompt, 
        model="llama3:8b", 
        url="http://localhost:11434/api/chat"):
        """Send ``prompt`` to a chat endpoint via ``requests`` and return the reply text.

        The reply is streamed line by line; empty lines and JSON objects
        without a ``"message"`` key are skipped.  An HTTP error status raises
        through ``raise_for_status``.
        """
        # The option values are needed for deterministic responses.
        generation_options = {
            "seed": 123,
            "temperature": 0,
            "num_ctx": 2048
        }
        data = {
            "model": model,
            "messages": [
                {"role": "user", "content": prompt}
            ],
            "options": generation_options
        }

        # Send the POST request and gather the streamed content pieces.
        with requests.post(url, json=data, stream=True, timeout=30) as r:
            r.raise_for_status()
            pieces = []
            for line in r.iter_lines(decode_unicode=True):
                if line:
                    chunk = json.loads(line)
                    if "message" in chunk:
                        pieces.append(chunk["message"]["content"])
            response_data = "".join(pieces)

        return response_data
    
    
    # result = query_model2("What do Llamas eat?")
    # print(result)
    
    
    
    # Build formatted data from the prompt template:
    # render an entry in the instruction/response prompt format.
    def format_input(entry):
        """Return the prompt text for ``entry`` (without the response part).

        ``entry`` must provide the keys ``"instruction"`` and ``"input"``.  The
        ``### Input:`` section is emitted only when ``entry["input"]`` is truthy.
        """
        prompt_parts = [
            "Below is an instruction that describes a task. "
            "Write a response that appropriately completes the request.",
            f"\n\n### Instruction:\n{entry['instruction']}",
        ]
        if entry["input"]:
            prompt_parts.append(f"\n\n### Input:\n{entry['input']}")
        return "".join(prompt_parts)
    
    
    # Load the test entries; the scoring code below reads the keys "output" and
    # "model_response" of every entry.
    file_path = "instruction-data-with-response.json"
    with open(file_path, "r") as file:
        test_data = json.load(file)
    
    """
    for entry in test_data[:3]:
            prompt = (
                    f"Given the input `{format_input(entry)}` "
                    f"and correct output `{entry['output']}`, "
                    f"score the model response `{entry['model_response']}`"
                    f" on a scale from 0 to 100, where 100 is the best score. "
            )
            # print("\nPrompt:", prompt)
            print("\nDataset response:")
            print(">>", entry["output"])
            print("\nModel response:")
            print(">>", entry["model_response"])
            print("\nScore:")
            print(">>", query_model(prompt))
            print("\n-------------------")
    """
    
    def generate_model_scores(json_data, json_key, model="llama3:8b"):
        """Ask a judge model to score the responses stored under ``json_key``.

        For every entry a prompt is built from the formatted input, the
        reference output ``entry['output']`` and the response
        ``entry[json_key]``, and sent through ``query_model``.  Replies that
        cannot be converted with ``int()`` are reported and skipped, so the
        result may be shorter than ``json_data``.

        Args:
            json_data: Iterable of entry dictionaries.
            json_key: Key under which each entry stores the response to score.
            model: Name of the judge model passed on to ``query_model``.

        Returns:
            List of integer scores, in entry order.
        """
        scores = []
        for entry in tqdm(json_data, desc="Scoring entries"):
            prompt = (
                f"Given the input `{format_input(entry)}` "
                f"and correct output `{entry['output']}`, "
                f"score the model response `{entry[json_key]}`"
                f" on a scale from 0 to 100, where 100 is the best score. "
                f"Respond with the integer number only."
            )
            score = query_model(prompt, model=model)
            try:
                scores.append(int(score))
            except ValueError:
                # The judge did not answer with a bare integer; skip this entry.
                print(f"Could not convert score: {score}")
        return scores
    
    # Score every stored model response and report the average.
    scores = generate_model_scores(test_data, "model_response")
    print(f"Number of scores: {len(scores)} of {len(test_data)}")
    # Avoid a ZeroDivisionError when no reply could be parsed as an integer.
    if scores:
        print(f"Average score: {sum(scores)/len(scores):.2f}\n")
    else:
        print("Average score: n/a (no valid scores)\n")
    

    为了进一步提升模型的性能,也可以探索以下策略:

    1. 在微调过程中调整超参数,比如学习率、批次大小、训练轮数
    2. 增加训练数据集的规模或多样化的示例,以涵盖更广泛的话题和风格;
    3. 尝试不同的提示词或指令格式,以更有效地引导模型的回复
    4. 使用更大的预训练模型,以便更好地捕捉复杂模式并生成更准确的回复

    附录A PyTorch

    PyTorch的三大核心组件

    • 张量库。扩展了NumPy基于数组的编程功能,增加了GPU加速特性,实现了CPU和GPU之间的无缝计算切换。
    • 自动微分引擎(autograd)。能够自动计算张量操作的梯度,从而简化反向传播和模型优化。
    • 深度学习库。提供了模块化、灵活且高效的构建块(包括预训练模型、损失函数和优化器),从而轻松设计和训练各种深度学习模型。

    定义深度学习

    • 人工智能。基本目标是创建能够执行通常需要人类智能水平的任务的计算机系统,这些任务包括自然语言理解、模式识别和决策制定。
    • 机器学习。人工智能的一个子领域,专注于学习算法的开发和改进。主要理念是使计算机能够从数据中学习,并随着时间的推移通过更多数据和反馈提升性能的算法。
    • 深度学习。机器学习的一个子类别,专注于深度神经网络的训练和应用。这些深度神经网络最初受到人脑工作原理(特别是许多神经元之间的相互连接)的启发。深度学习中的深度指的是人工神经元或节点的多个隐藏层,这些层使它们能够对数据中的复杂非线性关系进行建模。

    理解张量

    • 标量(仅是一个数值)是秩为 0 的张量

    • 向量是秩为 1 的张量

    • 矩阵是秩为 2 的张量

    # 0维
    >>> torch.tensor(1)
    tensor(1)
    
    # 1维
    >>> torch.tensor([1,2,3])
    tensor([1, 2, 3])
    
    # 2维
    >>> torch.tensor([[1,2], [3,4]])
    tensor([[1, 2],
            [3, 4]])
    
    # 3维
    >>> torch.tensor([[[1,2],[3,4]], [[5,6],[7,8]]])
    tensor([[[1, 2],
             [3, 4]],
    
            [[5, 6],
             [7, 8]]])

    PyTorch张量类似于NumPy数组,但具有几个对深度学习至关重要的附加功能。(1)PyTorch添加了一个自动微分引擎,简化了梯度计算。(2)PyTorch张量还支持GPU计算,以加速深度神经网络的训练。

    将模型视为计算图 & 自动微分

    PyTorch的自动微分引擎(autograd),能够在动态计算图中自动计算梯度。

    计算图是一种有向图,主要用于表达和可视化数学表达式。在深度学习的背景下,计算图列出了计算神经网络输出所需要的计算顺序——我们需要用它来计算反向传播所需的梯度,这是神经网络的主要训练算法。

    import torch
    from urllib3 import request
    
    y = torch.tensor([1.0]) # true label
    x1 = torch.tensor([1.1]) # input feature
    w1 = torch.tensor([2.2], requires_grad=True) # weight parameter
    b = torch.tensor([0.0], requires_grad=True) # bias unit
    
    # tensor([2.4200])
    z = x1 * w1 + b # net input
    
    # tensor([0.9183])
    a = torch.sigmoid(z) # activation and output
    
    # tensor(0.0852)
    loss = torch.nn.functional.binary_cross_entropy(a, y) # loss function
    
    # The string below reads: "using the grad function manually".
    """
    手动使用 grad 函数
    """
    
    # retain_graph=True keeps the graph alive so that it can be used again by
    # the following grad/backward calls.
    # (tensor([-0.0898]),)
    grad_L_w1 = torch.autograd.grad(loss, w1, retain_graph=True) # compute the gradient
    # (tensor([-0.0817]),)
    grad_L_b = torch.autograd.grad(loss, b, retain_graph=True)
    
    
    # The string below reads: calling .backward() on the loss makes PyTorch
    # compute the gradients of all leaf nodes of the graph; they are stored in
    # the tensors' .grad attributes.
    """
    对损失函数调用 .backward() 方法,PyTorch 将计算计算图中所有叶节点的梯度,
    这些梯度将通过张量的 .grad 属性进行存储
    """
    loss.backward()
    # tensor([-0.0898]) tensor([-0.0817])
    print(w1.grad, b.grad)
    
    

    在PyTorch中进行计算,只要其终端节点之一的 requires_grad 属性被设置为 True,PyTorch默认就会在内部构建一个计算图。

    在训练神经网络时,需要使用反向传播算法计算梯度。反向传播可以被视为微积分中链式法则在神经网络中的应用。

    偏导数:测量的是一个函数对于其中一个变量变化的速率。

    梯度:是一个向量,包含了  一个多变量函数(输入变量超过一个的函数)的所有偏导数。

    实现多层神经网络

    示例,是一个具有两个隐藏层的多层感知机(multi-layer perceptron),即全连接神经网络。

    import torch
    
    
    class NeuralNetWork(torch.nn.Module):
        """Multilayer perceptron with two hidden layers (30 and 20 units).

        The trainable parameters live in the ``torch.nn.Linear`` layers; a ReLU
        non-linearity follows each hidden layer.
        """

        def __init__(self, num_inputs, num_outputs):
            super().__init__()
            # Layer widths from input to output; the output size of one linear
            # layer must equal the input size of the next.
            hidden_1, hidden_2 = 30, 20
            stack = [
                torch.nn.Linear(num_inputs, hidden_1),  # first hidden layer
                torch.nn.ReLU(),
                torch.nn.Linear(hidden_1, hidden_2),  # second hidden layer
                torch.nn.ReLU(),
                torch.nn.Linear(hidden_2, num_outputs),  # output layer
            ]
            self.layers = torch.nn.Sequential(*stack)

        def forward(self, x):
            """Pass ``x`` through all layers and return the output-layer logits.

            Only the forward pass is defined here; there is no hand-written
            backward method.
            """
            return self.layers(x)
    
    
    def neural_network():
        """Build the 50-input / 3-output MLP, report its size, and run forward passes."""
        torch.manual_seed(123)  # seed PyTorch's random number generator

        model = NeuralNetWork(50, 3)
        # print(model) would show:
        #   NeuralNetWork(
        #     (layers): Sequential(
        #       (0): Linear(in_features=50, out_features=30, bias=True)
        #       (1): ReLU()
        #       (2): Linear(in_features=30, out_features=20, bias=True)
        #       (3): ReLU()
        #       (4): Linear(in_features=20, out_features=3, bias=True)
        #     )
        #   )

        # Expected total: 2213
        #   weights: 30 * 50 + 20 * 30 + 3 * 20 = 2160
        #   biases:  30 + 20 + 3 = 53
        trainable = (p for p in model.parameters() if p.requires_grad)
        num_params = sum(p.numel() for p in trainable)
        print("Total number of trainable parameters:", num_params)

        # model.layers[0].weight.shape is torch.Size([30, 50]) and
        # model.layers[0].bias.shape is torch.Size([30]); both have
        # requires_grad=True, the default for torch.nn.Linear parameters.

        torch.manual_seed(123)
        x = torch.rand(1, 50)

        # Expected: tensor([[-0.1262,  0.1080, -0.1792]], grad_fn=<AddmmBackward0>)
        # grad_fn names the last function used to compute the tensor; Addmm is a
        # matrix multiplication (mm) followed by an addition (Add). PyTorch uses
        # this information during backpropagation.
        tracked = model(x)
        print(tracked)

        # torch.no_grad() tells PyTorch not to track gradients, saving memory and compute.
        # Expected: tensor([[-0.1262,  0.1080, -0.1792]])
        with torch.no_grad():
            logits = model(x)
        print(logits)

        # Expected: tensor([[0.3113, 0.3934, 0.2952]])
        with torch.no_grad():
            probas = torch.softmax(model(x), dim=1)
        print(probas)


    if __name__ == '__main__':
        neural_network()

    数据加载器

    import torch
    from torch.utils.data import Dataset, DataLoader
    
    class ToyDataset(Dataset):
        """In-memory dataset pairing each feature row with its label."""

        def __init__(self, x, y):
            self.features = x
            self.labels = y

        def __getitem__(self, index):
            """Return the ``(features, label)`` pair stored at ``index``."""
            return self.features[index], self.labels[index]

        def __len__(self):
            """Return the number of examples, taken from the first label dimension."""
            num_examples, *_ = self.labels.shape
            return num_examples
    
    
    def test_dataset():
        """Wrap the toy data in DataLoaders and print every training batch."""
        train_features = [
            [-1.2, 3.1],
            [-0.9, 2.9],
            [-0.5, 2.6],
            [2.3, -1.1],
            [2.7, -1.5],
        ]
        test_features = [
            [-0.8, 2.8],
            [2.6, -1.6],
        ]
        x_train, y_train = torch.tensor(train_features), torch.tensor([0, 0, 0, 1, 1])
        x_test, y_test = torch.tensor(test_features), torch.tensor([0, 1])

        train_ds = ToyDataset(x_train, y_train)
        test_ds = ToyDataset(x_test, y_test)

        torch.manual_seed(123)
        train_kwargs = dict(
            batch_size=2,
            shuffle=True,  # shuffle the training examples
            # Number of background worker processes. With num_workers=0 the
            # loading would happen in the main process instead.
            num_workers=1,
            # Drop the last, smaller batch: a final batch much smaller than the
            # others can disturb convergence in practice.
            drop_last=True,
        )
        train_loader = DataLoader(dataset=train_ds, **train_kwargs)
        # Test data does not need shuffling.
        test_loader = DataLoader(
            dataset=test_ds, batch_size=2, shuffle=False, num_workers=0
        )

        for idx, (x, y) in enumerate(train_loader):
            print(f"Batch {idx+1}:", x, y)


    if __name__ == '__main__':
        test_dataset()

    典型的训练循环

    import torch
    from torch.utils.data import Dataset, DataLoader
    
    
    class NeuralNetWork(torch.nn.Module):
        """Two-hidden-layer perceptron (30 and 20 hidden units) that returns logits."""

        def __init__(self, num_inputs, num_outputs):
            super().__init__()
            widths = (num_inputs, 30, 20, num_outputs)
            modules = []
            # Chain Linear layers so that each layer's output width matches the next
            # layer's input width; a ReLU follows every layer except the output layer.
            for position, (fan_in, fan_out) in enumerate(zip(widths, widths[1:])):
                if position > 0:
                    modules.append(torch.nn.ReLU())
                modules.append(torch.nn.Linear(fan_in, fan_out))
            self.layers = torch.nn.Sequential(*modules)

        def forward(self, x):
            """Describe how input flows through the network; the result is the logits.

            The backward pass is derived by autograd and is not implemented here.
            """
            logits = self.layers(x)
            return logits
    
    
    class ToyDataset(Dataset):
        """Minimal map-style dataset over a features container and a labels container."""

        def __init__(self, x, y):
            self.features, self.labels = x, y

        def __getitem__(self, index):
            """Look up one example and return it as ``(features, label)``."""
            example = (self.features[index], self.labels[index])
            return example

        def __len__(self):
            """Dataset size, i.e. the length of the labels' first dimension."""
            label_shape = self.labels.shape
            return label_shape[0]
    
    
    # 计算预测准确率
    def compute_accuracy(model, dataloader):
        """Return the fraction of examples in ``dataloader`` that ``model`` classifies correctly.

        Args:
            model: A ``torch.nn.Module`` mapping a feature batch to logits whose
                dimension 1 indexes the classes.
            dataloader: Iterable yielding ``(features, labels)`` batches.

        Returns:
            Accuracy as a Python float in ``[0, 1]``, or ``float("nan")`` when the
            dataloader yields no examples.
        """
        model = model.eval()

        # Evaluate on the device that holds the model's parameters, so a model that
        # was moved to a GPU can be scored against batches loaded on the CPU.
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else None

        correct = 0
        total_examples = 0
        for features, labels in dataloader:
            if device is not None:
                features, labels = features.to(device), labels.to(device)
            with torch.no_grad():
                logits = model(features)
            predictions = torch.argmax(logits, dim=1)
            compare = labels == predictions  # boolean tensor: True where the label matches
            correct += int(compare.sum())    # number of True entries
            total_examples += len(compare)

        if total_examples == 0:
            # Nothing was evaluated; report NaN instead of dividing by zero.
            return float("nan")
        return correct / total_examples
    
    def test_train():
        """Train the toy classifier for three epochs, then predict on the training set.

        Prints the per-batch training loss, the raw outputs, the softmax
        probabilities, the predicted classes and the training accuracy.
        """
        x_train = torch.tensor([
            [-1.2, 3.1],
            [-0.9, 2.9],
            [-0.5, 2.6],
            [2.3, -1.1],
            [2.7, -1.5],
        ])
        y_train = torch.tensor([0, 0, 0, 1, 1])

        train_ds = ToyDataset(x_train, y_train)

        torch.manual_seed(123)
        train_loader = DataLoader(
            dataset=train_ds,
            batch_size=2,
            shuffle=True,  # shuffle the data
            # Number of background worker processes. With num_workers=0 the loading
            # would run in the main process instead.
            num_workers=1,
            # Drop the last, smaller batch: a final batch much smaller than the
            # others can disturb convergence in practice.
            drop_last=True,
        )

        torch.manual_seed(123)
        model = NeuralNetWork(num_inputs=2, num_outputs=2)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
        model = model.to(device)
        # Stochastic gradient descent with learning rate 0.5; the optimizer has to be
        # told which parameters it should optimize.
        optimizer = torch.optim.SGD(model.parameters(), lr=0.5)
        num_epochs = 3
        for epoch in range(num_epochs):
            model.train()
            for batch_idx, (features, labels) in enumerate(train_loader):
                features, labels = features.to(device), labels.to(device)
                logits = model(features)
                # cross_entropy takes the logits directly and applies softmax
                # internally, which is more efficient and numerically stable.
                loss = torch.nn.functional.cross_entropy(logits, labels)
                optimizer.zero_grad()  # clear the previous gradients to avoid accumulation
                loss.backward()        # gradients of the loss w.r.t. the model parameters
                # For SGD the step multiplies each gradient by the learning rate and
                # adds the scaled negative gradient to the parameter.
                optimizer.step()

                print(f"Epoch: {epoch+1:03d}/{num_epochs:03d}"
                f" | Batch {batch_idx:03d}/{len(train_loader):03d}"
                f" | Train Loss: {loss:.2f}")

            model.eval()
            # Optional per-epoch evaluation code goes here.

        # After training, use the model for prediction.
        model.eval()
        with torch.no_grad():
            # The inputs must live on the same device as the model; without the move
            # this call fails whenever the model was placed on a GPU.
            outputs = model(x_train.to(device))
        print(outputs)
        torch.set_printoptions(sci_mode=False)  # avoid scientific notation for readability
        probas = torch.softmax(outputs, dim=1)
        print(probas)
        # Softmax is monotonic, so argmax over the logits equals argmax over probas.
        predictions = torch.argmax(outputs, dim=1)
        print(predictions)

        # Feed compute_accuracy batches that are already on the model's device, so
        # the comparison also works when training ran on a GPU.
        on_device_batches = (
            (features.to(device), labels.to(device))
            for features, labels in train_loader
        )
        print(compute_accuracy(model, on_device_batches))

    if __name__ == '__main__':

        test_train()

    保存和加载模型

    
        # Save the model: only the state_dict (parameter tensors) is written to disk.
        torch.save(model.state_dict(), "model.pth")
    
        # Load the model: rebuild the same architecture first, then copy the saved
        # parameters into it.
        # NOTE(review): consider torch.load("model.pth", weights_only=True) if the
        # installed PyTorch supports it — confirm against the target version.
        model = NeuralNetWork(num_inputs=2,num_outputs=2)
        model.load_state_dict(torch.load("model.pth"))

    使用多个GPU训练

    def ddp_setup(rank, world_size):
        """Initialise the distributed process group for the process with this ``rank``.

        Sets the rendezvous address/port environment variables, joins the NCCL
        process group, and makes CUDA device ``rank`` the current device.
        """
        rendezvous = {"MASTER_ADDR": "localhost", "MASTER_PORT": "12345"}
        os.environ.update(rendezvous)

        init_process_group(backend="nccl", rank=rank, world_size=world_size)
        torch.cuda.set_device(rank)
    
    def prepare_dataset():
        """Build the toy training DataLoader for distributed training.

        Returns:
            ``(train_loader, test_loader)``; ``test_loader`` is always ``None``.
        """
        train_rows = [
            [-1.2, 3.1],
            [-0.9, 2.9],
            [-0.5, 2.6],
            [2.3, -1.1],
            [2.7, -1.5],
        ]
        test_rows = [
            [-0.8, 2.8],
            [2.6, -1.6],
        ]
        x_train, y_train = torch.tensor(train_rows), torch.tensor([0, 0, 0, 1, 1])
        x_test, y_test = torch.tensor(test_rows), torch.tensor([0, 1])

        train_ds = ToyDataset(x_train, y_train)

        # The DistributedSampler takes over shuffling (hence shuffle=False) and is
        # meant to hand each process (GPU) its own, non-overlapping subset of the data.
        per_process_sampler = DistributedSampler(train_ds)
        train_loader = DataLoader(
            dataset=train_ds,
            batch_size=2,
            shuffle=False,
            pin_memory=True,
            drop_last=True,
            sampler=per_process_sampler,
        )
        return train_loader, None
    
    def test_distributed(rank, world_size, num_epochs):
        """Train the toy model with DistributedDataParallel in the process ``rank``.

        Args:
            rank: Index of this process; also used as the GPU device id.
            world_size: Total number of participating processes.
            num_epochs: Number of passes over this process's share of the data.
        """
        ddp_setup(rank, world_size)  # set up the distributed environment
        train_loader, test_loader = prepare_dataset()  # load the training and test sets
        model = NeuralNetWork(num_inputs=2, num_outputs=2)
        model.to(rank)  # rank doubles as the GPU device id
        optimizer = torch.optim.SGD(model.parameters(), lr=0.5)
        model = DDP(model, device_ids=[rank])
        for epoch in range(num_epochs):
            # Tell the sampler which epoch this is. Without this call the
            # DistributedSampler reuses the same shuffle order in every epoch.
            train_loader.sampler.set_epoch(epoch)
            for features, labels in train_loader:
                features, labels = features.to(rank), labels.to(rank)
                logits = model(features)
                # cross_entropy takes the logits directly and applies softmax
                # internally, which is more efficient and numerically stable.
                loss = torch.nn.functional.cross_entropy(logits, labels)
                optimizer.zero_grad()  # clear the previous gradients to avoid accumulation
                loss.backward()        # gradients of the loss w.r.t. the model parameters
                # For SGD the step multiplies each gradient by the learning rate and
                # adds the scaled negative gradient to the parameter.
                optimizer.step()

                print(f"[GPU{rank}] Epoch: {epoch+1:03d}/{num_epochs:03d}"
                f" | Batchsize {labels.shape[0]:03d}"
                f" | Train/Val Loss: {loss:.2f}")
        model.eval()
        destroy_process_group()  # release the resources held by the process group
    
    
    # Example invocation restricting the visible GPUs:
    # CUDA_VISIBLE_DEVICES=0,2 python some_script.py
    if __name__ == '__main__':
    
        torch.manual_seed(123)
        num_epochs = 3
        # One process per visible CUDA device.
        world_size = torch.cuda.device_count()
        # mp.spawn starts new processes; nprocs=world_size launches one process per GPU.
        # The rank argument of test_distributed is supplied automatically by spawn,
        # so args only carries the remaining parameters.
        mp.spawn(test_distributed, args=(world_size, num_epochs), nprocs=world_size)
    
    
    """
    [GPU0] Epoch: 001/003 | Batchsize 002 | Train/Val Loss: 0.62
    [GPU1] Epoch: 001/003 | Batchsize 002 | Train/Val Loss: 0.64
    [GPU0] Epoch: 002/003 | Batchsize 002 | Train/Val Loss: 0.22
    [GPU1] Epoch: 002/003 | Batchsize 002 | Train/Val Loss: 0.24
    [GPU0] Epoch: 003/003 | Batchsize 002 | Train/Val Loss: 0.07
    [GPU1] Epoch: 003/003 | Batchsize 002 | Train/Val Loss: 0.08
    """

    NanoGPT 是一个提供简约而高效的 GPT-2 模型实现的代码库。  “NanoGPT, a Repository for Training Medium-Sized GPTs”。

    附录D 改进训练循环

    学习率预热

    引入了学习率预热(learning rate warmup)、余弦衰减(cosine decay)、梯度裁剪(gradient clipping) 等技术。

    学习率预热可以帮助稳定复杂模型(如大语言模型)的训练过程。这个过程可以逐步将学习率从一个非常低的初始值(initial_lr)提升到用户设定的最大值(peak_lr)。在训练开始时使用较小的权重更新,有助于降低模型在训练过程中遭遇大幅度、不稳定更新的风险。

    optimizer = torch.optim.AdamW(model.parameters(), weight_decay=0.1)
    # Amount added to the learning rate on every warmup step, so that it climbs
    # linearly from initial_lr towards peak_lr.
    lr_increment = (peak_lr - initial_lr) / warmup_steps
    global_step = -1
    track_lrs = []

    for epoch in range(n_epochs):
        for input_batch, target_batch in train_loader:
            optimizer.zero_grad()
            global_step += 1

            # Still warming up: follow the linear ramp. Afterwards: stay at peak_lr.
            lr = (
                initial_lr + global_step * lr_increment
                if global_step < warmup_steps
                else peak_lr
            )

            # Apply the computed learning rate to every parameter group.
            for param_group in optimizer.param_groups:
                param_group["lr"] = lr
            track_lrs.append(optimizer.param_groups[0]["lr"])

    余弦衰减

    余弦衰减,在训练过程中可以调节学习率,使其在预热阶段后呈现余弦曲线的变化。

    在其流行的变体中,余弦衰减可以将学习率降低到接近零,模拟半个余弦周期的轨迹。学习率的逐渐降低旨在减缓模型更新权重的速度。这一点特别重要,因为它有助于降低训练过程中超过损失最小值的风险,从而确保后期训练的稳定性。

    import math
    # Learning-rate schedule: linear warmup followed by cosine decay.
    min_lr = 0.1 * initial_lr  # floor the decay ends at
    track_lrs = []
    lr_increment = (peak_lr - initial_lr) / warmup_steps
    global_step = -1
    for epoch in range(n_epochs):
        for input_batch, target_batch in train_loader:
            optimizer.zero_grad()
            global_step += 1
            if global_step < warmup_steps:
                # Linear warmup
                lr = initial_lr + global_step * lr_increment
            else:
                # Cosine decay after warmup: progress runs from 0 towards 1 over
                # the post-warmup steps, taking lr from peak_lr down towards min_lr
                # along half a cosine period.
                progress = ((global_step - warmup_steps) /
                            (total_training_steps - warmup_steps))
                lr = min_lr + (peak_lr - min_lr) * 0.5 * (
                    1 + math.cos(math.pi * progress)
                )  # this closing parenthesis was missing (syntax error)

            # Apply the computed learning rate to every parameter group.
            for param_group in optimizer.param_groups:
                param_group["lr"] = lr
            track_lrs.append(optimizer.param_groups[0]["lr"])

    梯度裁剪

    梯度裁剪也是增强大语言模型训练稳定性的一种重要技术。该方法涉及设定一个阈值,超过该阈值的梯度会被缩放到预定的最大值。这种做法可以确保在反向传播过程中,对模型参数的更新保持在一个可控的范围内。

    # max_norm=1.0 limits the norm of the gradients to at most 1.0.
    # The "norm" is the length (magnitude) of the gradient vector in the model's
    # parameter space, here the L2 norm, i.e. the Euclidean norm.
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

    改进的训练函数

    def train_model(model, train_loader, val_loader, optimizer, device,
                    n_epochs, eval_freq, eval_iter, start_context, tokenizer,
                    warmup_steps, initial_lr=3e-05, min_lr=1e-6):
        """Train ``model`` with linear warmup, cosine decay and gradient clipping.

        Args:
            model: Model to train.
            train_loader: Loader of ``(input_batch, target_batch)`` training pairs.
            val_loader: Validation loader, passed to ``evaluate_model``.
            optimizer: Optimizer whose first param group's ``lr`` is taken as the
                peak learning rate.
            device: Device forwarded to the loss and evaluation helpers.
            n_epochs: Number of passes over ``train_loader``.
            eval_freq: Evaluate every ``eval_freq`` global steps.
            eval_iter: Number of batches forwarded to ``evaluate_model``.
            start_context: Prompt forwarded to ``generate_and_print_sample``.
            tokenizer: Tokenizer forwarded to ``generate_and_print_sample``.
            warmup_steps: Number of linear warmup steps; ``0`` disables warmup.
            initial_lr: Learning rate at the first warmup step.
            min_lr: Learning rate the cosine decay approaches.

        Returns:
            ``(train_losses, val_losses, track_tokens_seen, track_lrs)``.
        """
        train_losses, val_losses, track_tokens_seen, track_lrs = [], [], [], []
        tokens_seen, global_step = 0, -1

        # The optimizer's initial learning rate is used as the peak learning rate.
        peak_lr = optimizer.param_groups[0]["lr"]

        # Total number of iterations in the training process.
        total_training_steps = len(train_loader) * n_epochs

        # Per-step learning-rate increase during warmup. With warmup_steps == 0 the
        # warmup branch is never taken, so the increment is irrelevant; guarding
        # here avoids the ZeroDivisionError the plain division would raise.
        if warmup_steps > 0:
            lr_increment = (peak_lr - initial_lr) / warmup_steps
        else:
            lr_increment = 0.0

        for epoch in range(n_epochs):
            model.train()
            for input_batch, target_batch in train_loader:
                optimizer.zero_grad()
                global_step += 1

                # Pick the learning rate for the current phase.
                if global_step < warmup_steps:
                    # Linear warmup
                    lr = initial_lr + global_step * lr_increment
                else:
                    # Cosine annealing after warmup
                    progress = ((global_step - warmup_steps) /
                                (total_training_steps - warmup_steps))
                    lr = min_lr + (peak_lr - min_lr) * 0.5 * (1 + math.cos(math.pi * progress))

                # Apply the calculated learning rate to the optimizer.
                for param_group in optimizer.param_groups:
                    param_group["lr"] = lr
                track_lrs.append(lr)  # store the current learning rate

                # Calculate and backpropagate the loss.
                loss = calc_loss_batch(input_batch, target_batch, model, device)
                loss.backward()

                # Apply gradient clipping after the warmup phase to avoid exploding
                # gradients.
                if ORIG_BOOK_VERSION:
                    if global_step > warmup_steps:
                        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                else:
                    # The book originally used global_step > warmup_steps, which
                    # skipped clipping on the first step after warmup.
                    if global_step >= warmup_steps:
                        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

                optimizer.step()
                tokens_seen += input_batch.numel()

                # Periodically evaluate the model on the training and validation sets.
                if global_step % eval_freq == 0:
                    train_loss, val_loss = evaluate_model(
                        model, train_loader, val_loader,
                        device, eval_iter
                    )
                    train_losses.append(train_loss)
                    val_losses.append(val_loss)
                    track_tokens_seen.append(tokens_seen)
                    # Print the current losses.
                    print(f"Ep {epoch+1} (Iter {global_step:06d}): "
                          f"Train loss {train_loss:.3f}, "
                          f"Val loss {val_loss:.3f}"
                    )

            # Generate and print a sample from the model to monitor progress.
            generate_and_print_sample(
                model, tokenizer, device, start_context
            )

        return train_losses, val_losses, track_tokens_seen, track_lrs

    附录E 使用LoRA进行高效参数微调

    LoRA 低秩自适应,是应用最广泛的参数高效微调技术之一。

    LoRA 是一种通过仅调整模型权重参数的一小部分,使预训练模型更好地适应特定且通常较小的数据集的技术。“低秩”指的是将模型调整限制在总权重参数空间的较小维度子空间,从而有效捕获训练过程中对权重参数变化影响最大的方向。LoRA 方法之所以有用且广受欢迎,是因为它能够高效地对大模型进行特定任务的微调,显著降低了通常所需的计算成本和资源。

    假设一个大型权重矩阵 W 与特定层相关联,LoRA 可以应用于大语言模型中的所有线性层。

    import math
    import torch
    from torch.utils.data import Dataset, DataLoader
    import tiktoken
    import pandas as pd
    from gpt_download import download_and_load_gpt2
    from previous_chapters_for_appendixE import GPTModel, load_weights_into_gpt, text_to_token_ids, token_ids_to_text, generate_text_simple
    
    class LoRALayer(torch.nn.Module):
        """Low-rank adapter computing ``alpha * (x @ A @ B)``.

        ``A`` has shape ``(in_dim, rank)`` and ``B`` has shape ``(rank, out_dim)``;
        their product approximates the weight-update matrix. ``rank`` is a
        hyperparameter that sets the inner dimension of ``A`` and ``B`` and thus
        the number of extra trainable parameters. ``alpha`` scales the adapter
        output.
        """

        def __init__(self, in_dim, out_dim, rank, alpha):
            super().__init__()
            # A gets a Kaiming-uniform initialisation; B starts at zero, so the
            # adapter initially outputs zeros.
            down_proj = torch.empty(in_dim, rank)
            self.A = torch.nn.Parameter(down_proj)
            torch.nn.init.kaiming_uniform_(self.A, a=math.sqrt(5))

            up_proj = torch.zeros(rank, out_dim)
            self.B = torch.nn.Parameter(up_proj)

            self.alpha = alpha  # scaling factor for the low-rank output

        def forward(self, x):
            """Return the scaled low-rank projection of ``x``."""
            low_rank = x @ self.A
            return self.alpha * (low_rank @ self.B)
    
    class LinearWithLoRA(torch.nn.Module):
        """Wrap an existing Linear layer and add a LoRA adapter's output to its output."""

        def __init__(self, linear, rank, alpha):
            super().__init__()
            self.linear = linear
            in_dim, out_dim = linear.in_features, linear.out_features
            self.lora = LoRALayer(in_dim, out_dim, rank, alpha)

        def forward(self, x):
            """Return ``linear(x) + lora(x)``.

            LoRALayer initialises its ``B`` matrix to zeros, so ``A @ B`` is the
            zero matrix at first and the added term leaves the wrapped layer's
            output unchanged.
            """
            base_out = self.linear(x)
            return base_out + self.lora(x)
    
    def replace_linear_with_lora(model, rank, alpha):
        """Recursively swap every ``torch.nn.Linear`` under ``model`` for ``LinearWithLoRA``.

        The module tree is modified in place; nothing is returned.
        """
        for child_name, child in model.named_children():
            if not isinstance(child, torch.nn.Linear):
                # Not a Linear layer: descend into its own children.
                replace_linear_with_lora(child, rank, alpha)
                continue
            # Replace the Linear layer with its LoRA-wrapped counterpart.
            wrapped = LinearWithLoRA(child, rank, alpha)
            setattr(model, child_name, wrapped)
    
    class SpamDataset(Dataset):
        """Dataset of tokenized texts from a CSV file, padded to a common length.

        The CSV is read with pandas and must provide a ``Text`` and a ``Label``
        column. Every text is encoded with ``tokenizer.encode``. When
        ``max_length`` is given, longer sequences are truncated to it; otherwise
        the longest encoded sequence sets the length. Shorter sequences are
        right-padded with ``pad_token_id``.
        """

        def __init__(self, csv_file, tokenizer, max_length=None,
                pad_token_id=50256):
            self.data = pd.read_csv(csv_file)

            # Tokenize every text.
            self.encoded_texts = [
                tokenizer.encode(text) for text in self.data["Text"]
            ]

            if max_length is not None:
                self.max_length = max_length
                # Truncate sequences that exceed max_length.
                self.encoded_texts = [
                    token_ids[:self.max_length]
                    for token_ids in self.encoded_texts
                ]
            else:
                self.max_length = self._longest_encoded_length()

            # Pad every sequence up to max_length.
            padded = []
            for token_ids in self.encoded_texts:
                missing = self.max_length - len(token_ids)
                padded.append(token_ids + [pad_token_id] * missing)
            self.encoded_texts = padded

        def __getitem__(self, index):
            """Return ``(token_ids, label)`` for row ``index`` as two long tensors."""
            token_ids = torch.tensor(self.encoded_texts[index], dtype=torch.long)
            label = torch.tensor(self.data.iloc[index]["Label"], dtype=torch.long)
            return (token_ids, label)

        def __len__(self):
            """Number of rows in the underlying CSV data."""
            return len(self.data)

        def _longest_encoded_length(self):
            """Length of the longest encoded text, or 0 when there are none."""
            return max(map(len, self.encoded_texts), default=0)
    
    def calc_accuracy_loader(data_loader, model, device, num_batches=None):
        """Return classification accuracy over up to ``num_batches`` batches.

        The prediction for each example is the argmax of the logits at the last
        sequence position of the model output.

        Args:
            data_loader: Sized iterable of ``(input_batch, target_batch)`` pairs.
            model: Callable whose output is indexed as ``[:, -1, :]``.
            device: Device the batches are moved to before the forward pass.
            num_batches: Maximum number of batches to evaluate; ``None`` means all.

        Returns:
            Fraction of correct predictions, or ``float("nan")`` when no example was
            evaluated (empty loader or ``num_batches`` of 0). This mirrors
            ``calc_loss_loader``'s NaN for an empty loader and avoids a
            ZeroDivisionError.
        """
        model.eval()
        correct_predictions, num_examples = 0, 0

        if num_batches is None:
            num_batches = len(data_loader)
        else:
            num_batches = min(num_batches, len(data_loader))

        for i, (input_batch, target_batch) in enumerate(data_loader):
            if i >= num_batches:
                break
            input_batch = input_batch.to(device)
            target_batch = target_batch.to(device)

            with torch.no_grad():
                logits = model(input_batch)[:, -1, :]  # logits of the last position
            predicted_labels = torch.argmax(logits, dim=-1)
            num_examples += predicted_labels.shape[0]
            correct_predictions += (predicted_labels == target_batch).sum().item()

        if num_examples == 0:
            return float("nan")
        return correct_predictions / num_examples
    
    def calc_loss_batch(input_batch, target_batch, model, device):
        """Return the cross-entropy loss of one batch.

        Both tensors are moved to ``device``; the loss is computed from the model
        output at the last sequence position against ``target_batch``.
        """
        inputs, targets = input_batch.to(device), target_batch.to(device)
        last_position_logits = model(inputs)[:, -1, :]
        return torch.nn.functional.cross_entropy(last_position_logits, targets)
    
    def calc_loss_loader(data_loader, model, device, num_batches=None):
        """Return the mean batch loss over up to ``num_batches`` batches.

        Returns ``float("nan")`` for an empty loader. ``num_batches=None`` means
        all batches; larger values are capped at the loader length.
        """
        if len(data_loader) == 0:
            return float("nan")

        num_batches = (
            len(data_loader) if num_batches is None
            else min(num_batches, len(data_loader))
        )

        total_loss = 0.
        for i, (input_batch, target_batch) in enumerate(data_loader):
            if i >= num_batches:
                break
            batch_loss = calc_loss_batch(input_batch, target_batch, model, device)
            total_loss += batch_loss.item()
        return total_loss / num_batches
    
    def train_classifier_simple(
            model, train_loader, val_loader, optimizer, device,
            num_epochs, eval_freq, eval_iter):
        """Run the classification fine-tuning loop.

        Losses are evaluated every ``eval_freq`` steps; accuracies are computed
        after every epoch on ``eval_iter`` batches.

        Returns:
            ``(train_losses, val_losses, train_accs, val_accs, examples_seen)``.
        """
        train_losses, val_losses = [], []
        train_accs, val_accs = [], []
        examples_seen = 0
        global_step = -1

        for epoch in range(num_epochs):  # main training loop
            model.train()  # training mode
            for input_batch, target_batch in train_loader:
                optimizer.zero_grad()  # reset gradients from the previous batch
                batch_loss = calc_loss_batch(
                    input_batch, target_batch, model, device
                )
                batch_loss.backward()  # backpropagate to get the loss gradients
                optimizer.step()       # update the weights using the gradients
                examples_seen += input_batch.shape[0]
                global_step += 1

                if global_step % eval_freq != 0:
                    continue

                # Optional evaluation step
                train_loss, val_loss = evaluate_model(
                    model, train_loader, val_loader, device, eval_iter)
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                print(f"Ep {epoch+1} (Step {global_step:06d}): "
                        f"Train loss {train_loss:.3f}, "
                        f"Val loss {val_loss:.3f}"
                )

            # Accuracy after each epoch
            train_accuracy = calc_accuracy_loader(
                train_loader, model, device, num_batches=eval_iter
            )
            val_accuracy = calc_accuracy_loader(
                val_loader, model, device, num_batches=eval_iter
            )
            print(f"Training accuracy: {train_accuracy*100:.2f}% | ", end="")
            print(f"Validation accuracy: {val_accuracy*100:.2f}%")
            train_accs.append(train_accuracy)
            val_accs.append(val_accuracy)

        return train_losses, val_losses, train_accs, val_accs, examples_seen
    
    def evaluate_model(model, train_loader, val_loader, device, eval_iter):
        """Return ``(train_loss, val_loss)`` measured on ``eval_iter`` batches each.

        The model is put in eval mode for the measurement, gradients are disabled,
        and the model is switched back to train mode before returning.
        """
        model.eval()
        with torch.no_grad():
            train_loss, val_loss = (
                calc_loss_loader(loader, model, device, num_batches=eval_iter)
                for loader in (train_loader, val_loader)
            )
        model.train()
        return train_loss, val_loss
    
    def classify_review(
            text, model, tokenizer, device, max_length=None,
            pad_token_id = 50256):
        """Classify ``text`` as ``"spam"`` or ``"not spam"``.

        Args:
            text: Raw input string.
            model: Classifier exposing ``pos_emb.weight`` and returning a tensor
                that is indexed as ``[:, -1, :]``.
            tokenizer: Object with an ``encode(text)`` method returning token ids.
            device: Device on which the input tensor is created.
            max_length: Length the token sequence is truncated/padded to. ``None``
                (the default) uses the model's supported context length; larger
                values are capped at that length.
            pad_token_id: Token id used for right-padding.

        Returns:
            ``"spam"`` if the predicted label is 1, otherwise ``"not spam"``.
        """
        model.eval()

        # Prepare the model input.
        input_ids = tokenizer.encode(text)
        # The first dimension of the positional-embedding weight is the number of
        # positions. shape[1], used previously, is the embedding width and caused
        # needless truncation.
        supported_context_length = model.pos_emb.weight.shape[0]
        if max_length is None:
            # The old default crashed in min(None, ...); fall back to the context size.
            target_length = supported_context_length
        else:
            target_length = min(max_length, supported_context_length)

        input_ids = input_ids[:target_length]  # truncate sequences that are too long
        input_ids = input_ids + [pad_token_id] * (target_length - len(input_ids))

        input_tensor = torch.tensor(
                input_ids, device=device
        ).unsqueeze(0)  # add the batch dimension
        with torch.no_grad():  # no gradients needed for inference
            logits = model(input_tensor)[:, -1, :]  # logits of the last token
        predicted_label = torch.argmax(logits, dim=-1).item()
        return "spam" if predicted_label == 1 else "not spam"
    
    
    # Tokenizer shared by all three dataset splits.
    tokenizer = tiktoken.get_encoding("gpt2")
    
    # max_length=None lets the training set pick the length of its longest
    # encoded text.
    train_dataset = SpamDataset(
            csv_file="train.csv",
            max_length=None,
            tokenizer=tokenizer
    )
    print(train_dataset.max_length)
    # Validation and test sets reuse the training set's max_length, so longer
    # texts are truncated and shorter ones padded to the same length.
    val_dataset = SpamDataset(
            csv_file="validation.csv",
            max_length=train_dataset.max_length,
            tokenizer=tokenizer
    )
    test_dataset = SpamDataset(
            csv_file="test.csv",
            max_length=train_dataset.max_length,
            tokenizer=tokenizer
    )
    print(f"train {len(train_dataset)}, validation {len(val_dataset)}, test {len(test_dataset)}")
    
    num_workers = 0
    batch_size = 8
    torch.manual_seed(123)
    # Only the training loader shuffles and drops the final incomplete batch.
    train_loader = DataLoader(
            dataset=train_dataset,
            batch_size=batch_size,
            shuffle=True,
            num_workers=num_workers,
            drop_last=True,
    )
    val_loader = DataLoader(
            dataset=val_dataset,
            batch_size=batch_size,
            num_workers=num_workers,
            drop_last=False,
    )
    test_loader = DataLoader(
            dataset=test_dataset,
            batch_size=batch_size,
            num_workers=num_workers,
            drop_last=False,
    )
    
    """
    for input_batch, target_batch in train_loader:
        pass
    print("Input batch:", input_batch.shape)
    print("Label batch:", target_batch.shape)
    print(f"train {len(train_loader)}, validation {len(val_loader)}, test {len(test_loader)}")
    """
    
    # Key into model_configs selecting which GPT-2 size to load.
    CHOOSE_MODEL = "gpt2-small-124M"
    INPUT_PROMPT = "Every effort moves"
    # Settings shared by every model size.
    BASE_CONFIG = {
            "vocab_size": 50257,
            "context_length": 1024,
            "drop_rate": 0.0,
            "qkv_bias": True
    }
    # Size-specific settings: embedding width, number of layers and of heads.
    model_configs = {
            "gpt2-small-124M": {"emb_dim":768, "n_layers":12, "n_heads":12},
            "gpt2-medium-355M": {"emb_dim":1024, "n_layers":24, "n_heads":16},
            "gpt2-large-774M": {"emb_dim":1280, "n_layers":36, "n_heads":20},
            "gpt2-xl-1558M": {"emb_dim":1600, "n_layers":48, "n_heads":25},
    }
    BASE_CONFIG.update(model_configs[CHOOSE_MODEL])
    
    # The text after the last "-" in the model name (e.g. "124M") is the size
    # passed to the download helper.
    model_size = CHOOSE_MODEL.split("-")[-1]
    settings, params = download_and_load_gpt2(
            model_size=model_size, models_dir="gpt2"
    )
    model = GPTModel(BASE_CONFIG)
    load_weights_into_gpt(model, params)
    
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()
    print(model)
    
    text_1 = "Every effort moves you"
    token_ids = generate_text_simple(
            model=model,
            idx=text_to_token_ids(text_1, tokenizer).to(device),
            max_new_tokens=15,
            context_size=BASE_CONFIG["context_length"])
    print(token_ids_to_text(token_ids, tokenizer))
    
    text_2 = (
            "Is the following text 'spam'? Answer with 'yes' or 'no':"
            " 'You are a winner you have been specially"
            " selected to receive $1000 cash or a $2000 award.'"
    )
    token_ids = generate_text_simple(
        model=model,
        idx=text_to_token_ids(text_2, tokenizer).to(device),
        max_new_tokens=23,
        context_size=BASE_CONFIG["context_length"]
    )
    print(token_ids.shape)
    print(token_ids_to_text(token_ids, tokenizer))
    """
    
    
    torch.manual_seed(123)
    num_classes = 2
    # Replace the output head with a new 2-class linear layer. A freshly created
    # Linear has requires_grad=True by default; note that the freezing loop below
    # runs afterwards and disables gradients for this layer as well, after which
    # it is wrapped by LoRA like every other Linear layer.
    model.out_head = torch.nn.Linear(
            in_features=BASE_CONFIG["emb_dim"],
            out_features=num_classes
    )
    
    # Experiments showed that fine-tuning additional layers can noticeably improve
    # predictive performance: the last Transformer block and the final layer norm
    # feeding the output layer could also be made trainable (disabled here).
    """
    for param in model.trf_blocks[-1].parameters():
        param.requires_grad = True
    for param in model.final_norm.parameters():
        param.requires_grad = True
    """
    
    # Prepare for fine-tuning by freezing the model, i.e. making every layer
    # non-trainable. The original parameters are frozen before the Linear layers
    # are upgraded to LinearWithLoRA.
    total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Total trainable parameters before: {total_params:,}")
    for param in model.parameters():
        param.requires_grad = False
    total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Total trainable parameters after: {total_params:,}")
    
    # Replace the original Linear layers. alpha is commonly set to half of, twice,
    # or equal to the rank.
    replace_linear_with_lora(model, rank=16, alpha=16)
    # Only the newly created LoRA matrices are trainable now, which reduces the
    # number of trainable parameters.
    total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Total trainable LoRA parameters: {total_params:,}")
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    # print(model)
    
    """
    torch.manual_seed(123)
    train_accuracy = calc_accuracy_loader(
            train_loader, model, device, num_batches=10
    )
    val_accuracy = calc_accuracy_loader(
            val_loader, model, device, num_batches=10
    )
    test_accuracy = calc_accuracy_loader(
            test_loader, model, device, num_batches=10
    )
    
    with torch.no_grad():
        train_loss = calc_loss_loader(
                train_loader, model, device, num_batches=5
        )
        val_loss = calc_loss_loader(val_loader, model, device, num_batches=5)
        test_loss = calc_loss_loader(test_loader, model, device, num_batches=5)
    print(f"Training accuracy: {train_accuracy*100:.2f}% loss: {train_loss:.3f}")
    print(f"Validation accuracy: {val_accuracy*100:.2f}% loss: {val_loss:.3f}")
    print(f"Test accuracy: {test_accuracy*100:.2f}% loss: {test_loss:.3f}")
    """
    
    
    import time
    # Time the fine-tuning run with perf_counter(), a high-resolution clock
    # intended for measuring elapsed time; time.time() reports wall-clock
    # time and can jump if the system clock is adjusted mid-run.
    start_time = time.perf_counter()
    # Fixed seed so the training run is repeatable.
    torch.manual_seed(123)
    # All model parameters are handed to AdamW, including those whose
    # requires_grad flag was switched off earlier in this script.
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.1)
    num_epochs = 5
    # eval_freq / eval_iter are forwarded unchanged; presumably the number of
    # steps between evaluations and the number of batches per evaluation --
    # confirm against train_classifier_simple.
    (train_losses, val_losses, train_accs, val_accs, examples_seen) = (
        train_classifier_simple(
            model, train_loader, val_loader, optimizer, device,
            num_epochs=num_epochs, eval_freq=50, eval_iter=5
        )
    )
    end_time = time.perf_counter()
    execution_time_minutes = (end_time - start_time) / 60
    print(f"Training completed in {execution_time_minutes:.2f} minutes.")
    
    
    import matplotlib.pyplot as plt
    def plot_values(
            epochs_seen, examples_seen, train_values, val_values,
            label="loss"):
        """Plot a training and a validation curve, save the figure, show it.

        Both curves are drawn against ``epochs_seen`` on the bottom x-axis. A
        second x-axis on top is scaled by ``examples_seen``. The figure is
        saved as ``finetune-classify-LoRA-{label}-plot`` and then displayed.

        Args:
            epochs_seen: x-coordinates (epochs) for both curves.
            examples_seen: x-coordinates used only to scale the top axis.
            train_values: y-values of the training curve.
            val_values: y-values of the validation curve.
            label: metric name used in the legend entries, the y-axis label
                and the output file name.
        """
        figure, epoch_axis = plt.subplots(figsize=(5, 3))

        # Training curve (default line style) and validation curve
        # (dash-dot) plotted against the number of epochs.
        epoch_axis.plot(epochs_seen, train_values, label=f"Training {label}")
        epoch_axis.plot(
            epochs_seen, val_values,
            linestyle="-.", label=f"Validation {label}",
        )
        epoch_axis.set_xlabel("Epochs")
        epoch_axis.set_ylabel(label.capitalize())
        epoch_axis.legend()

        # Second x-axis for the examples seen; the fully transparent curve
        # is drawn only so that this axis gets matching ticks.
        examples_axis = epoch_axis.twiny()
        examples_axis.plot(examples_seen, train_values, alpha=0)
        examples_axis.set_xlabel("Examples seen")

        figure.tight_layout()  # adjust the layout to make room
        plt.savefig(f"finetune-classify-LoRA-{label}-plot")
        plt.show()
    
    # Plot the loss curves of the classification fine-tuning run.
    # Both x-axes are evenly spaced (0..num_epochs and 0..examples_seen) with
    # one point per recorded loss value.
    epochs_tensor = torch.linspace(0, num_epochs, len(train_losses))
    examples_seen_tensor = torch.linspace(0, examples_seen, len(train_losses))
    plot_values(epochs_tensor, examples_seen_tensor, train_losses, val_losses)
    
    # Plot the classification accuracy curves the same way, with the axes
    # rebuilt to match the number of recorded accuracy values.
    epochs_tensor = torch.linspace(0, num_epochs, len(train_accs))
    examples_seen_tensor = torch.linspace(0, examples_seen, len(train_accs))
    plot_values(
            epochs_tensor, examples_seen_tensor, train_accs, val_accs, 
            label="accuracy"
    )
    
    # Compute the accuracy on the training, validation and test sets.
    # No num_batches argument is passed here, so this presumably evaluates
    # each loader in full -- confirm against calc_accuracy_loader.
    train_accuracy = calc_accuracy_loader(train_loader, model, device)
    val_accuracy = calc_accuracy_loader(val_loader, model, device)
    test_accuracy = calc_accuracy_loader(test_loader, model, device)
    print(f"Training accuracy: {train_accuracy*100:.2f}%")
    print(f"Validation accuracy: {val_accuracy*100:.2f}%")
    print(f"Test accuracy: {test_accuracy*100:.2f}%")
    

    模型结构

    GPTModel(
      (tok_emb): Embedding(50257, 768)
      (pos_emb): Embedding(1024, 768)
      (drop_emb): Dropout(p=0.0, inplace=False)
      (trf_blocks): Sequential(
        (0): TransformerBlock(
          (att): MultiHeadAttention(
            (W_query): LinearWithLoRA(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (W_key): LinearWithLoRA(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (W_value): LinearWithLoRA(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (out_proj): LinearWithLoRA(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (ff): FeedForward(
            (layers): Sequential(
              (0): LinearWithLoRA(
                (linear): Linear(in_features=768, out_features=3072, bias=True)
                (lora): LoRALayer()
              )
              (1): GELU()
              (2): LinearWithLoRA(
                (linear): Linear(in_features=3072, out_features=768, bias=True)
                (lora): LoRALayer()
              )
            )
          )
          (norm1): LayerNorm()
          (norm2): LayerNorm()
          (drop_resid): Dropout(p=0.0, inplace=False)
        )
        (1): TransformerBlock(
          (att): MultiHeadAttention(
            (W_query): LinearWithLoRA(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (W_key): LinearWithLoRA(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (W_value): LinearWithLoRA(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (out_proj): LinearWithLoRA(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (ff): FeedForward(
            (layers): Sequential(
              (0): LinearWithLoRA(
                (linear): Linear(in_features=768, out_features=3072, bias=True)
                (lora): LoRALayer()
              )
              (1): GELU()
              (2): LinearWithLoRA(
                (linear): Linear(in_features=3072, out_features=768, bias=True)
                (lora): LoRALayer()
              )
            )
          )
          (norm1): LayerNorm()
          (norm2): LayerNorm()
          (drop_resid): Dropout(p=0.0, inplace=False)
        )
        (2): TransformerBlock(
          (att): MultiHeadAttention(
            (W_query): LinearWithLoRA(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (W_key): LinearWithLoRA(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (W_value): LinearWithLoRA(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (out_proj): LinearWithLoRA(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (ff): FeedForward(
            (layers): Sequential(
              (0): LinearWithLoRA(
                (linear): Linear(in_features=768, out_features=3072, bias=True)
                (lora): LoRALayer()
              )
              (1): GELU()
              (2): LinearWithLoRA(
                (linear): Linear(in_features=3072, out_features=768, bias=True)
                (lora): LoRALayer()
              )
            )
          )
          (norm1): LayerNorm()
          (norm2): LayerNorm()
          (drop_resid): Dropout(p=0.0, inplace=False)
        )
        (3): TransformerBlock(
          (att): MultiHeadAttention(
            (W_query): LinearWithLoRA(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (W_key): LinearWithLoRA(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (W_value): LinearWithLoRA(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (out_proj): LinearWithLoRA(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (ff): FeedForward(
            (layers): Sequential(
              (0): LinearWithLoRA(
                (linear): Linear(in_features=768, out_features=3072, bias=True)
                (lora): LoRALayer()
              )
              (1): GELU()
              (2): LinearWithLoRA(
                (linear): Linear(in_features=3072, out_features=768, bias=True)
                (lora): LoRALayer()
              )
            )
          )
          (norm1): LayerNorm()
          (norm2): LayerNorm()
          (drop_resid): Dropout(p=0.0, inplace=False)
        )
        (4): TransformerBlock(
          (att): MultiHeadAttention(
            (W_query): LinearWithLoRA(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (W_key): LinearWithLoRA(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (W_value): LinearWithLoRA(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (out_proj): LinearWithLoRA(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (ff): FeedForward(
            (layers): Sequential(
              (0): LinearWithLoRA(
                (linear): Linear(in_features=768, out_features=3072, bias=True)
                (lora): LoRALayer()
              )
              (1): GELU()
              (2): LinearWithLoRA(
                (linear): Linear(in_features=3072, out_features=768, bias=True)
                (lora): LoRALayer()
              )
            )
          )
          (norm1): LayerNorm()
          (norm2): LayerNorm()
          (drop_resid): Dropout(p=0.0, inplace=False)
        )
        (5): TransformerBlock(
          (att): MultiHeadAttention(
            (W_query): LinearWithLoRA(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (W_key): LinearWithLoRA(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (W_value): LinearWithLoRA(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (out_proj): LinearWithLoRA(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (ff): FeedForward(
            (layers): Sequential(
              (0): LinearWithLoRA(
                (linear): Linear(in_features=768, out_features=3072, bias=True)
                (lora): LoRALayer()
              )
              (1): GELU()
              (2): LinearWithLoRA(
                (linear): Linear(in_features=3072, out_features=768, bias=True)
                (lora): LoRALayer()
              )
            )
          )
          (norm1): LayerNorm()
          (norm2): LayerNorm()
          (drop_resid): Dropout(p=0.0, inplace=False)
        )
        (6): TransformerBlock(
          (att): MultiHeadAttention(
            (W_query): LinearWithLoRA(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (W_key): LinearWithLoRA(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (W_value): LinearWithLoRA(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (out_proj): LinearWithLoRA(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (ff): FeedForward(
            (layers): Sequential(
              (0): LinearWithLoRA(
                (linear): Linear(in_features=768, out_features=3072, bias=True)
                (lora): LoRALayer()
              )
              (1): GELU()
              (2): LinearWithLoRA(
                (linear): Linear(in_features=3072, out_features=768, bias=True)
                (lora): LoRALayer()
              )
            )
          )
          (norm1): LayerNorm()
          (norm2): LayerNorm()
          (drop_resid): Dropout(p=0.0, inplace=False)
        )
        (7): TransformerBlock(
          (att): MultiHeadAttention(
            (W_query): LinearWithLoRA(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (W_key): LinearWithLoRA(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (W_value): LinearWithLoRA(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (out_proj): LinearWithLoRA(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (ff): FeedForward(
            (layers): Sequential(
              (0): LinearWithLoRA(
                (linear): Linear(in_features=768, out_features=3072, bias=True)
                (lora): LoRALayer()
              )
              (1): GELU()
              (2): LinearWithLoRA(
                (linear): Linear(in_features=3072, out_features=768, bias=True)
                (lora): LoRALayer()
              )
            )
          )
          (norm1): LayerNorm()
          (norm2): LayerNorm()
          (drop_resid): Dropout(p=0.0, inplace=False)
        )
        (8): TransformerBlock(
          (att): MultiHeadAttention(
            (W_query): LinearWithLoRA(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (W_key): LinearWithLoRA(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (W_value): LinearWithLoRA(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (out_proj): LinearWithLoRA(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (ff): FeedForward(
            (layers): Sequential(
              (0): LinearWithLoRA(
                (linear): Linear(in_features=768, out_features=3072, bias=True)
                (lora): LoRALayer()
              )
              (1): GELU()
              (2): LinearWithLoRA(
                (linear): Linear(in_features=3072, out_features=768, bias=True)
                (lora): LoRALayer()
              )
            )
          )
          (norm1): LayerNorm()
          (norm2): LayerNorm()
          (drop_resid): Dropout(p=0.0, inplace=False)
        )
        (9): TransformerBlock(
          (att): MultiHeadAttention(
            (W_query): LinearWithLoRA(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (W_key): LinearWithLoRA(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (W_value): LinearWithLoRA(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (out_proj): LinearWithLoRA(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (ff): FeedForward(
            (layers): Sequential(
              (0): LinearWithLoRA(
                (linear): Linear(in_features=768, out_features=3072, bias=True)
                (lora): LoRALayer()
              )
              (1): GELU()
              (2): LinearWithLoRA(
                (linear): Linear(in_features=3072, out_features=768, bias=True)
                (lora): LoRALayer()
              )
            )
          )
          (norm1): LayerNorm()
          (norm2): LayerNorm()
          (drop_resid): Dropout(p=0.0, inplace=False)
        )
        (10): TransformerBlock(
          (att): MultiHeadAttention(
            (W_query): LinearWithLoRA(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (W_key): LinearWithLoRA(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (W_value): LinearWithLoRA(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (out_proj): LinearWithLoRA(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (ff): FeedForward(
            (layers): Sequential(
              (0): LinearWithLoRA(
                (linear): Linear(in_features=768, out_features=3072, bias=True)
                (lora): LoRALayer()
              )
              (1): GELU()
              (2): LinearWithLoRA(
                (linear): Linear(in_features=3072, out_features=768, bias=True)
                (lora): LoRALayer()
              )
            )
          )
          (norm1): LayerNorm()
          (norm2): LayerNorm()
          (drop_resid): Dropout(p=0.0, inplace=False)
        )
        (11): TransformerBlock(
          (att): MultiHeadAttention(
            (W_query): LinearWithLoRA(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (W_key): LinearWithLoRA(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (W_value): LinearWithLoRA(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (out_proj): LinearWithLoRA(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (ff): FeedForward(
            (layers): Sequential(
              (0): LinearWithLoRA(
                (linear): Linear(in_features=768, out_features=3072, bias=True)
                (lora): LoRALayer()
              )
              (1): GELU()
              (2): LinearWithLoRA(
                (linear): Linear(in_features=3072, out_features=768, bias=True)
                (lora): LoRALayer()
              )
            )
          )
          (norm1): LayerNorm()
          (norm2): LayerNorm()
          (drop_resid): Dropout(p=0.0, inplace=False)
        )
      )
      (final_norm): LayerNorm()
      (out_head): LinearWithLoRA(
        (linear): Linear(in_features=768, out_features=2, bias=True)
        (lora): LoRALayer()
      )
    )

    输出

    Ep 1 (Step 000000): Train loss 3.651, Val loss 3.300
    Ep 1 (Step 000050): Train loss 0.275, Val loss 0.259
    Ep 1 (Step 000100): Train loss 0.266, Val loss 0.488
    Training accuracy: 97.50% | Validation accuracy: 97.50%
    Ep 2 (Step 000150): Train loss 0.207, Val loss 0.043
    Ep 2 (Step 000200): Train loss 0.011, Val loss 0.165
    Ep 2 (Step 000250): Train loss 0.050, Val loss 0.049
    Training accuracy: 97.50% | Validation accuracy: 92.50%
    Ep 3 (Step 000300): Train loss 0.092, Val loss 0.011
    Ep 3 (Step 000350): Train loss 0.123, Val loss 0.341
    Training accuracy: 95.00% | Validation accuracy: 95.00%
    Ep 4 (Step 000400): Train loss 0.017, Val loss 0.083
    Ep 4 (Step 000450): Train loss 0.031, Val loss 0.113
    Ep 4 (Step 000500): Train loss 0.005, Val loss 0.069
    Training accuracy: 100.00% | Validation accuracy: 95.00%
    Ep 5 (Step 000550): Train loss 0.075, Val loss 0.088
    Ep 5 (Step 000600): Train loss 0.086, Val loss 0.175
    Training accuracy: 100.00% | Validation accuracy: 100.00%
    Training completed in 0.44 minutes.
    
    Training accuracy: 99.81%
    Validation accuracy: 99.33%
    Test accuracy: 97.33%

    Logo

    更多推荐