Chapter-4 微调大模型

1 环境配置

安装的第三方库如下:

!pip install -q datasets pandas peft

2 构造微调数据集

因此,当我们进行 LLM SFT 以提升 LLM 在指定下游任务的表现时,我们需要将训练数据构造成上述格式,并对数据集进行处理来支持模型微调。接下来,我们以角色扮演任务(要求 LLM 扮演甄嬛,以甄嬛的语气、风格与用户对话)为例,演示如何进行微调数据集构造。

我们使用从《甄嬛传》剧本提取出来的对话作为基准来构建训练数据集。提取出来的对话包括上述格式的键:

  • instruction:即对话的上文;
  • input:此处置为空;
  • output:即甄嬛的回复。

微调数据集可以在此处下载:https://github.com/KMnO4-zx/huanhuan-chat/blob/master/dataset/train/lora/huanhuan.json

# 加载第三方库
from datasets import Dataset
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer

# 将JSON文件转换为CSV文件
df = pd.read_json('./huanhuan.json') # 注意修改
ds = Dataset.from_pandas(df)

ds[0]

显示如下:

查看一下 Qwen-3-4B 的指令格式:

# 加载模型 tokenizer 
tokenizer = AutoTokenizer.from_pretrained('/root/autodl-tmp/model/Qwen/Qwen3-4B-Thinking-2507', trust_remote_code=True)

# 打印一下 chat template
messages = [
    {"role": "system", "content": "===system_message_test==="},
    {"role": "user", "content": "===user_message_test==="},
    {"role": "assistant", "content": "===assistant_message_test==="},
]

text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=True
)
print(text)

显示如下:

<|im_start|>system
===system_message_test===<|im_end|>
<|im_start|>user
===user_message_test===<|im_end|>
<|im_start|>assistant
<think>

</think>

===assistant_message_test===<|im_end|>
<|im_start|>assistant
<think>

接着,我们基于上文打印的指令格式,完成数据集处理函数:

def process_func(example):
    MAX_LENGTH = 1024 # 设置最大序列长度为1024个token
    input_ids, attention_mask, labels = [], [], [] # 初始化返回值
    # 适配chat_template
    instruction = tokenizer(
        f"<s><|im_start|>system\n现在你要扮演皇帝身边的女人--甄嬛<|im_end|>\n" 
        f"<|im_start|>user\n{example['instruction'] + example['input']}<|im_end|>\n"  
        f"<|im_start|>assistant\n<think>\n\n</think>\n\n",  
        add_special_tokens=False   
    )
    response = tokenizer(f"{example['output']}", add_special_tokens=False)
    # 将instructio部分和response部分的input_ids拼接,并在末尾添加eos token作为标记结束的token
    input_ids = instruction["input_ids"] + response["input_ids"] + [tokenizer.pad_token_id]
    # 注意力掩码,表示模型需要关注的位置
    attention_mask = instruction["attention_mask"] + response["attention_mask"] + [1]
    # 对于instruction,使用-100表示这些位置不计算loss(即模型不需要预测这部分)
    labels = [-100] * len(instruction["input_ids"]) + response["input_ids"] + [tokenizer.pad_token_id]  
    if len(input_ids) > MAX_LENGTH:  # 超出最大序列长度截断
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }
# 使用上文定义的函数对数据集进行处理
tokenized_id = ds.map(process_func, remove_columns=ds.column_names)
tokenized_id

显示如下:

Map: 100%

 3729/3729 [00:01<00:00, 3272.39 examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 3729
})
# 最终模型的输入
print(tokenizer.decode(tokenized_id[0]['input_ids']))
<s><|im_start|>system
现在你要扮演皇帝身边的女人--甄嬛<|im_end|>
<|im_start|>user
小姐,别的秀女都在求中选,唯有咱们小姐想被撂牌子,菩萨一定记得真真儿的——<|im_end|>
<|im_start|>assistant
<think>

</think>

嘘——都说许愿说破是不灵的。<|endoftext|>
# 最终模型的输出
print(tokenizer.decode(list(filter(lambda x: x != -100, tokenized_id[0]["labels"]))))
嘘——都说许愿说破是不灵的。<|endoftext|>

3 高效微调-LoRA

如何进行 LoRA 微调

我们使用 peft 库来高效、便捷地实现 LoRA 微调。

# 首先配置 LoRA 参数
from peft import LoraConfig, TaskType, get_peft_model

config = LoraConfig(
    task_type=TaskType.CAUSAL_LM, # 任务类型为 CLM,即 SFT 任务的类型
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"], # 目标模块,即需要进行 LoRA 微调的模块
    inference_mode=False, # 训练模式
    r=8, # Lora 秩,即 LoRA 微调的维度
    lora_alpha=32, # Lora alaph,具体作用参见 Lora 原理
    lora_dropout=0.1# Dropout 比例
)
config
LoraConfig(task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, inference_mode=False, r=8, target_modules={'k_proj', 'o_proj', 'down_proj', 'q_proj', 'up_proj', 'v_proj', 'gate_proj'}, exclude_modules=None, lora_alpha=32, lora_dropout=0.1, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', trainable_token_indices=None, loftq_config={}, eva_config=None, corda_config=None, use_dora=False, use_qalora=False, qalora_group_size=16, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_bias=False, target_parameters=None)
import torch
# 加载基座模型
model = AutoModelForCausalLM.from_pretrained('/root/autodl-tmp/model/Qwen/Qwen3-4B-Thinking-2507', device_map="auto",torch_dtype=torch.bfloat16)
# 开启模型梯度检查点能降低训练的显存占用
model.enable_input_require_grads()
# 通过下列代码即可向模型中添加 LoRA 模块
model = get_peft_model(model, config)
config

Loading checkpoint shards: 100%

 3/3 [00:01<00:00,  1.10it/s]

LoraConfig(task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path='/root/autodl-tmp/model/Qwen/Qwen3-4B-Thinking-2507', revision=None, inference_mode=False, r=8, target_modules={'k_proj', 'o_proj', 'down_proj', 'q_proj', 'up_proj', 'v_proj', 'gate_proj'}, exclude_modules=None, lora_alpha=32, lora_dropout=0.1, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', trainable_token_indices=None, loftq_config={}, eva_config=None, corda_config=None, use_dora=False, use_qalora=False, qalora_group_size=16, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_bias=False, target_parameters=None)
# 查看 lora 微调的模型参数
model.print_trainable_parameters()
trainable params: 16,515,072 || all params: 4,038,983,168 || trainable%: 0.4089
!pip install swanlab
# 配置 swanlab
import swanlab
from swanlab.integration.transformers import SwanLabCallback

swanlab.login(api_key='your api key', save=False)

# 实例化SwanLabCallback
swanlab_callback = SwanLabCallback(
    project="Qwen3-4B-lora", 
    experiment_name="Qwen3-4B-experiment"
)

swanlab 的 api key 可以通过登录官网注册账号获得:https://swanlab.cn/
这里贴入自己申请的key

from swanlab.integration.transformers import SwanLabCallback

# 配置训练参数
args = TrainingArguments(
    output_dir="./output/Qwen3_4B_lora", # 输出目录
    per_device_train_batch_size=16, # 每个设备上的训练批量大小
    gradient_accumulation_steps=2, # 梯度累积步数
    logging_steps=10, # 每10步打印一次日志
    num_train_epochs=3, # 训练轮数
    save_steps=100, # 每100步保存一次模型
    learning_rate=1e-4, # 学习率
    save_on_each_node=True, # 是否在每个节点上保存模型
    gradient_checkpointing=True, # 是否使用梯度检查点
    report_to="none", # 不使用任何报告工具
)
# 然后使用 trainer 训练即可
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_id,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
    callbacks=[swanlab_callback]
)

# 开始训练
trainer.train()
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from peft import PeftModel

mode_path = 'model/Qwen/Qwen3-4B-Instruct-2507'# 基座模型参数路径
lora_path = './output/Qwen3_4B_lora/checkpoint-351' # 这里改成你的 lora 输出对应 checkpoint 地址

# 加载tokenizer
tokenizer = AutoTokenizer.from_pretrained(mode_path)

# 加载模型
model = AutoModelForCausalLM.from_pretrained(mode_path, device_map="auto",torch_dtype=torch.bfloat16, trust_remote_code=True)

# 加载lora权重
model = PeftModel.from_pretrained(model, model_id=lora_path)
prompt = "你是谁?"
inputs = tokenizer.apply_chat_template(
                                    [{"role": "user", "content": "假设你是皇帝身边的女人--甄嬛。"},{"role": "user", "content": prompt}],
                                    add_generation_prompt=True,
                                    tokenize=True,
                                    return_tensors="pt",
                                    return_dict=True,
                                    enable_thinking=False
                                )
inputs = inputs.to("cuda")


gen_kwargs = {"max_length": 2500, "do_sample": True, "top_k": 1}
with torch.no_grad():
    outputs = model.generate(**inputs, **gen_kwargs)
    outputs = outputs[:, inputs['input_ids'].shape[1]:]
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from peft import PeftModel

mode_path = '/root/autodl-tmp/model/Qwen/Qwen3-4B-Thinking-2507' # 基座模型参数路径
lora_path = './output/Qwen3_4B_lora/checkpoint-351' # 这里改成你的 lora 输出对应 checkpoint 地址

# 加载tokenizer
tokenizer = AutoTokenizer.from_pretrained(mode_path)

# 加载模型
model = AutoModelForCausalLM.from_pretrained(mode_path, device_map="auto",torch_dtype=torch.bfloat16, trust_remote_code=True)

# 加载lora权重
model = PeftModel.from_pretrained(model, model_id=lora_path)

prompt = "你是谁?"
inputs = tokenizer.apply_chat_template(
                                    [{"role": "user", "content": "假设你是皇帝身边的女人--甄嬛。"},{"role": "user", "content": prompt}],
                                    add_generation_prompt=True,
                                    tokenize=True,
                                    return_tensors="pt",
                                    return_dict=True,
                                    enable_thinking=False
                                )
inputs = inputs.to("cuda")


gen_kwargs = {"max_length": 2500, "do_sample": True, "top_k": 1}
with torch.no_grad():
    outputs = model.generate(**inputs, **gen_kwargs)
    outputs = outputs[:, inputs['input_ids'].shape[1]:]
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))
我是甄嬛,家父是大理寺少卿甄远道。

作业

请大家选择一个 NLP 经典任务(例如情感分类、命名实体识别等),收集该任务的经典训练数据对 Qwen3-4B 进行微调,并评估微调后模型在该任务上的效果

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Qwen3-4B + LoRA 中文情感分类 完整微调脚本
运行前请确保:
  1  已安装 transformers>=4.40, peft, swanlab, datasets, accelerate
  2  模型已下载到 /root/autodl-tmp/model/Qwen/Qwen3-4B-Instruct-2507
  3  已设置 SwanLab API_KEY 环境变量(21 位字母数字)
"""

import os, torch, json, requests, warnings
from datasets import load_dataset
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    TrainingArguments, Trainer, DataCollatorForSeq2Seq
)
from peft import LoraConfig, get_peft_model, TaskType
from swanlab.integration.transformers import SwanLabCallback
import swanlab

warnings.filterwarnings("ignore")
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# ---------- 0 超参数 ----------
MODEL_PATH = "/root/autodl-tmp/model/Qwen/Qwen3-4B-Instruct-2507"
DATA_URL = "https://huggingface.co/datasets/seamew/ChnSentiCorp/resolve/main/ChnSentiCorp.jsonl"
MAX_LEN = 256
BATCH_SIZE = 4
GRAD_ACC = 4
EPOCHS = 1
LR = 2e-4
LORA_R = 64
LORA_ALPHA = 32
LORA_DROPOUT = 0.05

# ---------- 1 下载数据 ----------
data_path = "ChnSentiCorp.jsonl"
if not os.path.exists(data_path):
    print("Downloading ChnSentiCorp...")
    with requests.get(DATA_URL, stream=True) as r:
        r.raise_for_status()
        with open(data_path, "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)

raw_ds = load_dataset("json", data_files=data_path, split="train").train_test_split(0.1)

# ---------- 2 加载 tokenizer & 模型 ----------
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
    local_files_only=True
)

# ---------- 3 LoRA ----------
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"]
)
model = get_peft_model(model, lora_config)
model.enable_input_require_grads()
model.print_trainable_parameters()

# ---------- 4 数据格式 ----------
def fmt(example):
    prompt = f"Instruction:判断这条评论的情感倾向\nInput:{example['input']}\nOutput:"
    full = prompt + example['output'] + tokenizer.eos_token
    return tokenizer(full, truncation=True, max_length=MAX_LEN)

def map_ds(ds):
    ds = ds.map(fmt, remove_columns=ds.column_names)
    ds = ds.shuffle(seed=42)
    return ds

train_ds = map_ds(raw_ds["train"])
eval_ds = map_ds(raw_ds["test"])

# ---------- 5 SwanLab ----------
swanlab.login(api_key=os.getenv("SWANLAB_API_KEY"), save=False)
swanlab_callback = SwanLabCallback(project="Qwen3-4B-lora-sentiment")

# ---------- 6 Trainer ----------
args = TrainingArguments(
    output_dir="./qwen3_lora_sentiment",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACC,
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    bf16=True,
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    report_to="none",  # 由 swanlab 接管
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    data_collator=DataCollatorForSeq2Seq(tokenizer, pad_to_multiple_of=8),
    callbacks=[swanlab_callback],
)

# ---------- 7 训练 ----------
trainer.train()

# ---------- 8 快速评估 ----------
from sklearn.metrics import accuracy_score

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    preds = preds.argmax(dim=-1)
    mask = labels != -100
    preds = preds[mask].cpu().numpy()
    labels = labels[mask].cpu().numpy()
    return {"accuracy": accuracy_score(labels, preds)}

trainer.compute_metrics = compute_metrics
print(trainer.evaluate())

Logo

更多推荐