import torch
import re
from unsloth import FastLanguageModel, PatchFastRL, is_bfloat16_supported
from datasets import load_dataset, Dataset
from trl import GRPOConfig, GRPOTrainer
# Enable Unsloth's GRPO patches before the model is loaded.
PatchFastRL("GRPO", FastLanguageModel)
model, tokenizer = FastLanguageModel.from_pretrained(...)  # arguments elided in this excerpt; an illustrative expansion follows
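# One plausible expansion of the elided arguments for a 4-bit LoRA + vLLM
# setup. The model name and sizes are illustrative assumptions, not values
# from the original:
#
#   model, tokenizer = FastLanguageModel.from_pretrained(
#       model_name = "meta-llama/meta-Llama-3.1-8B-Instruct",
#       max_seq_length = 512,
#       load_in_4bit = True,
#       fast_inference = True,         # vLLM backend, needed for use_vllm below
#       max_lora_rank = 32,
#       gpu_memory_utilization = 0.6,
#   )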
model = FastLanguageModel.get_peft_model(...)  # likewise elided; an illustrative LoRA config follows
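# An illustrative LoRA configuration; the rank, target modules, and seed
# here are assumptions rather than the source's settings:
#
#   model = FastLanguageModel.get_peft_model(
#       model,
#       r = 32,
#       target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
#                         "gate_proj", "up_proj", "down_proj"],
#       lora_alpha = 32,
#       use_gradient_checkpointing = "unsloth",
#       random_state = 3407,
#   )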
dataset = get_gsm8k_questions()  # prompt-formatted GSM8K; a sketch of this helper follows
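# get_gsm8k_questions is referenced but not defined in this excerpt. A minimal
# sketch, assuming the standard GSM8K columns ("question", "answer" ending in
# "#### <number>") and a system prompt matching the <reasoning>/<answer>
# format the reward functions check for:

SYSTEM_PROMPT = """Respond in the following format:
<reasoning>
...
</reasoning>
<answer>
...
</answer>"""

def extract_hash_answer(text: str):
    # GSM8K reference answers end in "#### <number>".
    return text.split("####")[1].strip() if "####" in text else None

def get_gsm8k_questions(split: str = "train") -> Dataset:
    data = load_dataset("openai/gsm8k", "main")[split]
    return data.map(lambda x: {
        "prompt": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": x["question"]},
        ],
        "answer": extract_hash_answer(x["answer"]),
    })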
training_args = GRPOConfig(
    use_vllm = True,  # use vLLM for fast rollout generation
    learning_rate = 5e-6,
    adam_beta1 = 0.9,
    adam_beta2 = 0.99,
    weight_decay = 0.1,
    warmup_ratio = 0.1,
    lr_scheduler_type = "cosine",
    optim = "adamw_8bit",
    logging_steps = 1,
    bf16 = is_bfloat16_supported(),
    fp16 = not is_bfloat16_supported(),
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 1,
    num_generations = 8,  # completions sampled per prompt for the group baseline
    max_prompt_length = 256,
    max_completion_length = 200,
    max_steps = 250,
    save_steps = 250,
    max_grad_norm = 0.1,
    report_to = "tensorboard",
    output_dir = "outputs",
)
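# The five reward functions passed to GRPOTrainer below are referenced but not
# defined in this excerpt. A condensed sketch, assuming completions arrive in
# chat format and answers follow the <reasoning>/<answer> template; the bodies
# and reward magnitudes here are illustrative:

def extract_xml_answer(text: str) -> str:
    # Pull the text between <answer> tags.
    return text.split("<answer>")[-1].split("</answer>")[0].strip()

def correctness_reward_func(prompts, completions, answer, **kwargs):
    # 2.0 if the extracted answer matches the reference, else 0.0.
    responses = [c[0]["content"] for c in completions]
    return [2.0 if extract_xml_answer(r) == a else 0.0
            for r, a in zip(responses, answer)]

def int_reward_func(completions, **kwargs):
    # Small reward for producing an integer answer at all.
    responses = [c[0]["content"] for c in completions]
    return [0.5 if extract_xml_answer(r).isdigit() else 0.0 for r in responses]

def strict_format_reward_func(completions, **kwargs):
    # Reward exact adherence to the expected tag layout.
    pattern = r"^<reasoning>\n.*?\n</reasoning>\n<answer>\n.*?\n</answer>\n?$"
    responses = [c[0]["content"] for c in completions]
    return [0.5 if re.match(pattern, r, flags=re.DOTALL) else 0.0
            for r in responses]

def soft_format_reward_func(completions, **kwargs):
    # Looser check: both tag pairs present in order, whitespace flexible.
    pattern = r"<reasoning>.*?</reasoning>\s*<answer>.*?</answer>"
    responses = [c[0]["content"] for c in completions]
    return [0.5 if re.search(pattern, r, flags=re.DOTALL) else 0.0
            for r in responses]

def xmlcount_reward_func(completions, **kwargs):
    # Partial credit for each well-placed tag.
    def count_xml(text):
        score = 0.0
        for tag in ("<reasoning>\n", "\n</reasoning>\n", "\n<answer>\n", "\n</answer>"):
            if text.count(tag) == 1:
                score += 0.125
        return score
    return [count_xml(c[0]["content"]) for c in completions]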
trainer = GRPOTrainer(
    model = model,
    processing_class = tokenizer,
    reward_funcs = [
        xmlcount_reward_func,
        soft_format_reward_func,
        strict_format_reward_func,
        int_reward_func,
        correctness_reward_func,
    ],
    args = training_args,
    train_dataset = dataset,
)
trainer.train()
model.save_lora("grpo_saved_lora")
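# Usage sketch: exercising the saved LoRA through Unsloth's vLLM-backed
# generation helpers. The prompt and sampling values are illustrative, and
# the exact return structure is assumed to follow vLLM's RequestOutput:
from vllm import SamplingParams

text = tokenizer.apply_chat_template(
    [{"role": "user", "content": "How many r's are in strawberry?"}],
    tokenize = False,
    add_generation_prompt = True,
)
output = model.fast_generate(
    text,
    sampling_params = SamplingParams(temperature = 0.8, top_p = 0.95, max_tokens = 200),
    lora_request = model.load_lora("grpo_saved_lora"),
)[0].outputs[0].text
print(output)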