
Fine-tuning Large Models

Direct Inference

Llama3

import transformers
import torch

model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)

messages = [
    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
    {"role": "user", "content": "Do you know Genshin Impact?"},
]

outputs = pipeline(
    messages,
    max_new_tokens=256,
)
# generated_text holds the whole conversation; the last element is the assistant's reply
print(outputs[0]["generated_text"][-1])

Qwen2.5

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen2.5-7B-Instruct"

model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)

prompt = "Give me a brief introduction to large language models."
messages = [
    {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=512
)
# Keep only the newly generated tokens (strip the prompt tokens)
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)

Qwen2.5 VL

Use Qwen2.5 VL to recognize an image

from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
import cv2
import os

model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2.5-VL-7B-Instruct", torch_dtype="auto", device_map="auto"
)
processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")

# Resize the input image first; larger images consume more GPU memory
image = cv2.imread('assets/pgfjebp_4K_Albedo.jpg')
MAX_SIZE = 1280
height, width = image.shape[:2]
scale_factor = min(MAX_SIZE / width, MAX_SIZE / height)
new_width = int(width * scale_factor)
new_height = int(height * scale_factor)
resized_image = cv2.resize(image, (new_width, new_height), interpolation=cv2.INTER_LINEAR)
os.makedirs('temp', exist_ok=True)
cv2.imwrite('temp/test.jpg', resized_image)

messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": "file://temp/test.jpg",
            },
            {"type": "text", "text": "Determine which of the following types this image is: Normal map, Albedo map, Roughness map. Only respond with the type."},
        ],
    }
]

# Preparation for inference
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to(model.device)

# Inference: generate the output
generated_ids = model.generate(**inputs, max_new_tokens=128)
# Keep only the newly generated tokens
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)
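
As an alternative to resizing the image on disk, the processor can bound the visual token budget directly. The values below follow the ranges suggested in the Qwen2.5-VL model card and are illustrative, not part of the original snippet:

# Trade visual detail for memory by bounding the per-image pixel budget
min_pixels = 256 * 28 * 28
max_pixels = 1280 * 28 * 28
processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2.5-VL-7B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels
)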

vLLM deployment

vllm serve Qwen/Qwen2.5-7B-Instruct --host 0.0.0.0 --port 7861
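
Once the server is up, a quick way to confirm it is serving the model is to query the OpenAI-compatible /v1/models endpoint. A minimal sketch; adjust host and port to match the command above:

import requests

# List the models exposed by the vLLM OpenAI-compatible server
resp = requests.get("http://localhost:7861/v1/models")
print(resp.json())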

openai api

from openai import OpenAI

openai_api_key = "EMPTY"
openai_api_base = "xxx/v1"

client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)

chat_response = client.chat.completions.create(
    model="Qwen/Qwen2.5-7B-Instruct",
    messages=[
        {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
        {"role": "user", "content": "Tell me something about large language models."},
    ],
    temperature=0.7,
    top_p=0.8,
    max_tokens=512,
    extra_body={
        "repetition_penalty": 1.05,
    },
)
print("Chat response:", chat_response)

SFT

SFT can be used to teach the model new knowledge or capabilities.

There are many frameworks for fine-tuning large models: Megatron is very capable but has a high barrier to entry, while unsloth and LLaMA-Factory are more toy-like but well suited to beginners.

unsloth

unsloth is an SFT framework that is very convenient to use, but the free version only supports a single GPU on a single machine.

See Llama3.1_(8B)

After training finishes, a checkpoint is saved in the outputs folder.

import torch
from unsloth import FastLanguageModel
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

model_id = "xxx"
max_seq_length = 2048  # must match the max_seq_length passed to from_pretrained below
# Load the model
model, tokenizer = FastLanguageModel.from_pretrained(...)
# Convert to a PEFT model to train a LoRA adapter
model = FastLanguageModel.get_peft_model(...)

# Format the dataset with the Alpaca prompt template
alpaca_prompt = """..."""
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs = examples["input"]
    outputs = examples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }

# Load the dataset
dataset = load_dataset("yahma/alpaca-cleaned", split = "train")
# Process the dataset
dataset = dataset.map(formatting_prompts_func, batched = True,)

# Configure training arguments
training_args = TrainingArguments(
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,
    warmup_steps = 5,
    # num_train_epochs = 1, # Set this for 1 full training run.
    max_steps = 60,
    learning_rate = 2e-4,
    fp16 = not is_bfloat16_supported(),
    bf16 = is_bfloat16_supported(),
    logging_steps = 1,
    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = 3407,
    output_dir = "outputs",
    report_to = "none", # Use this for WandB etc
)
# Train
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = training_args,
)
trainer_stats = trainer.train()
# Save the LoRA adapter
model.save_pretrained("lora_model")
tokenizer.save_pretrained("lora_model")

Test the checkpoint

import transformers
import torch

# Use the local checkpoint saved during training
model_id = "outputs/checkpoint-60"

pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)

messages = [
    {"role": "system", "content": "Give three tips for staying healthy"},
    {"role": "user", "content": "Do you know Genshin Impact?"},
]

outputs = pipeline(
    messages,
    max_new_tokens=256,
)
print(outputs[0]["generated_text"][-1])

The output clearly shows that the model has memorized a lot of medical knowledge.

LLaMA-Factory

An open-source LLM SFT framework that supports multi-node, multi-GPU training, though compared with unsloth it feels even more like a toy.

Installation

git clone --depth 1 https://github.com/hiyouga/LLaMA-Factory.git
cd LLaMA-Factory
pip install -e ".[torch,metrics]"
llamafactory-cli version

sft

accelerate launch --config_file deepspeed.config src/train.py llm_config.yaml

# deepspeed.config
compute_environment: LOCAL_MACHINE
debug: false
deepspeed_config:
  deepspeed_multinode_launcher: standard
  gradient_accumulation_steps: 8
  offload_optimizer_device: none
  offload_param_device: none
  zero3_init_flag: false
  zero_stage: 3
distributed_type: DEEPSPEED
downcast_bf16: 'no'
enable_cpu_affinity: false
machine_rank: 0
main_process_ip: '100.96.220.190'
main_process_port: 29500
main_training_function: main
mixed_precision: fp16
num_machines: 1
num_processes: 8
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false

# llm_config.yaml
model_name_or_path: Qwen/Qwen2.5-7B-Instruct

stage: sft
do_train: true
finetuning_type: lora
lora_target: all

dataset: alpaca_zh_demo
template: qwen
cutoff_len: 1024
max_samples: 1000
overwrite_cache: true
preprocessing_num_workers: 16

output_dir: saves/qwen-8b/lora/sft
logging_steps: 10
save_steps: 500
plot_loss: true
overwrite_output_dir: true

per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 1.0e-4
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000

val_size: 0.1
per_device_eval_batch_size: 1
eval_strategy: steps
eval_steps: 500
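
After training finishes, the LoRA adapter can be loaded for a quick test. A minimal sketch with transformers + peft, assuming the final adapter ends up under the output_dir from the config above (checkpoints also land in checkpoint-* subfolders):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-7B-Instruct", torch_dtype=torch.bfloat16, device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct")
# Attach the adapter saved by LLaMA-Factory
model = PeftModel.from_pretrained(base_model, "saves/qwen-8b/lora/sft")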

Reasoning

The most popular type of model since DeepSeek took off: it reasons with a long chain of thought and is trained with RL.

unsloth

unsloth provides a GRPO-based training recipe.

import torch
import re
from unsloth import FastLanguageModel, PatchFastRL, is_bfloat16_supported
from datasets import load_dataset, Dataset
from trl import GRPOConfig, GRPOTrainer
# Patch FastLanguageModel so that it supports GRPO
PatchFastRL("GRPO", FastLanguageModel)
# Load the model
model, tokenizer = FastLanguageModel.from_pretrained(...)
# Convert to a PEFT model to train a LoRA adapter
model = FastLanguageModel.get_peft_model(...)
# Load the dataset (GSM8K questions formatted into prompts)
dataset = get_gsm8k_questions()
# Training configuration
training_args = GRPOConfig(
    use_vllm = True, # use vLLM for fast inference!
    learning_rate = 5e-6,
    adam_beta1 = 0.9,
    adam_beta2 = 0.99,
    weight_decay = 0.1,
    warmup_ratio = 0.1,
    lr_scheduler_type = "cosine",
    optim = "adamw_8bit",
    logging_steps = 1,
    bf16 = is_bfloat16_supported(),
    fp16 = not is_bfloat16_supported(),
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 1, # Increase to 4 for smoother training
    num_generations = 8, # Decrease if out of memory
    max_prompt_length = 256,
    max_completion_length = 200,
    # num_train_epochs = 1, # Set to 1 for a full training run
    max_steps = 250,
    save_steps = 250,
    max_grad_norm = 0.1,
    report_to = "tensorboard", # Can use Weights & Biases
    output_dir = "outputs",
)
# Train; reward_funcs are the RL reward functions, here tied to the structure of the generated text
trainer = GRPOTrainer(
    model = model,
    processing_class = tokenizer,
    reward_funcs = [
        xmlcount_reward_func,
        soft_format_reward_func,
        strict_format_reward_func,
        int_reward_func,
        correctness_reward_func,
    ],
    args = training_args,
    train_dataset = dataset,
)
trainer.train()
# Save the LoRA adapter
model.save_lora("grpo_saved_lora")
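
Since the reward functions above are only referenced by name, here is a minimal sketch of what one of the structure-based rewards can look like; the implementation below is an illustrative assumption modeled on the public unsloth GRPO example, not code from this post:

import re

# Reward completions that follow the <reasoning>...</reasoning><answer>...</answer> layout.
# GRPOTrainer passes each completion as a list of chat messages when the dataset is conversational.
def soft_format_reward_func(completions, **kwargs):
    pattern = r"<reasoning>.*?</reasoning>\s*<answer>.*?</answer>"
    responses = [completion[0]["content"] for completion in completions]
    return [0.5 if re.search(pattern, response, re.DOTALL) else 0.0 for response in responses]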

Test the LoRA

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

model_name = "Qwen/Qwen2.5-3B-Instruct"

model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Attach the LoRA weights saved by the GRPO run above
model = PeftModel.from_pretrained(model, "grpo_saved_lora")

SYSTEM_PROMPT = """
Respond in the following format:
<reasoning>
...
</reasoning>
<answer>
...
</answer>
"""

prompt = "How many r's are in strawberry?"
messages = [
    {"role": "system", "content": SYSTEM_PROMPT},
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=512
)
# Keep only the newly generated tokens (strip the prompt tokens)
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)
