llama3.1-8B-微调

本文介绍如何使用 unslothai 微调 Llama 3.1-8B。主要步骤包括：3）应用LoRA技术来减少模型训练和推理时的内存使用，同时保持模型性能；4）加载一个数据集，并规范格式；初始化一个 SFTTrainer 实例，用于训练一个语言模型（SFTTrainer 是 trl 库中用于特定任务（例如微调）的训练类），代码配置了训练参数、数据集和模型，并为训练过程定义了一些关键设置；进行模型推理，生成一个给定指令的响应；9）推送模型到hugging-face；10）加载本地模型进行推理。

fc&&fl

1900人浏览 · 2024-07-26 15:09:49

fc&&fl · 2024-07-26 15:09:49 发布

https://github.com/unslothai/unsloth

使用unslothai微调

1）准备环境

%%capture
# Install Unsloth (with its "colab-new" extras) directly from the GitHub repository.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# Install the remaining packages without dependency resolution (--no-deps),
# keeping xformers below 0.0.27 and trl below 0.9.0.
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes

2）加载模型
记得先安装 pyarrow 8.0.0：

# Pin pyarrow to 8.0.0 before loading the model; the article notes that an error occurs otherwise.
!pip install pyarrow==8.0.0

不然会报错

from unsloth import FastLanguageModel
import torch

# Configuration shared by the model-loading calls in this notebook.
max_seq_length = 2048  # Any length can be chosen; RoPE scaling is supported automatically
dtype = None  # None means auto-detect. Use Float16 for Tesla T4 / V100, Bfloat16 for Ampere+
load_in_4bit = True  # Use 4-bit quantization to reduce memory usage. Can be set to False

# Supported pre-quantized 4-bit models, giving 4x faster downloads and no OOM (out-of-memory) issues.
# NOTE(review): this list is informational only; nothing in the visible code reads it.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model, 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # A 4-bit version of the 405b model is uploaded too!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b, 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3, 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3-mini-4k-instruct",          # Phi-3, 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma, 2x faster!
]  # More models at https://huggingface.co/unsloth

# Load the model and its tokenizer using the settings above.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    # token="hf_...",  # Add a token when using a gated model (e.g. meta-llama/Llama-2-7b-hf)
)

3）应用LoRA技术来减少模型训练和推理时的内存使用，同时保持模型性能。

# Rebind `model` to the result of FastLanguageModel.get_peft_model, configured
# with the LoRA settings below.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # Any value greater than 0; 8, 16, 32, 64 or 128 are suggested
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
    lora_dropout=0,  # Any value is supported, but 0 is optimized
    bias="none",    # Any value is supported, but "none" is optimized
    use_gradient_checkpointing="unsloth",  # Set to True or "unsloth" to support very long context
    random_state=3407,
    use_rslora=False,  # Rank-stabilized LoRA is supported
    loftq_config=None,  # As is a LoftQ configuration
)

4）加载一个数据集，并规范格式

# Alpaca-style prompt template. The three "{}" placeholders are filled
# positionally with the instruction, the input and the response.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# End-of-sequence token read from the tokenizer; it is appended to each
# formatted training text.
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples, prompt_template=None, eos_token=None):
    """Render a batch of instruction/input/output rows into training texts.

    Args:
        examples: Mapping holding parallel sequences under the keys
            "instruction", "input" and "output".
        prompt_template: Format string with three positional "{}" slots
            (instruction, input, response). Defaults to the module-level
            ``alpaca_prompt``.
        eos_token: String appended to every rendered text. Defaults to the
            module-level ``EOS_TOKEN``.

    Returns:
        A dict with the single key "text", holding one rendered string per row.
    """
    template = alpaca_prompt if prompt_template is None else prompt_template
    eos = EOS_TOKEN if eos_token is None else eos_token
    rows = zip(examples["instruction"], examples["input"], examples["output"])
    # The EOS token must be appended, otherwise generation will go on forever.
    return {
        "text": [
            template.format(instruction, context, response) + eos
            for instruction, context, response in rows
        ],
    }

from datasets import load_dataset
# Load the "train" split of leo009/mytest and, in the same expression,
# apply formatting_prompts_func to it in batched mode.
dataset = load_dataset("leo009/mytest", split="train").map(
    formatting_prompts_func,
    batched=True,
)

5）初始化一个 SFTTrainer 实例，用于训练语言模型。SFTTrainer 是 trl 库中用于特定任务（例如微调）的训练类。下面的代码配置了训练参数、数据集和模型，并为训练过程定义了一些关键设置。

from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Probe bf16 support once: bf16 is used when available, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

# Optimisation hyper-parameters for the run.
training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=5,
    # Alternative to max_steps: num_train_epochs=1 for one full training run.
    max_steps=60,
    learning_rate=2e-4,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
)

# Supervised fine-tuning trainer over the "text" column of the dataset.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,  # Can make training 5x faster for short sequences.
    args=training_args,
)

6）开始训练

# Run training and keep the value returned by trainer.train().
trainer_stats = trainer.train()

7）使用 FastLanguageModel 进行模型推理，生成一个给定指令的响应。

# Alpaca-style template; the three "{}" slots take the instruction, the
# input and the response, in that order.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# Enable native 2x faster inference before generating.
FastLanguageModel.for_inference(model)

# Fill in the instruction only; the input and response slots stay empty so
# that the model generates the response.
prompt = alpaca_prompt.format("介绍AI超元域频道.", "", "")
inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
# Kept as the final expression so a notebook cell displays the decoded text.
tokenizer.batch_decode(outputs)

8）保存模型

# Save the model to the local "lora_model" directory.
model.save_pretrained("lora_model") # Local saving

# Save the tokenizer into the same "lora_model" directory.
tokenizer.save_pretrained("lora_model")

9）推送模型到hugging-face

# Upload the model and the tokenizer to the "fengn/llama3.1-lora" Hub repository.
# NOTE(review): token is an empty placeholder here — supply your own Hugging Face
# token before running, ideally read from the environment rather than hard-coded.
model.push_to_hub("fengn/llama3.1-lora", token = "") # Online saving
tokenizer.push_to_hub("fengn/llama3.1-lora", token = "") # Online saving

10）加载本地模型进行推理

# Reload model and tokenizer from the local "lora_model" directory, reusing
# the sequence length, dtype and 4-bit settings defined earlier.
from unsloth import FastLanguageModel

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="lora_model",  # YOUR MODEL YOU USED FOR TRAINING
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

# Alpaca-style template; the three "{}" slots take the instruction, the
# input and the response, in that order.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# Only the instruction is provided; the response slot is left blank for
# generation.
prompt = alpaca_prompt.format("AI超元域是谁?", "", "")
inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

from transformers import TextStreamer

# Hand a TextStreamer to generate() as the streamer; the return value of
# generate() is discarded.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=128)

11）合并并保存模型

16-bit 和 4-bit 合并保存: 通过将模型保存为 16-bit 或 4-bit 格式来优化存储和计算效率。
仅 LoRA 适配器保存: 只保存 LoRA 适配器而不是整个模型，这对共享或部署经过特定任务微调的适配器非常有用。

# Export once per save method, in this order: merged 16-bit weights,
# merged 4-bit weights, then LoRA adapters only. Each method is saved locally
# and then pushed before the next one starts.
# Every local save targets the same "model" directory, and every upload goes
# to "hf/model" with an empty token placeholder.
for save_method in ("merged_16bit", "merged_4bit", "lora"):
    model.save_pretrained_merged("model", tokenizer, save_method=save_method)
    model.push_to_hub_merged("hf/model", tokenizer, save_method=save_method, token="")

12）量化

# GGUF exports, each saved locally and then pushed, in this order:
#   1. no explicit quantization_method (labelled 8-bit Q8_0 in the original notes)
#   2. 16-bit GGUF ("f16")
#   3. "q4_k_m"
# Remember to go to https://huggingface.co/settings/tokens for a token,
# and change "hf" to your username.
gguf_variants = (
    {},
    {"quantization_method": "f16"},
    {"quantization_method": "q4_k_m"},
)
for variant in gguf_variants:
    model.save_pretrained_gguf("model", tokenizer, **variant)
    model.push_to_hub_gguf("hf/model", tokenizer, **variant, token="")

# Push several GGUF quantizations in a single call — much faster if you want multiple.
multi_quant_methods = ["q4_k_m", "q8_0", "q5_k_m"]
model.push_to_hub_gguf(
    "hf/model",  # Change hf to your username!
    tokenizer,
    quantization_method=multi_quant_methods,
    token="",  # Get a token at https://huggingface.co/settings/tokens
)