为什么需要微调
现在开源的预训练模型直接使用时,有时很难达到预期效果,因为预训练阶段学到的知识并不包含企业业务的特有数据。微调就是使用特定任务的小规模数据集对模型进行进一步训练,使其适应特定的任务或领域
基于 Hugging Face 的模型进行微调
训练前,使用未进行微调的预训练BERT模型进行情感预测
该类型模型主要用于文本分类、问答系统、文本生成等
# 使用pipeline进行预测
from transformers import pipeline
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
import torch
# Load the bert-base-uncased configuration with the number of labels set to 6.
config = AutoConfig.from_pretrained("bert-base-uncased", num_labels=6)
# Label mapping: translate the generic "LABEL_<id>" names into emotion names.
# The position of each emotion in the tuple is its class id.
label_mapping = {
    f"LABEL_{class_id}": emotion
    for class_id, emotion in enumerate(
        ("sadness", "joy", "love", "anger", "fear", "surprise")
    )
}
# Build the text-classification pipeline from the pretrained checkpoint.
baseline_model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    config=config,
    ignore_mismatched_sizes=True,  # ignore a classification-head size mismatch
)
baseline_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
classifier = pipeline(
    'text-classification',
    model=baseline_model,
    tokenizer=baseline_tokenizer,
    # To use the locally saved model after training, pass these instead:
    # model=AutoModelForSequenceClassification.from_pretrained('./emotion_bert_model'),
    # tokenizer=AutoTokenizer.from_pretrained('./emotion_bert_model'),
)
# Sample texts to classify.
test_samples = [
    "I feel absolutely delighted and overjoyed today!"
]
# Run the classifier on every sample and print its top prediction.
print("\n预测示例:")
for sample in test_samples:
    result = classifier(sample)
    top_prediction = result[0]
    raw_label = top_prediction['label']
    # Generic "LABEL_<id>" names are translated through label_mapping. Any other
    # label (e.g. from a model whose config already carries the emotion names)
    # is printed as-is instead of raising KeyError.
    emotion = label_mapping.get(raw_label, raw_label)
    print(f"文本: {sample}")
    print(f"预测情感: {emotion} (置信度: {top_prediction['score']:.4f})")
    print()

训练代码
import pandas as pd
from datasets import load_dataset
from transformers import AutoTokenizer,DataCollatorWithPadding,AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
# Load the "emotion" dataset.
emotions = load_dataset("emotion")
# Load the BERT tokenizer. It must come from the same checkpoint as the model
# fine-tuned in this script ('bert-base-uncased'); a tokenizer from the cased
# checkpoint has a different vocabulary, so its token ids would not line up
# with the model's pretrained embeddings.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# Integer class id -> emotion name; the tuple position is the class id.
label_mapping = dict(
    enumerate(("sadness", "joy", "love", "anger", "fear", "surprise"))
)
# Preprocessing function.
def preprocess_function(examples):
    """Tokenize the ``text`` field of ``examples`` with truncation enabled."""
    texts = examples['text']
    encoded = tokenizer(texts, truncation=True)
    return encoded
# Readable labels.
def label_int2str(row):
    """Convert integer label ``row`` to its string name via the train split's label feature."""
    label_feature = emotions["train"].features["label"]
    return label_feature.int2str(row)
# Add a "label_name" column holding the readable label (no pandas conversion needed).
def _attach_label_name(example):
    """Return the new ``label_name`` column value for a single example."""
    return {"label_name": label_int2str(example["label"])}


emotions = emotions.map(_attach_label_name, batched=False)
# Apply the tokenizing preprocessing step in batches.
tokenized_dataset = emotions.map(preprocess_function, batched=True)
# Data collator that pads dynamically.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Keep a shuffled 20% subsample (seed 42) of each split.
# To use the full data instead, take tokenized_dataset['train'],
# tokenized_dataset['validation'] and tokenized_dataset['test'] directly.
def _sample_fraction(split, fraction=0.2, seed=42):
    """Shuffle ``split`` with ``seed`` and keep its first ``int(fraction * len(split))`` rows."""
    keep = int(fraction * len(split))
    return split.shuffle(seed=seed).select(range(keep))


small_train_dataset = _sample_fraction(tokenized_dataset["train"])
small_eval_dataset = _sample_fraction(tokenized_dataset["validation"])
small_test_dataset = _sample_fraction(tokenized_dataset["test"])
# Load the BERT model with a 6-label sequence-classification head and attach
# the id <-> emotion-name mappings.
emotion_to_id = {name: idx for idx, name in label_mapping.items()}
model = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=6,
    id2label=label_mapping,
    label2id=emotion_to_id,
)
# Evaluation metrics.
def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 from a ``(logits, labels)`` pair.

    The predicted class of each row is the arg-max over the last axis of the
    logits. Returns a dict with the keys ``accuracy`` and ``f1_macro``.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predicted)
    macro_f1 = f1_score(labels, predicted, average='macro')
    return {'accuracy': accuracy, 'f1_macro': macro_f1}
# Training configuration, collected as keyword arguments and unpacked below.
training_config = {
    'output_dir': './emotion_results',
    # Optimisation
    'num_train_epochs': 1,             # reduced number of training epochs
    'gradient_accumulation_steps': 2,  # simulate a larger batch size
    'learning_rate': 3e-5,             # moderately raised learning rate
    # Evaluation / checkpointing / logging
    # NOTE(review): newer transformers releases appear to rename this argument
    # to `eval_strategy` - confirm against the installed version.
    'evaluation_strategy': 'steps',    # evaluate by step
    # eval_steps is deliberately left unset here (e.g. eval_steps=200 would
    # evaluate every 200 steps).
    'save_strategy': 'steps',
    'logging_steps': 50,
    'report_to': 'none',
}
training_args = TrainingArguments(**training_config)
# Create the Trainer from the model, arguments, subsampled datasets,
# collator and metric function.
trainer_kwargs = {
    'model': model,
    'args': training_args,
    'train_dataset': small_train_dataset,
    'eval_dataset': small_eval_dataset,
    'data_collator': data_collator,
    'compute_metrics': compute_metrics,
}
trainer = Trainer(**trainer_kwargs)
# Start training.
trainer.train()
# Save the model first, then the tokenizer, into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained('./emotion_bert_model')
训练过程
训练后

由于训练样本比较少,置信度(得分)虽然不高,但与训练前相比有明显提升,且预测结果正确
总结
提示词、RAG、微调这三种技术一般不会单独使用,而是根据实际需求组合使用、相互补充



2839

被折叠的 条评论
为什么被折叠?



