目前 llama factory 的训练是在 transformers 模块的基础上实现的,后续如果有一些魔改需求,有必要先了解 transformers 模块的基础源码。
transformers 核心源码梳理
transformers.Trainer部分:
__init__() 初始化函数:
def __init__(xxx):
if args is None:
output_dir = "tmp_trainer"
args = TrainingArguments(output_dir=output_dir)
self.args = args
self.compute_loss_func = compute_loss_func
# Seed must be set before instantiating the model when using model_init.
enable_full_determinism(self.args.seed) if self.args.full_determinism else set_seed(self.args.seed)
self.hp_name = None
self.deepspeed = None
self.is_in_train = False
self.model = model
self.create_accelerator_and_postprocess() # 初始化加速相关参数如: self.accelerator(Accelerator)
self._memory_tracker = TrainerMemoryTracker(self.args.skip_memory_metrics) # CPU/GPU 内存监控器初始化,貌似需要先安装 psutil 模块
self._memory_tracker.start()
# set the correct log level depending on the node
log_level = args.get_process_log_level()
logging.set_verbosity(log_level)
# force device and distributed setup init explicitly
args._setup_devices
if model is None:
if model_init is not None: # 每次调train()生效的模型初始化函数。
self.model_init = model_init
model = self.call_model_init()
xxxxx
if model.__class__.__name__ in MODEL_MAPPING_NAMES: # 模型类名不能是已有的如albert
raise ValueError(xxx)
if getattr(model, "is_parallelizable", False) and getattr(model, "model_parallel", False):
self.is_model_parallel = True
if getattr(model, "hf_device_map", None) is not None:
devices = [device for device in set(model.hf_device_map.values()) if device not in ["cpu", "disk"]]
if len(devices) > 1:
self.is_model_parallel = True
elif len(devices) == 1:
self.is_model_parallel = self.args.device != torch.device(devices[0])
else:
self.is_model_parallel = False
# warn users
if self.is_model_parallel:
logger.info(
"You have loaded a model on multiple GPUs. `is_model_parallel` attribute will be force-set"
" to `True` to avoid any unexpected behavior such as device placement mismatching."
)
if self.args.use_liger_kernel: # Patch the model with liger kernels.
xxx
self.is_fsdp_xla_enabled = args.fsdp_config["xla"] # 核心功能是模型分片后存放各分片到不同的GPU上,而不是默认的一块GPU存放一个完整的模型,FSDP主要是为了解决传统数据并行(如DDP)在训练大模型时显存占用过高的问题
# one place to sort out whether to place the model on device or not
# postpone switching model to cuda when:
# 1. MP - since we are trying to fit a much bigger than 1 gpu model
# 2. fp16-enabled DeepSpeed loads the model in half the size and it doesn't need .to() anyway,
# and we only use deepspeed for training at the moment
# 3. full bf16 or fp16 eval - since the model needs to be cast to the right dtype first
# 4. FSDP - same as MP
if (
self.is_model_parallel
or self.is_deepspeed_enabled
or ((args.fp16_full_eval or args.bf16_full_eval) and not args.do_train)
or self.is_fsdp_xla_enabled
or self.is_fsdp_enabled
):
self.place_model_on_device = False
default_collator = (
DataCollatorWithPadding(processing_class)
if processing_class is not None
and isinstance(processing_class, (PreTrainedTokenizerBase, SequenceFeatureExtractor))
else default_data_collator
)
self.data_collator = data_collator if data_collator is not None else default_collator
self.train_dataset = train_dataset
self.eval_dataset = eval_dataset
self.processing_class = processing_class
xxxx
# later use `self.model is self.model_wrapped` to check if it's wrapped or not
self.model_wrapped = model
self.model = model
self.neftune_noise_alpha = args.neftune_noise_alpha
self.compute_metrics = compute_metrics
self.preprocess_logits_for_metrics = preprocess_logits_for_metrics
self.optimizer, self.lr_scheduler = optimizers
self.optimizer_cls_and_kwargs = optimizer_cls_and_kwargs
default_callbacks = DEFAULT_CALLBACKS + get_reporting_integration_callbacks(self.args.report_to)
callbacks = default_callbacks if callbacks is None else default_callbacks + callbacks
self.callback_handler = CallbackHandler(
callbacks, self.model, self.processing_class, self.optimizer, self.lr_scheduler
)
self.add_callback(PrinterCallback if self.args.disable_tqdm else DEFAULT_PROGRESS_CALLBACK)
# Will be set to True by `self._setup_loggers()` on first call to `self.log()`.
self._loggers_initialized = False
# Label smoothing : 自定义loss计算对齐逻辑等,下面计算自定义的loss时会涉及
if self.args.label_smoothing_factor != 0:
self.label_smoother = LabelSmoother(epsilon=self.args.label_smoothing_factor)
self.control = TrainerControl() # 训练过程控制类,存储一些状态控制参数,与TrainerCallback搭配使用
self.state = TrainerState( # 训练状态记录类,每次梯度更新时生效(gradient_accumulation_steps=1时每个batch后生效一次)
is_local_process_zero=self.is_local_process_zero(),
is_world_process_zero=self.is_world_process_zero(),
stateful_callbacks=[
cb for cb in self.callback_handler.callbacks + [self.control] if isinstance(cb, ExportableState)
],
)
# Internal variable to count flos in each process, will be accumulated in `self.state.total_flos` then
# returned to 0 every time flos need to be logged
self.current_flos = 0
self.hp_search_backend = None
default_label_names = find_labels(self.model.__class__) # 找 label 参数名:检查模型 forward() 的签名中名字包含 "label" 的参数(问答类模型则为 start_positions/end_positions),而不是看 forward() 的返回值
self.label_names = default_label_names if self.args.label_names is None else self.args.label_names
self.can_return_loss = can_return_loss(self.model.__class__) # 检查 forward() 的签名中是否有默认值为 True 的 return_loss 参数,而不是看 forward() 的返回值
self.control = self.callback_handler.on_init_end(self.args, self.state, self.control)
# Internal variables to help with automatic batch size reduction
self._train_batch_size = args.train_batch_size
self._created_lr_scheduler = False
# very last
self._memory_tracker.stop_and_update_metrics()
self.is_fsdp_xla_v2_enabled = args.fsdp_config.get("xla_fsdp_v2", False)
self.is_fsdp_xla_v1_enabled = self.is_fsdp_xla_enabled and not self.is_fsdp_xla_v2_enabled
train()函数:
def train(self,
resume_from_checkpoint: Optional[Union[str, bool]] = None,
trial: Union["optuna.Trial", dict[str, Any], None] = None,
ignore_keys_for_eval: Optional[list[str]] = None,
**kwargs,
): # 内部核心实现代码
self._memory_tracker.start()
args = self.args
self.is_in_train = True
# This might change the seed so needs to run first.
self._hp_search_setup(trial) # 超参搜索相关
# Model re-init 每次train()中重新执行 model_init
model_reloaded = False
if self.model_init is not None:
# Seed must be set before instantiating the model when using model_init.
enable_full_determinism(self.args.seed) if self.args.full_determinism else set_seed(self.args.seed)
self.model = self.call_model_init(trial)
model_reloaded = True
# Reinitializes optimizer and scheduler
self.optimizer, self.lr_scheduler = None, None
# Load potential model checkpoint: 优先级比 model_init 更高
if isinstance(resume_from_checkpoint, bool) and resume_from_checkpoint:
resume_from_checkpoint = get_last_checkpoint(args.output_dir)
if resume_from_checkpoint is not None:
if not is_sagemaker_mp_enabled() and not self.is_deepspeed_enabled and not self.is_fsdp_enabled:
self._load_from_checkpoint(resume_from_checkpoint)
# If model was re-initialized, put it on the right device and update self.model_wrapped
if model_reloaded:
if self.place_model_on_device:
self._move_model_to_device(self.model, args.device)
self.model_wrapped = self.model
inner_training_loop = find_executable_batch_size(self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size)
return inner_training_loop( # 该函数找到显存可承受的batch调用_inner_training_loop()训练
args=args,
resume_from_checkpoint=resume_from_checkpoint,
trial=trial,
ignore_keys_for_eval=ignore_keys_for_eval,
)
_inner_training_loop()函数:
def _inner_training_loop(
self, batch_size=None, args=None, resume_from_checkpoint=None, trial=None, ignore_keys_for_eval=None
): # 核心代码展开
self.accelerator.free_memory()
self._train_batch_size = batch_size
train_dataloader = self.get_train_dataloader() # 数据加载封装-含随机打散,需要时覆盖重写
total_train_batch_size = self._train_batch_size * args.gradient_accumulation_steps * args.world_size
(num_train_epochs, num_update_steps_per_epoch, xxx) = self.set_initial_training_values(args, train_dataloader, total_train_batch_size) # 确定训练规模相关的数值:训练的 epoch 数、每个 epoch 的更新步数、max_steps 等
num_train_tokens = None # 记录总训练的token数量
if self.is_deepspeed_enabled:
self.optimizer, self.lr_scheduler = deepspeed_init(self, num_training_steps=max_steps)
self.state = TrainerState(xxx) # 记录当前训练状态
if args.gradient_checkpointing:
self.model.gradient_checkpointing_enable(gradient_checkpointing_kwargs=args.gradient_checkpointing_kwargs)
model = self._wrap_model(self.model_wrapped)
use_accelerator_prepare = True if model is self.model else False
if delay_optimizer_creation:
xxx
self.create_optimizer_and_scheduler(num_training_steps=max_steps) # 设置优化器和lr调度器
# prepare using `accelerator` prepare
if use_accelerator_prepare:
self.model.train()
xxx
elif self.args.optim in [OptimizerNames.LOMO, OptimizerNames.ADALOMO]: # In this case we are in DDP + LOMO, which should be supported
self.optimizer = self.accelerator.prepare(self.optimizer)
# Check if saved optimizer or scheduler states exist
self._load_optimizer_and_scheduler(resume_from_checkpoint)
self._load_scaler(resume_from_checkpoint)
logger.info("***** Running training *****")
# Check if continuing training from a checkpoint
if resume_from_checkpoint is not None and os.path.isfile(os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME)  # TRAINER_STATE_NAME = "trainer_state.json"
):
self.state = TrainerState.load_from_json(os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME))
self.compare_trainer_and_checkpoint_args(self.args, self.state)
self._load_callback_state() # 添加回调到 self.callback_handler
xxx
self.control = self.callback_handler.on_train_begin(args, self.state, self.control) # 训练开始前
for epoch in range(epochs_trained, num_train_epochs):
epoch_dataloader = train_dataloader
xxx
self.control = self.callback_handler.on_epoch_begin(args, self.state, self.control) # epoch 开始前
total_updates = steps_in_epoch // args.gradient_accumulation_steps + 1
if args.gradient_accumulation_steps == 1:
total_updates -= 1
for _ in range(total_updates):
update_step += 1
num_batches = args.gradient_accumulation_steps if update_step != (total_updates - 1) else remainder
batch_samples, num_items_in_batch = self.get_batch_samples(epoch_iterator, num_batches, args.device)
for i, inputs in enumerate(batch_samples):
step += 1
do_sync_step = (step + 1) % args.gradient_accumulation_steps == 0 or (step + 1) == steps_in_epoch
# Since we perform prefetching, we need to manually set sync_gradients
self.accelerator.gradient_state._set_sync_gradients(do_sync_step)
xxx
if step % args.gradient_accumulation_steps == 0:
self.control = self.callback_handler.on_step_begin(args, self.state, self.control)
with context():
tr_loss_step = self.training_step(model, inputs, num_items_in_batch) # 一次训练
if xxx:
tr_loss = tr_loss + tr_loss_step
if do_sync_step:
self.control = self.callback_handler.on_pre_optimizer_step(args, self.state, self.control)
self.optimizer.step()
self.control = self.callback_handler.on_optimizer_step(args, self.state, self.control)
# get learning rate before update
learning_rate = self._get_learning_rate()
self.control = self.callback_handler.on_step_end(args, self.state, self.control)
self._maybe_log_save_evaluate(xxx)
if self.control.should_training_stop:
break
xxx
logger.info("\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n")
if args.load_best_model_at_end and self.state.best_model_checkpoint is not None:
# Wait for everyone to get here so we are sure the model has been saved by process 0.
xxx
self._load_best_model()
# add remaining tr_loss
self._total_loss_scalar += tr_loss.item()
effective_global_step = max(self.state.global_step, 0.001) # Avoid ZeroDivisionError
train_loss = self._total_loss_scalar / effective_global_step
training_step() 函数:
## Perform a training step on a batch of inputs. 必要时重载覆盖重写
def training_step(self, model: nn.Module, inputs: dict[str, Union[torch.Tensor, Any]], num_items_in_batch=None):
model.train() # 调起训练
if hasattr(self.optimizer, "train") and callable(self.optimizer.train):
self.optimizer.train()
inputs = self._prepare_inputs(inputs) # 将输入k:v转化为训练的tensor
if is_sagemaker_mp_enabled(): #"partitions":模型并行时不成立。 返回的 _smdistributed_available 的值:计算是否配置了smdistributed
# 优化后直接返回,不涉及调用 compute_loss()。
loss_mb = smp_forward_backward(model, inputs, self.args.gradient_accumulation_steps)
return loss_mb.reduce_mean().detach().to(self.args.device)
with self.compute_loss_context_manager(): # group together context managers.
loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch) # 内部的函数计算loss
if self.args.n_gpu > 1:
loss = loss.mean() # mean() to average on multi-gpu parallel training
if self.use_apex:
with amp.scale_loss(loss, self.optimizer) as scaled_loss:
scaled_loss.backward()
else:
self.accelerator.backward(loss, **kwargs)
return loss.detach()
compute_loss() 函数:
# Defines how the loss is computed by Trainer. By default, all models return the loss in the first element.
def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
if (self.label_smoother is not None or self.compute_loss_func is not None) and "labels" in inputs:
labels = inputs.pop("labels") # 决定下面是否计算loss
xxx
outputs = model(**inputs)
if labels is not None: # 重新计算loss
if self.compute_loss_func is not None:
loss = self.compute_loss_func(outputs, labels, num_items_in_batch=num_items_in_batch) # 调用自定义的 loss func
elif model_name in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values():
loss = self.label_smoother(outputs, labels, shift_labels=True)
else:
loss = self.label_smoother(outputs, labels)
else:
loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]
if (xxx):
loss *= self.accelerator.num_processes
return (loss, outputs) if return_outputs else loss



205

被折叠的 条评论
为什么被折叠?



