LLM基础之transformers源码一

原创已于 2025-04-27 21:38:00 修改 · 511 阅读 ·

本内容遵循CC 4.0 BY-SA版权协议

GEO检测

stund

关注

标签

#机器学习 #python #人工智能

分类人工智能

于 2025-04-26 20:40:42 首次发布

目前 LLaMA-Factory 的训练是在 transformers 库的基础上实现的，后续如果有魔改需求，有必要先了解 transformers 的核心源码。

transformers 核心源码梳理

transformers.Trainer部分：

`__init__()` 初始化函数：


    def __init__(xxx):
        """Set up the trainer (abridged excerpt; `xxx` lines mark elided upstream code).

        In the order shown here, the constructor: falls back to default
        `TrainingArguments`, seeds the RNGs, creates the accelerator, resolves the
        model (optionally through `model_init`), decides `is_model_parallel` and
        `place_model_on_device`, picks the data collator, registers the callbacks and
        finally creates the `TrainerControl` / `TrainerState` objects.

        NOTE(review): the real parameter list is elided as `xxx`. The body reads
        `model`, `args`, `data_collator`, `train_dataset`, `eval_dataset`,
        `processing_class`, `model_init`, `compute_loss_func`, `compute_metrics`,
        `callbacks`, `optimizers`, `optimizer_cls_and_kwargs` and
        `preprocess_logits_for_metrics` — presumably all constructor parameters;
        confirm against upstream.
        """
        if args is None:
            output_dir = "tmp_trainer"
            args = TrainingArguments(output_dir=output_dir)
        self.args = args
        self.compute_loss_func = compute_loss_func
        # Seed must be set before instantiating the model when using model
        enable_full_determinism(self.args.seed) if self.args.full_determinism else set_seed(self.args.seed)

        self.hp_name = None
        self.deepspeed = None
        self.is_in_train = False
        self.model = model
        # NOTE(review): presumably creates the acceleration-related attributes such as
        # self.accelerator (an Accelerator) — confirm.
        self.create_accelerator_and_postprocess()

        # CPU/GPU memory tracker.
        # NOTE(review): per the article author this seems to require `psutil` — confirm.
        self._memory_tracker = TrainerMemoryTracker(self.args.skip_memory_metrics)
        self._memory_tracker.start()

        # set the correct log level depending on the node
        log_level = args.get_process_log_level()
        logging.set_verbosity(log_level)

        # force device and distributed setup init explicitly
        # (bare attribute access; presumably a property with side effects — confirm)
        args._setup_devices

        if model is None:
            if model_init is not None:
                # model_init is kept so that train() can re-create the model on every call.
                self.model_init = model_init
                model = self.call_model_init()
        xxxxx

        # Reject models whose class name is listed in MODEL_MAPPING_NAMES.
        # NOTE(review): presumably these are bare base models without a task head — confirm.
        if model.__class__.__name__ in MODEL_MAPPING_NAMES:
            raise ValueError(xxx)

        if getattr(model, "is_parallelizable", False) and getattr(model, "model_parallel", False):
            self.is_model_parallel = True
        else:
            # Always define the attribute: it is read unconditionally further down.
            self.is_model_parallel = False

        if getattr(model, "hf_device_map", None) is not None:
            devices = [device for device in set(model.hf_device_map.values()) if device not in ["cpu", "disk"]]
            if len(devices) > 1:
                self.is_model_parallel = True
            elif len(devices) == 1:
                self.is_model_parallel = self.args.device != torch.device(devices[0])
            else:
                self.is_model_parallel = False

            # warn users
            if self.is_model_parallel:
                logger.info(
                    "You have loaded a model on multiple GPUs. `is_model_parallel` attribute will be force-set"
                    " to `True` to avoid any unexpected behavior such as device placement mismatching."
                )

        if self.args.use_liger_kernel:  # Patch the model with liger kernels.
            xxx

        # Author's note: FSDP shards the model and places the shards on different GPUs
        # instead of keeping one full replica per GPU; it targets the high memory usage
        # of classic data parallelism (e.g. DDP) when training large models.
        self.is_fsdp_xla_enabled = args.fsdp_config["xla"]

        # one place to sort out whether to place the model on device or not
        # postpone switching model to cuda when:
        # 1. MP - since we are trying to fit a much bigger than 1 gpu model
        # 2. fp16-enabled DeepSpeed loads the model in half the size and it doesn't need .to() anyway,
        #    and we only use deepspeed for training at the moment
        # 3. full bf16 or fp16 eval - since the model needs to be cast to the right dtype first
        # 4. FSDP - same as MP
        # Start from the TrainingArguments default so the attribute exists on every path
        # (train() reads it after re-initialising the model).
        self.place_model_on_device = args.place_model_on_device
        if (
            self.is_model_parallel
            or self.is_deepspeed_enabled
            or ((args.fp16_full_eval or args.bf16_full_eval) and not args.do_train)
            or self.is_fsdp_xla_enabled
            or self.is_fsdp_enabled
        ):
            self.place_model_on_device = False

        # Pad with the processing class when it is a tokenizer / feature extractor,
        # otherwise fall back to the generic default collator.
        default_collator = (
            DataCollatorWithPadding(processing_class)
            if processing_class is not None
            and isinstance(processing_class, (PreTrainedTokenizerBase, SequenceFeatureExtractor))
            else default_data_collator
        )
        self.data_collator = data_collator if data_collator is not None else default_collator
        self.train_dataset = train_dataset
        self.eval_dataset = eval_dataset
        self.processing_class = processing_class

        xxxx

        # later use `self.model is self.model_wrapped` to check if it's wrapped or not
        self.model_wrapped = model
        self.model = model

        self.neftune_noise_alpha = args.neftune_noise_alpha

        self.compute_metrics = compute_metrics
        self.preprocess_logits_for_metrics = preprocess_logits_for_metrics
        self.optimizer, self.lr_scheduler = optimizers
        self.optimizer_cls_and_kwargs = optimizer_cls_and_kwargs

        # User callbacks are appended after the defaults and the reporting integrations.
        default_callbacks = DEFAULT_CALLBACKS + get_reporting_integration_callbacks(self.args.report_to)
        callbacks = default_callbacks if callbacks is None else default_callbacks + callbacks
        self.callback_handler = CallbackHandler(
            callbacks, self.model, self.processing_class, self.optimizer, self.lr_scheduler
        )
        self.add_callback(PrinterCallback if self.args.disable_tqdm else DEFAULT_PROGRESS_CALLBACK)

        # Will be set to True by `self._setup_loggers()` on first call to `self.log()`.
        self._loggers_initialized = False

        # Label smoothing: consumed later by compute_loss(), which tests
        # `self.label_smoother is not None`, so the attribute must always exist.
        if self.args.label_smoothing_factor != 0:
            self.label_smoother = LabelSmoother(epsilon=self.args.label_smoothing_factor)
        else:
            self.label_smoother = None

        # Training-control object; the callback handler receives it and returns the
        # updated value on every event.
        self.control = TrainerControl()

        # Training-state record.
        # NOTE(review): per the article author it is updated once per optimizer step
        # (i.e. after every batch when gradient_accumulation_steps == 1) — confirm.
        self.state = TrainerState(
            is_local_process_zero=self.is_local_process_zero(),
            is_world_process_zero=self.is_world_process_zero(),
            stateful_callbacks=[
                cb for cb in self.callback_handler.callbacks + [self.control] if isinstance(cb, ExportableState)
            ],
        )
        # Internal variable to count flos in each process, will be accumulated in `self.state.total_flos` then
        # returned to 0 every time flos need to be logged
        self.current_flos = 0
        self.hp_search_backend = None
        # NOTE(review): presumably infers the label argument names from the model
        # class's forward() — confirm.
        default_label_names = find_labels(self.model.__class__)
        self.label_names = default_label_names if self.args.label_names is None else self.args.label_names
        # NOTE(review): presumably tells whether the model's forward() supports
        # `return_loss` — confirm.
        self.can_return_loss = can_return_loss(self.model.__class__)
        self.control = self.callback_handler.on_init_end(self.args, self.state, self.control)

        # Internal variables to help with automatic batch size reduction
        self._train_batch_size = args.train_batch_size
        self._created_lr_scheduler = False

        # very last
        self._memory_tracker.stop_and_update_metrics()

        self.is_fsdp_xla_v2_enabled = args.fsdp_config.get("xla_fsdp_v2", False)
        self.is_fsdp_xla_v1_enabled = self.is_fsdp_xla_enabled and not self.is_fsdp_xla_v2_enabled

train()函数：

def train(
    self,
    resume_from_checkpoint: Optional[Union[str, bool]] = None,
    trial: Union["optuna.Trial", dict[str, Any], None] = None,
    ignore_keys_for_eval: Optional[list[str]] = None,
    **kwargs,
):
    """Main training entry point (abridged excerpt of the core logic).

    Args:
        resume_from_checkpoint: Checkpoint directory to resume from. When `True`,
            the last checkpoint found in `args.output_dir` is used.
        trial: Hyper-parameter search trial; forwarded to `_hp_search_setup`,
            `call_model_init` and the inner training loop.
        ignore_keys_for_eval: Forwarded unchanged to the inner training loop.
        **kwargs: Not used in this excerpt.

    Returns:
        Whatever the (batch-size-searching wrapper around the) inner training loop
        returns.
    """
    self._memory_tracker.start()
    args = self.args
    self.is_in_train = True

    # This might change the seed so needs to run first.
    self._hp_search_setup(trial)  # hyper-parameter search setup

    # Model re-init: model_init is executed again on every train() call.
    model_reloaded = False
    if self.model_init is not None:
        # Seed must be set before instantiating the model when using model_init.
        enable_full_determinism(self.args.seed) if self.args.full_determinism else set_seed(self.args.seed)
        self.model = self.call_model_init(trial)
        model_reloaded = True
        # Reinitializes optimizer and scheduler
        self.optimizer, self.lr_scheduler = None, None

    # Load potential model checkpoint. This runs after model_init, so checkpoint
    # weights take precedence over the freshly initialised model.
    if isinstance(resume_from_checkpoint, bool) and resume_from_checkpoint:
        resume_from_checkpoint = get_last_checkpoint(args.output_dir)
    if resume_from_checkpoint is not None:
        if not is_sagemaker_mp_enabled() and not self.is_deepspeed_enabled and not self.is_fsdp_enabled:
            self._load_from_checkpoint(resume_from_checkpoint)

    # If model was re-initialized, put it on the right device and update self.model_wrapped
    if model_reloaded:
        if self.place_model_on_device:
            self._move_model_to_device(self.model, args.device)
        self.model_wrapped = self.model

    # Author's note: find_executable_batch_size looks for a batch size that fits in
    # memory and calls _inner_training_loop() with it.
    inner_training_loop = find_executable_batch_size(
        self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size
    )
    return inner_training_loop(
        args=args,
        resume_from_checkpoint=resume_from_checkpoint,
        trial=trial,
        ignore_keys_for_eval=ignore_keys_for_eval,
    )

_inner_training_loop()函数：

def _inner_training_loop(
    self, batch_size=None, args=None, resume_from_checkpoint=None, trial=None, ignore_keys_for_eval=None
):
    """Run the actual epoch / update-step / micro-batch loops (abridged excerpt).

    `xxx` lines mark upstream code elided by the article. Several names used below
    (`max_steps`, `delay_optimizer_creation`, `epochs_trained`, `steps_in_epoch`,
    `epoch_iterator`, `update_step`, `step`, `remainder`, `context`, `tr_loss`) are
    defined in those elided parts.

    Args:
        batch_size: Per-device batch size chosen by the caller; stored in
            `self._train_batch_size`.
        args: Training arguments.
        resume_from_checkpoint: Checkpoint directory used to restore optimizer,
            scheduler, scaler and trainer state, or `None`.
        trial: Hyper-parameter search trial (not used in the shown lines).
        ignore_keys_for_eval: Not used in the shown lines.
    """
    self.accelerator.free_memory()
    self._train_batch_size = batch_size

    # Data loading wrapper (includes shuffling per the article author); override when needed.
    train_dataloader = self.get_train_dataloader()
    total_train_batch_size = self._train_batch_size * args.gradient_accumulation_steps * args.world_size
    # Resolves the concrete amount of training (epochs, update steps per epoch, ...).
    (num_train_epochs, num_update_steps_per_epoch, xxx) = self.set_initial_training_values(
        args, train_dataloader, total_train_batch_size
    )
    num_train_tokens = None  # total number of training tokens

    if self.is_deepspeed_enabled:
        self.optimizer, self.lr_scheduler = deepspeed_init(self, num_training_steps=max_steps)

    self.state = TrainerState(xxx)  # fresh record of the current training state

    if args.gradient_checkpointing:
        self.model.gradient_checkpointing_enable(gradient_checkpointing_kwargs=args.gradient_checkpointing_kwargs)
    model = self._wrap_model(self.model_wrapped)
    # If _wrap_model returned the model unchanged, preparation is left to the accelerator.
    use_accelerator_prepare = True if model is self.model else False

    if delay_optimizer_creation:
        xxx
        # Create the optimizer and the learning-rate scheduler.
        self.create_optimizer_and_scheduler(num_training_steps=max_steps)
    # prepare using `accelerator` prepare
    if use_accelerator_prepare:
        self.model.train()
        xxx
    elif self.args.optim in [OptimizerNames.LOMO, OptimizerNames.ADALOMO]:
        # In this case we are in DDP + LOMO, which should be supported
        self.optimizer = self.accelerator.prepare(self.optimizer)

    # Check if saved optimizer or scheduler states exist
    self._load_optimizer_and_scheduler(resume_from_checkpoint)
    self._load_scaler(resume_from_checkpoint)
    logger.info("***** Running training *****")

    # Check if continuing training from a checkpoint
    # (per the article, TRAINER_STATE_NAME is "trainer_state.json")
    if resume_from_checkpoint is not None and os.path.isfile(
        os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME)
    ):
        self.state = TrainerState.load_from_json(os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME))
        self.compare_trainer_and_checkpoint_args(self.args, self.state)
        self._load_callback_state()  # restores callbacks into self.callback_handler
    xxx
    self.control = self.callback_handler.on_train_begin(args, self.state, self.control)  # before training starts
    for epoch in range(epochs_trained, num_train_epochs):
        epoch_dataloader = train_dataloader
        xxx
        self.control = self.callback_handler.on_epoch_begin(args, self.state, self.control)  # before each epoch
        # One update per full accumulation group plus one for the trailing group;
        # with accumulation == 1 the extra update is dropped again.
        total_updates = steps_in_epoch // args.gradient_accumulation_steps + 1
        if args.gradient_accumulation_steps == 1:
            total_updates -= 1

        for _ in range(total_updates):
            update_step += 1
            # The last update of the epoch takes `remainder` batches instead of a full group.
            num_batches = args.gradient_accumulation_steps if update_step != (total_updates - 1) else remainder
            batch_samples, num_items_in_batch = self.get_batch_samples(epoch_iterator, num_batches, args.device)
            for i, inputs in enumerate(batch_samples):
                step += 1
                # Synchronise gradients on the last micro-batch of a group or of the epoch.
                do_sync_step = (step + 1) % args.gradient_accumulation_steps == 0 or (step + 1) == steps_in_epoch
                # Since we perform prefetching, we need to manually set sync_gradients
                self.accelerator.gradient_state._set_sync_gradients(do_sync_step)

                xxx

                if step % args.gradient_accumulation_steps == 0:
                    self.control = self.callback_handler.on_step_begin(args, self.state, self.control)
                with context():
                    tr_loss_step = self.training_step(model, inputs, num_items_in_batch)  # one training step

                if xxx:
                    tr_loss = tr_loss + tr_loss_step

                if do_sync_step:
                    self.control = self.callback_handler.on_pre_optimizer_step(args, self.state, self.control)
                    self.optimizer.step()
                    self.control = self.callback_handler.on_optimizer_step(args, self.state, self.control)
                    # get learning rate before update
                    learning_rate = self._get_learning_rate()
                    self.control = self.callback_handler.on_step_end(args, self.state, self.control)
                    self._maybe_log_save_evaluate(xxx)

        if self.control.should_training_stop:
            break
    xxx

    logger.info("\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n")
    if args.load_best_model_at_end and self.state.best_model_checkpoint is not None:
        # Wait for everyone to get here so we are sure the model has been saved by process 0.
        xxx
        self._load_best_model()

    # add remaining tr_loss
    self._total_loss_scalar += tr_loss.item()
    effective_global_step = max(self.state.global_step, 0.001)  # Avoid ZeroDivisionError
    train_loss = self._total_loss_scalar / effective_global_step

training_step() 函数：

# Perform a training step on a batch of inputs. 如需自定义单步训练逻辑，可在子类中重写该方法。
    def training_step(self, model: nn.Module, inputs: dict[str, Union[torch.Tensor, Any]], num_items_in_batch=None):
        """Run forward and backward on one batch and return the detached loss.

        Args:
            model: The model to train; it is switched to train mode first.
            inputs: Batch of model inputs; passed through `self._prepare_inputs`.
            num_items_in_batch: Forwarded to `compute_loss`.

        Returns:
            The loss detached from the graph. On the SageMaker model-parallel path
            this is the mean micro-batch loss moved to `self.args.device`.
        """
        model.train()  # switch to training mode
        if hasattr(self.optimizer, "train") and callable(self.optimizer.train):
            self.optimizer.train()

        # NOTE(review): presumably converts the key/value inputs into tensors ready
        # for training — confirm.
        inputs = self._prepare_inputs(inputs)
        if is_sagemaker_mp_enabled():
            # SageMaker model-parallel path: the helper performs forward and backward
            # itself, so compute_loss() is not called here.
            loss_mb = smp_forward_backward(model, inputs, self.args.gradient_accumulation_steps)
            return loss_mb.reduce_mean().detach().to(self.args.device)

        with self.compute_loss_context_manager():  # group together context managers.
            loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch)

        if self.args.n_gpu > 1:
            loss = loss.mean()  # mean() to average on multi-gpu parallel training

        # Extra keyword arguments for accelerator.backward().
        # NOTE(review): the excerpt elides the upstream code that may fill this dict — confirm.
        kwargs = {}

        if self.use_apex:
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            self.accelerator.backward(loss, **kwargs)

        # Return on every path: the training loop adds this value to the running loss.
        return loss.detach()

compute_loss() 函数：

    # defined How the loss is computed by Trainer. By default, all models return the loss in the first element.
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the training loss for one batch (abridged excerpt).

        When a label smoother or a custom `compute_loss_func` is configured and the
        batch contains "labels", the labels are removed from `inputs` before the
        forward pass and the loss is recomputed from the outputs. Otherwise the loss
        is read from the model outputs (key "loss" for dicts, first element
        otherwise).

        Args:
            model: Model called as `model(**inputs)`.
            inputs: Batch dict; "labels" may be popped from it (mutated in place).
            return_outputs: When true, return `(loss, outputs)` instead of `loss`.
            num_items_in_batch: Forwarded to `compute_loss_func`.

        Returns:
            `loss`, or `(loss, outputs)` when `return_outputs` is true.
        """
        if (self.label_smoother is not None or self.compute_loss_func is not None) and "labels" in inputs:
            labels = inputs.pop("labels")  # popped labels trigger the loss recomputation below
        else:
            # Keep `labels` bound so the check after the forward pass cannot fail.
            labels = None
        xxx
        outputs = model(**inputs)
        if labels is not None:  # recompute the loss from the outputs
            if self.compute_loss_func is not None:
                # user-supplied loss function
                loss = self.compute_loss_func(outputs, labels, num_items_in_batch=num_items_in_batch)
            elif model_name in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values():
                # causal LM: labels are shifted inside the smoother
                loss = self.label_smoother(outputs, labels, shift_labels=True)
            else:
                loss = self.label_smoother(outputs, labels)
        else:
            loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]
        if (xxx):
            loss *= self.accelerator.num_processes
        return (loss, outputs) if return_outputs else loss