缺页异常处理-ebpf统计缺页时间

原创

已于 2024-04-22 09:31:55 修改 · 1.1k 阅读

标签

#linux

于 2023-10-24 10:25:30 首次发布

缺页异常处理

缺页异常如何引起？

当一个进程执行时，如果cpu访问到一个有效的虚地址，但是此地址对应的物理页没有在内存中，那么cpu就会产生缺页异常。

可能引起的情况：

（1）访问用户栈时，超出了当前用户栈的范围，需要扩大用户栈

（2）当进程申请虚拟内存区域的时候，没有分配物理页，进程第一次访问的时候触发页错误异常

（3）内存不足的时候，内核通过交换机制把进程的匿名页换出到交换区，进程再次访问这些页时触发缺页异常

（4）一个文件页被映射到进程的虚拟地址空间，内存不足时，内核回收这个文件页，并在进程的页表中删除这个文件页的映射，进程再次访问时触发缺页异常

（5）程序错误：访问没有分配给进程的虚拟内存区域，内核将会发出SIGSEGV信号将进程杀死。

源码分析5.0版本

缺页异常处理首先由do_page_fault()函数读取发生缺页的虚拟地址，然后查找包含该地址的虚拟内存区域（vma）；如果没有找到，说明访问了非法虚地址，将会发出SIGSEGV信号终止当前进程。否则进行缺页类型检查，如果属于地址越界或者段错误，同样终止当前进程。

do_page_fault

路径：arch/x86/mm/fault.c，处理页面异常

/*
 * do_page_fault() - top-level page-fault handler quoted from
 * arch/x86/mm/fault.c.
 *
 * @regs:       pointer to the struct pt_regs of the faulting context
 *              (presumably the saved register state -- confirm)
 * @error_code: page-fault error code; forwarded unchanged to the tracer
 *              and to __do_page_fault()
 *
 * Reads the faulting address from CR2, brackets the real work with
 * exception_enter()/exception_exit(), optionally emits a trace event,
 * and delegates the actual handling to __do_page_fault().
 *
 * NOTE(review): the return type and any function attributes are not
 * shown in this excerpt.
 */
do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
   
   // Handle the page fault
	unsigned long address = read_cr2(); // faulting address is taken from the CR2 register (x86 hardware behaviour); presumably read first so nothing later can overwrite it -- confirm
	enum ctx_state prev_state;// value returned by exception_enter(), handed back to exception_exit()

	prev_state = exception_enter();// enter exception context and remember the previous context state
	if (trace_pagefault_enabled())// only when page-fault tracing is enabled
		trace_page_fault_entries(address, regs, error_code);// record address, registers and error code for the tracer

	__do_page_fault(regs, error_code, address);// do the actual fault handling
	exception_exit(prev_state);// leave exception context using the saved state
}

__do_page_fault

处理缺页异常的具体函数：根据缺页地址位于内核地址空间还是用户地址空间，分别调用do_kern_addr_fault()或do_user_addr_fault()进行处理

/*
 * __do_page_fault() - dispatch a page fault to the kernel-address or
 * user-address handler.
 *
 * @regs:          pointer to the struct pt_regs of the faulting context
 * @hw_error_code: page-fault error code, forwarded to the chosen handler
 * @address:       faulting virtual address (read from CR2 by the caller)
 *
 * Returns early if kmmio_fault() reports a non-zero result; otherwise
 * calls do_kern_addr_fault() when fault_in_kernel_space(address) is true
 * and do_user_addr_fault() when it is not.
 *
 * NOTE(review): the return type and any function attributes are not
 * shown in this excerpt.
 */
__do_page_fault(struct pt_regs *regs, unsigned long hw_error_code,
		unsigned long address)
{
   
   
	prefetchw(&current->mm->mmap_sem);// prefetch-for-write hint on the current mm's mmap_sem; NOTE(review): this is a prefetch call, not a lock acquisition -- presumably issued because the semaphore is taken later in the fault path

	if (unlikely(kmmio_fault(regs, address)))// NOTE(review): kmmio_fault() is not shown here; presumably the MMIO-tracing hook -- a non-zero result ends handling of this fault
		return;

	/* Was the fault on kernel-controlled part of the address space? */
	if (unlikely(fault_in_kernel_space(address)))// faulting address lies in the kernel-controlled part of the address space
		do_kern_addr_fault(regs, hw_error_code, address);// handle the kernel-address fault
	else
		do_user_addr_fault(regs, hw_error_code, address);// handle the user-address fault
}

do_user_addr_fault

处理用户空间的缺页异常

void do_user_addr_fault(struct pt_regs *regs,
			unsigned long hw_error_code,
			unsigned long address)
{
   
   
	struct vm_area_struct *vma;
	struct task_struct *tsk;
	struct mm_struct *mm;
	vm_fault_t fault, major = 0;//记录页错误
	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;//页错误标志

	tsk = current;
	mm = tsk->mm;

	/* kprobes don't want to hook the spurious faults: */
	if (unlikely(kprobes_fault(regs)))//kprobes模块出错
		return;

	/*
	 * Reserved bits are never expected to be set on
	 * entries in the user portion of the page tables.
	 */
	if (unlikely(hw_error_code & X86_PF_RSVD))//硬件错误代码
		pgtable_bad(regs, hw_error_code, address);

	/*
	 * If SMAP is on, check for invalid kernel (supervisor) access to user
	 * pages in the user address space.  The odd case here is WRUSS,
	 * which, according to the preliminary documentation, does not respect
	 * SMAP and will have the USER bit set so, in all cases, SMAP
	 * enforcement appears to be consistent with the USER bit.
	 */
	if (unlikely(cpu_feature_enabled(X86_FEATURE_SMAP) &&
		     !(hw_error_code & X86_PF_USER) &&
		     !(regs->flags & X86_EFLAGS_AC)))
	{
   
   //smap开启且错误不是在用户模式下且寄存器编制为没有设置对齐标志
		bad_area_nosemaphore(regs, hw_error_code, address);//传入页错误寄存器信息、硬件错误码、页错误地址
		return;
	}

	/*
	 * If we're in an interrupt, have no user context or are running
	 * in a region with pagefaults disabled then we must not take the fault
	 */
	if (unlikely(faulthandler_disabled() || !mm)) {
   
   //faulthandle被禁用不能处理页错误或者当前mm为null
		bad_area_nosemaphore(regs, hw_error_code, address);//处理页错误
		return;
	}

	/*
	 * It's safe to allow irq's after cr2 has been saved and the
	 * vmalloc fault has been handled.
	 *
	 * User-mode registers count as a user access even for any
	 * potential system fault or CPU buglet:
	 */
	if (user_mode(regs)) {
   
   //是否在用户态下
		local_irq_enable();//cpu可以响应外部中断
		flags |= FAULT_FLAG_USER;//页错误添加用户标志
	} else {
   
   //内核模式
		if (regs->flags & X86_EFLAGS_IF)//检查寄存器是否开启中断标志位
			local_irq_enable();//启动中断
	}

	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);

	if (hw_error_code & X86_PF_WRITE)//错误是写入内存引起
		flags |= FAULT_FLAG_WRITE;
	if (hw_error_code & X86_PF_INSTR)//错误是执行指令引起
		flags |= FAULT_FLAG_INSTRUCTION;

#ifdef CONFIG_X86_64
	/*
	 * Instruction fetch faults in the vsyscall page might need
	 * emulation.  The vsyscall page is at a high address
	 * (>PAGE_OFFSET), but is considered to be part of the user
	 * address space.
	 *
	 * The vsyscall page does not have a "real" VMA, so do this
	 * emulation before we go searching for VMAs.
	 */
	if ((hw_error_code & X86_PF_INSTR) && is_vsyscall_vaddr(address)) {
   
   
		if (emulate_vsyscall(regs, address))//页错误地址是在vsyscall虚拟地址范围内，emulate_vsyscall系统调用传入寄存器信息以及页错误地址
			return;
	}
#endif

	/*
	 * Kernel-mode access to the user address space should only occur
	 * on well-defined single instructions listed in the exception
	 * tables.  But, an erroneous kernel fault occurring outside one of
	 * those areas which also holds mmap_sem might deadlock attempting
	 * to validate the fault against the address space.
	 *
	 * Only do the expensive exception table search when we might be at
	 * risk of a deadlock.  This happens if we
	 * 1. Failed to acquire mmap_sem, and
	 * 2. The access did not originate in userspace.
	 */////尝试获取mmap_sem的读锁，这是一个保护内存映射区域的信号量。 
	if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
   
   
		if (!user_mode(regs) && !search_exception_tables(regs->ip)) {
   
   //不在用户模式下且没有在异常表中找到对应的处理函数 
			/*
			 * Fault from code in kernel from
			 * which we do not expect faults.
			 */
			bad_area_nosemaphore(regs, hw_error_code, address

最低0.47元/天解锁文章