Preparation

Kernel version: 4.20.1

Memory in the Linux kernel is finite, and to use it as fully as possible the kernel adopts paging: a process is given a virtual address space, that virtual address space is mapped onto real physical memory, and the process's data pages are not all loaded into physical memory up front. Only when a page the process accesses is missing from physical memory does the demand-paging mechanism load it in.

From the kernel implementation of mmap we saw that mmap merely sets up the process's virtual address area; it does not populate physical memory. When an accessed data page is not in physical memory, a page fault is triggered and demand paging takes over. At the hardware level, when the CPU touches a data page that is not in physical memory, it raises a page-fault exception and notifies the kernel to handle it.
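
A small userspace sketch (illustrative; the file name and structure are mine, not part of the kernel excerpts below) makes this observable: memory obtained from mmap costs no physical pages until it is first touched, and every first touch is counted as a minor fault.

/*
 * demand_paging.c - touch freshly mmap'ed pages and count the minor faults.
 * Build: gcc demand_paging.c -o demand_paging
 */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/resource.h>

static long minor_faults(void)
{
    struct rusage ru;

    getrusage(RUSAGE_SELF, &ru);
    return ru.ru_minflt;              /* faults resolved without disk I/O */
}

int main(void)
{
    const size_t len = 64 * 4096;     /* 64 pages */
    char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

    if (p == MAP_FAILED) {
        perror("mmap");
        return 1;
    }

    long before = minor_faults();
    memset(p, 1, len);                /* first touch: one fault per page */
    long after = minor_faults();

    printf("minor faults while touching 64 pages: %ld\n", after - before);
    munmap(p, len);
    return 0;
}

On a typical system this prints a number close to 64, one minor fault per freshly touched page (transparent huge pages can make the count smaller).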

Demand paging goes through the exception/interrupt-handling path; we skip the details of the interrupt machinery here.

Analysis

We take the most common architecture, x86, as the example and start from do_page_fault():

dotraplinkage void notrace
do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
    /* the CR2 register contains the virtual address at which the most recent page fault occurred */
    unsigned long address = read_cr2(); /* Get the faulting address */
    /*
     ...
     */
    __do_page_fault(regs, error_code, address); /* handle the page fault */
    exception_exit(prev_state);
}
NOKPROBE_SYMBOL(do_page_fault);

__do_page_fault:

static noinline void
__do_page_fault(struct pt_regs *regs, unsigned long hw_error_code,
                unsigned long address)
{
    prefetchw(&current->mm->mmap_sem);

    if (unlikely(kmmio_fault(regs, address)))
        return;

    /*
     * Was the fault on kernel-controlled part of the address space?
     * Check the address to tell whether the fault hit kernel or user space.
     */
    if (unlikely(fault_in_kernel_space(address)))
        /* handle a kernel-space page fault */
        do_kern_addr_fault(regs, hw_error_code, address);
    else
        /* handle a user-space page fault */
        do_user_addr_fault(regs, hw_error_code, address);
}
NOKPROBE_SYMBOL(__do_page_fault);

Handling page faults in user space

The hw_error_code passed into do_user_addr_fault() is the page-fault error code; its bits have the following meanings:

/*
* Page fault error code bits:
*
* bit 0 == 0: no page found 1: protection fault
* bit 1 == 0: read access 1: write access
* bit 2 == 0: kernel-mode access 1: user-mode access
* bit 3 == 1: use of reserved bit detected
* bit 4 == 1: fault was an instruction fetch
* bit 5 == 1: protection keys block access
*/
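
As a quick illustration, the error code can be decoded bit by bit; the constants below are defined locally and merely mirror the bit layout documented above (the kernel's own names are X86_PF_PROT, X86_PF_WRITE, and so on).

#include <stdio.h>

#define PF_PROT  (1UL << 0)   /* 0: page not present, 1: protection fault */
#define PF_WRITE (1UL << 1)   /* 0: read access,      1: write access */
#define PF_USER  (1UL << 2)   /* 0: kernel mode,      1: user mode */
#define PF_RSVD  (1UL << 3)   /* reserved bit set in a paging entry */
#define PF_INSTR (1UL << 4)   /* fault was an instruction fetch */
#define PF_PK    (1UL << 5)   /* protection keys blocked the access */

static void decode(unsigned long ec)
{
    printf("cause : %s\n", ec & PF_PROT  ? "protection violation" : "page not present");
    printf("access: %s\n", ec & PF_WRITE ? "write" : "read");
    printf("mode  : %s\n", ec & PF_USER  ? "user"  : "kernel");
    if (ec & PF_RSVD)  printf("reserved bit set\n");
    if (ec & PF_INSTR) printf("instruction fetch\n");
    if (ec & PF_PK)    printf("protection-key violation\n");
}

int main(void)
{
    decode(0x6);   /* bit 1 | bit 2: a user-mode write to a not-present page */
    return 0;
}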

The implementation of do_user_addr_fault():

/* Handle faults in the user portion of the address space */
static inline
void do_user_addr_fault(struct pt_regs *regs,
                        unsigned long hw_error_code,
                        unsigned long address)
{
    tsk = current;
    mm = tsk->mm; /* get the process's memory descriptor mm */
    /* ... */
    /*
     * hw_error_code is literally the "page fault error code" passed to
     * the kernel directly from the hardware. But, we will shortly be
     * modifying it in software, so give it a new name.
     */
    sw_error_code = hw_error_code;

    /* ... */

    vma = find_vma(mm, address); /* look up the vma covering address in the memory descriptor mm */
    if (unlikely(!vma)) { /* no vma covers this address */
        bad_area(regs, sw_error_code, address); /* illegal address access */
        return;
    }
    if (likely(vma->vm_start <= address)) /* the address is valid, jump to good_area */
        goto good_area;
    if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
        bad_area(regs, sw_error_code, address);
        return;
    }
    if (sw_error_code & X86_PF_USER) {
        /*
         * Accessing the stack below %sp is always a bug.
         * The large cushion allows instructions like enter
         * and pusha to work. ("enter $65535, $31" pushes
         * 32 pointers and then decrements %sp by 65535.)
         */
        /* access far beyond the current stack */
        if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
            bad_area(regs, sw_error_code, address);
            return;
        }
    }
    if (unlikely(expand_stack(vma, address))) {
        bad_area(regs, sw_error_code, address);
        return;
    }

The handler first tries to find the vma (virtual memory area) for the faulting address in the process's memory descriptor mm. If no vma covers the address, an illegal virtual address was accessed and bad_area() is taken; likewise, an out-of-bounds stack access or a segment permission error also ends in bad_area().
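
From the process's point of view, the bad_area() path for a user-mode access ends in a SIGSEGV whose siginfo carries the faulting address the kernel read from CR2. The following sketch (illustrative, not from the kernel source) shows this:

#define _GNU_SOURCE
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static void on_segv(int sig, siginfo_t *si, void *ctx)
{
    (void)sig;
    (void)ctx;
    /* printf is not async-signal-safe; acceptable for a one-shot demo */
    printf("SIGSEGV at address %p\n", si->si_addr);
    _exit(0);
}

int main(void)
{
    struct sigaction sa = { 0 };

    sa.sa_sigaction = on_segv;
    sa.sa_flags = SA_SIGINFO;
    sigaction(SIGSEGV, &sa, NULL);

    volatile char *bogus = (volatile char *)0xdeadbeef; /* almost certainly unmapped */
    *bogus = 1;                                         /* find_vma() fails -> bad_area() -> SIGSEGV */
    return 1;                                           /* not reached */
}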

If the accessed address is valid, control jumps to good_area:

good_area:
    if (unlikely(access_error(sw_error_code, vma))) { /* check whether the fault type is permitted by the vma's access rights */
        bad_area_access_error(regs, sw_error_code, address, vma);
        return;
    }

    /*
     * If for any reason at all we couldn't handle the fault,
     * make sure we exit gracefully rather than endlessly redo
     * the fault. Since we never set FAULT_FLAG_RETRY_NOWAIT, if
     * we get VM_FAULT_RETRY back, the mmap_sem has been unlocked.
     *
     * Note that handle_userfault() may also release and reacquire mmap_sem
     * (and not return with VM_FAULT_RETRY), when returning to userland to
     * repeat the page fault later with a VM_FAULT_NOPAGE retval
     * (potentially after handling any pending signal during the return to
     * userland). The return to userland is identified whenever
     * FAULT_FLAG_USER|FAULT_FLAG_KILLABLE are both set in flags.
     */
    fault = handle_mm_fault(vma, address, flags); /* the actual fault handling */
    major |= fault & VM_FAULT_MAJOR;

    /* ... */

    check_v8086_mode(regs, address, tsk);
}
NOKPROBE_SYMBOL(do_user_addr_fault);

If the access permissions of this virtual area match the type of access that caused the fault, handle_mm_fault() is called; unless the vma is a huge-page (hugetlb) area, which takes its own path, handle_mm_fault() calls __handle_mm_fault() to do the actual work:
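
For reference, the dispatch inside handle_mm_fault() (mm/memory.c) looks roughly like the sketch below; it is paraphrased and heavily trimmed, so treat the exact shape as approximate for 4.20 and check the source for details.

vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
                           unsigned int flags)
{
    vm_fault_t ret;

    /* ... accounting, state setup and sanity checks elided ... */

    if (unlikely(is_vm_hugetlb_page(vma)))
        /* huge-page VMAs take their own fault path */
        ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
    else
        /* ordinary VMAs: walk/allocate the page tables and fill the pte */
        ret = __handle_mm_fault(vma, address, flags);

    /* ... */
    return ret;
}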

__handle_mm_fault()

mm/memory.c

static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
                unsigned long address, unsigned int flags)
{
    struct vm_fault vmf = {
        .vma = vma,
        .address = address & PAGE_MASK,
        .flags = flags,
        .pgoff = linear_page_index(vma, address),
        .gfp_mask = __get_fault_gfp_mask(vma),
    };
    unsigned int dirty = flags & FAULT_FLAG_WRITE;
    struct mm_struct *mm = vma->vm_mm;
    pgd_t *pgd;
    p4d_t *p4d;
    vm_fault_t ret;

    pgd = pgd_offset(mm, address); /* get a pointer to the pgd entry for address in the given mm */
    p4d = p4d_alloc(mm, pgd, address); /* with 4-level paging on x86 this does nothing and folds back to the pgd */
    if (!p4d)
        return VM_FAULT_OOM;

    vmf.pud = pud_alloc(mm, p4d, address); /* create and allocate a Page Upper Directory entry */
    /* ... */
    vmf.pmd = pmd_alloc(mm, vmf.pud, address);
    if (!vmf.pmd)
        return VM_FAULT_OOM;
    /* ... */
    /* decide from vmf how to supply the missing page */
    return handle_pte_fault(&vmf);
}

handle_pte_fault(): allocating the page

static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
{
    pte_t entry;

    if (unlikely(pmd_none(*vmf->pmd))) {
        /* the page middle directory is empty, so there is no page table yet */
        vmf->pte = NULL;
    } else {
        /* ... */
        /* the pmd exists; map the page table entry for address */
        vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
        vmf->orig_pte = *vmf->pte;
        /*
         * some architectures can have larger ptes than wordsize,
         * e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and
         * CONFIG_32BIT=y, so READ_ONCE cannot guarantee atomic
         * accesses. The code below just needs a consistent view
         * for the ifs and we later double check anyway with the
         * ptl lock held. So here a barrier will do.
         */
        barrier();
        if (pte_none(vmf->orig_pte)) {
            /* the pmd exists but the pte is empty, so reset vmf->pte to NULL */
            pte_unmap(vmf->pte);
            vmf->pte = NULL;
        }
    }

    /* vmf->pte is NULL: no page table entry has been set up for the missing page yet */
    if (!vmf->pte) {
        if (vma_is_anonymous(vmf->vma))
            /* fault in an anonymous mapping */
            return do_anonymous_page(vmf);
        else
            /* fault in a file-backed mapping */
            return do_fault(vmf);
    }

    /* the pte exists, but the page is not present in physical memory */
    if (!pte_present(vmf->orig_pte))
        /* swap the page back in from the swap area on disk */
        return do_swap_page(vmf);

    if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
        return do_numa_page(vmf);

    vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
    spin_lock(vmf->ptl);
    entry = vmf->orig_pte;
    if (unlikely(!pte_same(*vmf->pte, entry)))
        goto unlock;
    /* the pte exists and the page is present; a write triggered the fault, i.e. a COW fault */
    if (vmf->flags & FAULT_FLAG_WRITE) {
        if (!pte_write(entry))
            /* handle the write side of Copy On Write */
            return do_wp_page(vmf);
        entry = pte_mkdirty(entry);
    }

    /* ... */
}
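
handle_pte_fault() keys its decisions off whether a page table entry exists and whether the page is present (pte_present()). That presence is also visible from userspace; the sketch below (illustrative, assuming the documented 8-byte entry format of /proc/self/pagemap) reads bit 63 of the pagemap entry before and after the first touch of a freshly mmap'ed page.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <unistd.h>

static int page_present(void *addr)
{
    uint64_t entry;
    long pagesize = sysconf(_SC_PAGESIZE);
    off_t off = (off_t)((uintptr_t)addr / pagesize) * sizeof(entry);
    int fd = open("/proc/self/pagemap", O_RDONLY);
    ssize_t n;

    if (fd < 0)
        return -1;
    n = pread(fd, &entry, sizeof(entry), off);
    close(fd);
    if (n != (ssize_t)sizeof(entry))
        return -1;
    return (int)((entry >> 63) & 1);        /* bit 63: page present in RAM */
}

int main(void)
{
    char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

    printf("before first touch: present = %d\n", page_present(p)); /* usually 0 */
    p[0] = 'x';                                                    /* fault the page in */
    printf("after first touch:  present = %d\n", page_present(p)); /* usually 1 */
    munmap(p, 4096);
    return 0;
}

On recent kernels unprivileged readers see the PFN field of pagemap zeroed, but the present bit is still reported.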

The concrete flow of fault handling:

  • The page table entry for the accessed address has not been allocated yet:

    When the page has never been accessed, there are two ways to bring in the missing page, depending on whether the page is backed by a file on disk:

    vma->vm_ops is not NULL

    The vma corresponds to a file on disk, so vma->vm_ops->fault(vmf) is called.

    vma->vm_ops is NULL

    The vma has no backing file on disk, i.e. it is an anonymous mapping, so do_anonymous_page(vmf) allocates the page.

    • Handling a read fault: do_read_fault()

      This uses the fault handler the filesystem installed in the vma; on EXT4 it is ext4_filemap_fault(), whose logic is simply reading the file: look the page up in the Page Cache first and, if it is not there, read it from the file into the Page Cache.

    • Handling a copy-on-write fault: do_cow_fault()

    • Handling a shared-page fault: do_shared_fault()

  • The page table entry has been allocated, but the page is out on the swap area:

    do_swap_page()

  • The page table entry has been allocated and the page is present in physical memory, i.e. a write triggered a copy-on-write (COW) fault:

    do_wp_page()

    Copy-on-write itself needs little introduction; let us only look at how the Linux kernel handles it (a userspace sketch follows this list):

    • Allocate a new page for the vma.
    • Call vma->vm_ops->fault(vmf) to read in the data (on the file-backed path).
    • Copy the contents of the old page into the newly allocated page.
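
A userspace sketch (illustrative, not from the article) shows the do_wp_page() path at work: after fork() parent and child share the same physical pages write-protected, and the child's first write to each page is resolved by copy-on-write and shows up as a minor fault.

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/resource.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
    const size_t len = 256 * 4096;    /* 256 pages */
    char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

    if (p == MAP_FAILED) {
        perror("mmap");
        return 1;
    }
    memset(p, 1, len);                /* fault everything in before forking */

    if (fork() == 0) {                /* child */
        struct rusage ru;
        long before;

        getrusage(RUSAGE_SELF, &ru);
        before = ru.ru_minflt;

        memset(p, 2, len);            /* writes hit write-protected shared pages */

        getrusage(RUSAGE_SELF, &ru);
        printf("COW minor faults in child: %ld\n", ru.ru_minflt - before);
        _exit(0);
    }
    wait(NULL);
    munmap(p, len);
    return 0;
}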

Summary

From this analysis we can see that mmap only creates the virtual address area (vma); physical memory is actually allocated only when a data page is read or written, at which point the page-fault handling builds or updates the page tables. Allocating physical memory involves the kernel's buddy system and slab allocator: after the page fault, the kernel allocates a physical page frame for the vma and then completes the request through the appropriate fault-handling path. Note that the vma structure has no member pointing at a physical page frame, because the physical page can be reached by translating the vma's virtual address through the page tables, so no direct link is needed.