/*
* This struct defines a memory VMM memory area. There is one of these
* per VM-area/task. A VM area is any part of the process virtual memory
* space that has a special rule for the page-fault handlers (ie a shared
* library, the executable area etc).
*/structvm_area_struct{/* The first cache line has the info for VMA tree walking. */unsignedlongvm_start;/* 在虚拟地址空间的起始位置 */unsignedlongvm_end;/* 在虚拟地址空间的结束位置*//* linked list of VM areas per task, sorted by address */structvm_area_struct*vm_next,*vm_prev;/* 虚拟内存区域链表中的前继,后继指针 */structrb_nodevm_rb;/*
* Largest free memory gap in bytes to the left of this VMA.
* Either between this VMA and vma->vm_prev, or between one of the
* VMAs below us in the VMA rbtree and its ->vm_prev. This helps
* get_unmapped_area find a free area of the right size.
*/unsignedlongrb_subtree_gap;/* Second cache line starts here. *//* Function pointers to deal with this struct. */conststructvm_operations_struct*vm_ops;/* 虚拟内存操作集合 */structmm_struct*vm_mm;/* vma所属的虚拟地址空间 */pgprot_tvm_page_prot;/* Access permissions of this VMA. */unsignedlongvm_flags;/* Flags, see mm.h. */unsignedlongvm_pgoff;/* 以Page为单位的偏移. */structfile*vm_file;/* 映射的文件,匿名映射即为nullptr*/
下图是某个进程的虚拟内存简化布局以及相应的几个数据结构之间的关系:
mmap映射执行流程
检查参数,并根据传入的映射类型设置 vma 的flags.
进程查找其虚拟地址空间,找到一块空闲的满足要求的虚拟地址空间.
根据找到的虚拟地址空间初始化 vma.
设置 vma->vm_file.
根据文件系统类型,调用该文件 file_operations 中的 mmap 回调(例如 ext4 的 ext4_file_mmap()),由它将 vma->vm_ops 设为对应的 vm_operations_struct.
将 vma 插入 mm 的链表中.
源码分析
我们接下来进入 mmap 的代码分析:
do_mmap()
do_mmap() 是整个 mmap() 的具体操作函数, 我们跳过系统调用来直接看具体实现:
unsignedlongdo_mmap(structfile*file,unsignedlongaddr,unsignedlonglen,unsignedlongprot,unsignedlongflags,vm_flags_tvm_flags,unsignedlongpgoff,unsignedlong*populate,structlist_head*uf){structmm_struct*mm=current->mm;/* 获取该进程的memory descriptor
int pkey = 0;
*populate = 0;
/*
函数对传入的参数进行一系列检查, 假如任一参数出错,都会返回一个errno
*/if(!len)return-EINVAL;/*
* Does the application expect PROT_READ to imply PROT_EXEC?
*
* (the exception is when the underlying filesystem is noexec
* mounted, in which case we dont add PROT_EXEC.)
*/if((prot&PROT_READ)&&(current->personality&READ_IMPLIES_EXEC))if(!(file&&path_noexec(&file->f_path)))prot|=PROT_EXEC;/* force arch specific MAP_FIXED handling in get_unmapped_area */if(flags&MAP_FIXED_NOREPLACE)flags|=MAP_FIXED;/* 假如没有设置MAP_FIXED标志,且addr小于mmap_min_addr, 因为可以修改addr, 所以就需要将addr设为mmap_min_addr的页对齐后的地址 */if(!(flags&MAP_FIXED))addr=round_hint_to_min(addr);/* Careful about overflows.. *//* 进行Page大小的对齐 */len=PAGE_ALIGN(len);if(!len)return-ENOMEM;/* offset overflow? */if((pgoff+(len>>PAGE_SHIFT))<pgoff)return-EOVERFLOW;/* Too many mappings? *//* 判断该进程的地址空间的虚拟区间数量是否超过了限制 */if(mm->map_count>sysctl_max_map_count)return-ENOMEM;/* Obtain the address to map to. we verify (or select) it and ensure
* that it represents a valid section of the address space.
*//* get_unmapped_area从当前进程的用户空间获取一个未被映射区间的起始地址 */addr=get_unmapped_area(file,addr,len,pgoff,flags);/* 检查addr是否有效 */if(offset_in_page(addr))returnaddr;/* 假如flags设置MAP_FIXED_NOREPLACE,需要对进程的地址空间进行addr的检查. 如果搜索发现存在重合的vma, 返回-EEXIST。
这是MAP_FIXED_NOREPLACE标志所要求的
*/if(flags&MAP_FIXED_NOREPLACE){structvm_area_struct*vma=find_vma(mm,addr);if(vma&&vma->vm_start<addr+len)return-EEXIST;}if(prot==PROT_EXEC){pkey=execute_only_pkey(mm);if(pkey<0)pkey=0;}/* Do simple checking here so the lower-level routines won't have
* to. we assume access permissions have been handled by the open
* of the memory object, so we don't do any here.
*/vm_flags|=calc_vm_prot_bits(prot,pkey)|calc_vm_flag_bits(flags)|mm->def_flags|VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC;/* 假如flags设置MAP_LOCKED,即类似于mlock()将申请的地址空间锁定在内存中, 检查是否可以进行lock*/if(flags&MAP_LOCKED)if(!can_do_mlock())return-EPERM;if(mlock_future_check(mm,vm_flags,len))return-EAGAIN;if(file){/* file指针不为nullptr, 即从文件到虚拟空间的映射 */structinode*inode=file_inode(file);/* 获取文件的inode */unsignedlongflags_mask;if(!file_mmap_ok(file,inode,pgoff,len))return-EOVERFLOW;flags_mask=LEGACY_MAP_MASK|file->f_op->mmap_supported_flags;/*
...
根据标志指定的map种类,把为文件设置的访问权考虑进去。
如果所请求的内存映射是共享可写的,就要检查要映射的文件是为写入而打开的,而不
是以追加模式打开的,还要检查文件上没有上强制锁。
对于任何种类的内存映射,都要检查文件是否为读操作而打开的。
...
*/}else{switch(flags&MAP_TYPE){caseMAP_SHARED:if(vm_flags&(VM_GROWSDOWN|VM_GROWSUP))return-EINVAL;/*
* Ignore pgoff.
*/pgoff=0;vm_flags|=VM_SHARED|VM_MAYSHARE;break;caseMAP_PRIVATE:/*
* Set pgoff according to addr for anon_vma.
*/pgoff=addr>>PAGE_SHIFT;break;default:return-EINVAL;}}/*
* Set 'VM_NORESERVE' if we should not account for the
* memory use of this mapping.
*/if(flags&MAP_NORESERVE){/* We honor MAP_NORESERVE if allowed to overcommit */if(sysctl_overcommit_memory!=OVERCOMMIT_NEVER)vm_flags|=VM_NORESERVE;/* hugetlb applies strict overcommit unless MAP_NORESERVE */if(file&&is_file_hugepages(file))vm_flags|=VM_NORESERVE;}addr=mmap_region(file,addr,len,vm_flags,pgoff,uf);if(!IS_ERR_VALUE(addr)&&((vm_flags&VM_LOCKED)||(flags&(MAP_POPULATE|MAP_NONBLOCK))==MAP_POPULATE))*populate=len;returnaddr;
unsignedlongmmap_region(structfile*file,unsignedlongaddr,unsignedlonglen,vm_flags_tvm_flags,unsignedlongpgoff,structlist_head*uf){structmm_struct*mm=current->mm;// 获取该进程的memory descriptor
structvm_area_struct*vma,*prev;interror;structrb_node**rb_link,*rb_parent;unsignedlongcharged=0;/* Check against address space limit. *//* 检查申请的虚拟内存空间是否超过了限制. */if(!may_expand_vm(mm,vm_flags,len>>PAGE_SHIFT)){unsignedlongnr_pages;/*
* MAP_FIXED may remove pages of mappings that intersects with
* requested mapping. Account for the pages it would unmap.
*/nr_pages=count_vma_pages_range(mm,addr,addr+len);if(!may_expand_vm(mm,vm_flags,(len>>PAGE_SHIFT)-nr_pages))return-ENOMEM;}/* 检查[addr, addr+len)的区间是否存在映射空间,假如存在重合的映射空间需要munmap */while(find_vma_links(mm,addr,addr+len,&prev,&rb_link,&rb_parent)){if(do_munmap(mm,addr,len,uf))return-ENOMEM;}/*
* Private writable mapping: check memory availability
*/if(accountable_mapping(file,vm_flags)){charged=len>>PAGE_SHIFT;if(security_vm_enough_memory_mm(mm,charged))return-ENOMEM;vm_flags|=VM_ACCOUNT;}/* 检查是否可以合并[addr, addr+len)区间内的虚拟地址空间vma*/vma=vma_merge(mm,prev,addr,addr+len,vm_flags,NULL,file,pgoff,NULL,NULL_VM_UFFD_CTX);if(vma)/* 假如合并成功,即使用合并后的vma, 并跳转至out */gotoout;/*
* Determine the object being mapped and call the appropriate
* specific mapper. the address has already been validated, but
* not unmapped, but the maps are removed from the list.
*//* 如果不能和已有的虚拟内存区域合并,通过 Memory Descriptor 来申请一个 vma */vma=vm_area_alloc(mm);if(!vma){error=-ENOMEM;gotounacct_error;}/* 初始化 vma */vma->vm_start=addr;vma->vm_end=addr+len;vma->vm_flags=vm_flags;vma->vm_page_prot=vm_get_page_prot(vm_flags);vma->vm_pgoff=pgoff;if(file){/* 假如指定了文件映射 */if(vm_flags&VM_DENYWRITE){/* 映射的文件不允许写入,调用 deny_write_accsess(file) 排斥常规的文件操作 */error=deny_write_access(file);if(error)gotofree_vma;}if(vm_flags&VM_SHARED){/* 映射的文件允许其他进程可见, 标记文件为可写 */error=mapping_map_writable(file->f_mapping);if(error)gotoallow_write_and_free_vma;}/* ->mmap() can change vma->vm_file, but must guarantee that
* vma_link() below can deny write-access if VM_DENYWRITE is set
* and map writably if VM_SHARED is set. This usually means the
* new file must not have been exposed to user-space, yet.
*/vma->vm_file=get_file(file);/* 递增 File 的引用次数,返回 File 赋给 vma */error=call_mmap(file,vma);/* 调用文件系统指定的 mmap 函数,后面会介绍 */if(error)gotounmap_and_free_vma;/* Can addr have changed??
*
* Answer: Yes, several device drivers can do it in their
* f_op->mmap method. -DaveM
* Bug: If addr is changed, prev, rb_link, rb_parent should
* be updated for vma_link()
*/WARN_ON_ONCE(addr!=vma->vm_start);addr=vma->vm_start;vm_flags=vma->vm_flags;}elseif(vm_flags&VM_SHARED){/* 假如标志为 VM_SHARED,但没有指定映射文件,需要调用 shmem_zero_setup()
shmem_zero_setup() 实际映射的文件是 dev/zero
*/error=shmem_zero_setup(vma);if(error)gotofree_vma;}else{/* 既没有指定 file, 也没有设置 VM_SHARED, 即设置为匿名映射 */vma_set_anonymous(vma);}/* 将申请的新 vma 加入 mm 中的 vma 链表*/vma_link(mm,vma,prev,rb_link,rb_parent);/* Once vma denies write, undo our temporary denial count */if(file){if(vm_flags&VM_SHARED)mapping_unmap_writable(file->f_mapping);if(vm_flags&VM_DENYWRITE)allow_write_access(file);}file=vma->vm_file;out:perf_event_mmap(vma);/* 更新进程的虚拟地址空间 mm */vm_stat_account(mm,vm_flags,len>>PAGE_SHIFT);if(vm_flags&VM_LOCKED){if((vm_flags&VM_SPECIAL)||vma_is_dax(vma)||is_vm_hugetlb_page(vma)||vma==get_gate_vma(current->mm))vma->vm_flags&=VM_LOCKED_CLEAR_MASK;elsemm->locked_vm+=(len>>PAGE_SHIFT);}if(file)uprobe_mmap(vma);/*
* New (or expanded) vma always get soft dirty status.
* Otherwise user-space soft-dirty page tracker won't
* be able to distinguish situation when vma area unmapped,
* then new mapped in-place (which must be aimed as
* a completely new data area).
*/vma->vm_flags|=VM_SOFTDIRTY;vma_set_page_prot(vma);returnaddr;unmap_and_free_vma:vma->vm_file=NULL;fput(file);/* Undo any partial mapping done by a device driver. */unmap_region(mm,vma,prev,vma->vm_start,vma->vm_end);charged=0;if(vm_flags&VM_SHARED)mapping_unmap_writable(file->f_mapping);allow_write_and_free_vma:if(vm_flags&VM_DENYWRITE)allow_write_access(file);free_vma:vm_area_free(vma);unacct_error:if(charged)vm_unacct_memory(charged);returnerror;}
staticconststructvm_operations_structext4_file_vm_ops={.fault=ext4_filemap_fault,.map_pages=filemap_map_pages,.page_mkwrite=ext4_page_mkwrite,};staticintext4_file_mmap(structfile*file,structvm_area_struct*vma){structinode*inode=file->f_mapping->host;if(unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))return-EIO;/*
* We don't support synchronous mappings for non-DAX files. At least
* until someone comes with a sensible use case.
*/if(!IS_DAX(file_inode(file))&&(vma->vm_flags&VM_SYNC))return-EOPNOTSUPP;file_accessed(file);if(IS_DAX(file_inode(file))){vma->vm_ops=&ext4_dax_vm_ops;vma->vm_flags|=VM_HUGEPAGE;}else{vma->vm_ops=&ext4_file_vm_ops;}return0;}