Linux memory management: an analysis of the sys_brk implementation (Part 2)
4 Code analysis of the brk() system call
// brk grows or shrinks the process's data-segment boundary; the brk argument is the new boundary. (User-space sbrk() is a library wrapper around this system call.) The implementation lives in mm/mmap.c.
The function is defined as follows:
SYSCALL_DEFINE1(brk, unsigned long, brk)
{
unsigned long rlim, retval;
unsigned long newbrk, oldbrk;
struct mm_struct *mm = current->mm;
unsigned long min_brk;
//Take mm->mmap_sem for writing: the heap boundary and the VMA list are about to be modified
down_write(&mm->mmap_sem);
#ifdef CONFIG_COMPAT_BRK
min_brk = mm->end_code;
#else
min_brk = mm->start_brk;
#endif
if (brk < min_brk)
goto out;
/*
* Check against rlimit here. If this check is done later after the test
* of oldbrk with newbrk then it can escape the test and let the data
* segment grow beyond its set limit the in case where the limit is
* not page aligned -Ram Gupta
*/
//Validate the request against the data-segment resource limit
rlim = rlimit(RLIMIT_DATA); //RLIMIT_DATA: the data segment limit, in bytes
if (rlim < RLIM_INFINITY && (brk - mm->start_brk) +
(mm->end_data - mm->start_data) > rlim)
goto out;
//Page-align the new and old boundaries
newbrk = PAGE_ALIGN(brk);
oldbrk = PAGE_ALIGN(mm->brk);
if (oldbrk == newbrk)
goto set_brk;
/* Always allow shrinking brk. */
//If the new boundary is below the current one, shrink the heap
if (brk <= mm->brk) {
if (!do_munmap(mm, newbrk, oldbrk-newbrk))
goto set_brk;
goto out;
}
/* Check against existing mmap mappings. */
//The range we want to grow into already contains a mapping
if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE))
goto out;
/* Ok, looks good - let it rip. */
//Grow the heap
if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk)
goto out;
set_brk:
mm->brk = brk;
out:
retval = mm->brk;
up_write(&mm->mmap_sem);
return retval;
}
This function can both allocate space, pushing the bottom boundary of the dynamic allocation area upward, and release space, handing it back. Its code therefore falls into two parts: shrinking the data region and growing it. We analyze the two cases separately below.
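As a quick sanity check from user space, the sketch below (my own example, not from the article) drives both paths through glibc's sbrk()/brk() wrappers: growing the break exercises the do_brk() path, and moving it back down exercises the do_munmap() path.

#include <stdio.h>
#include <unistd.h>

int main(void)
{
        void *old = sbrk(0);                    /* current program break */
        printf("break at      %p\n", old);

        if (sbrk(4096) == (void *)-1) {         /* grow by one page: do_brk() path */
                perror("sbrk");
                return 1;
        }
        printf("after +4096:  %p\n", sbrk(0));

        if (brk(old) != 0) {                    /* shrink back: do_munmap() path */
                perror("brk");
                return 1;
        }
        printf("after shrink: %p\n", sbrk(0));
        return 0;
}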
4.1 Shrinking the user address space
4.1.1 do_munmap
As the code above shows, the shrink path of the user address space goes through do_munmap(). The code is as follows:
/* Munmap is split into 2 main parts -- this part which finds
* what needs doing, and the areas themselves, which do the
* work. This now handles partial unmappings.
* Jeremy Fitzhardinge <jeremy@goop.org>
*/
int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
{
unsigned long end;
struct vm_area_struct *vma, *prev, *last;
if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start)
return -EINVAL;
if ((len = PAGE_ALIGN(len)) == 0)
return -EINVAL;
/* Find the first overlapping VMA */
//Find the first VMA whose end address is greater than start; prev is the VMA before it
vma = find_vma_prev(mm, start, &prev);
if (!vma)
return 0;
/* we have start < vma->vm_end */
/* if it doesn't overlap, we have nothing.. */
//The current heap end cannot fall inside a hole:
//start is the new boundary address and len the length to shrink, so start+len is the old boundary,
//which must therefore be a valid linear address of the process
end = start + len;
if (vma->vm_start >= end)
return 0;
/*
* If we need to split any vma, do it now to save pain later.
* Note: mremap's move_vma VM_ACCOUNT handling assumes a partially
* unmapped vm_area_struct will remain in use: so lower split_vma
* places tmp vma above, and higher split_vma places tmp vma below.
*/
//If start is above the VMA's start address, split the VMA in two
if (start > vma->vm_start) {
int error;
/*
* Make sure that map_count on return from munmap() will
* not exceed its limit; but let map_count go just above
* its limit temporarily, to help free resources as expected.
*/
if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
return -ENOMEM;
error = __split_vma(mm, vma, start, 0);
if (error)
return error;
prev = vma;
}
/* Does it split the last one? */
//Find the last affected VMA
last = find_vma(mm, end);
//If end falls inside it, that last VMA is also split in two
if (last && end > last->vm_start) {
int error = __split_vma(mm, last, end, 1);
if (error)
return error;
}
vma = prev? prev->vm_next: mm->mmap;
/*
* unlock any mlock()ed ranges before detaching vmas
*/
if (mm->locked_vm) {
struct vm_area_struct *tmp = vma;
while (tmp && tmp->vm_start < end) {
if (tmp->vm_flags & VM_LOCKED) {
mm->locked_vm -= vma_pages(tmp);
munlock_vma_pages_all(tmp);
}
tmp = tmp->vm_next;
}
}
/*
* Remove the vma's, and unmap the actual pages
*/
//Detach the affected VMAs from the process's VMA list
detach_vmas_to_be_unmapped(mm, vma, prev, end);
//Update the page-table entries and free the page frames
unmap_region(mm, vma, prev, start, end);
/* Fix up all other VM information */
//By now all VMAs to be freed hang off the detached chain headed by vma; remove_vma_list() disposes of that chain
remove_vma_list(mm, vma);
return 0;
}
4.1.1.1 __split_vma
To understand the whole shrink process, it is worth analyzing in detail each helper this function calls.
__split_vma() splits one VMA into two:
//Parameters:
//mm: the process's memory descriptor; vma: the VMA to split; addr: the split address;
//new_below: 0 means vma becomes the lower half, 1 means vma becomes the upper half
/*
* __split_vma() bypasses sysctl_max_map_count checking. We use this on the
* munmap path where it doesn't make sense to fail.
*/
static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
unsigned long addr, int new_below)
{
struct mempolicy *pol;
struct vm_area_struct *new;
int err = -ENOMEM;
//A hugetlb VMA may only be split at a huge-page boundary
if (is_vm_hugetlb_page(vma) && (addr &
~(huge_page_mask(hstate_vma(vma)))))
return -EINVAL;
//Allocate a new VMA
new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
if (!new)
goto out_err;
//Initialize the new VMA as a copy of the old one
/* most fields are the same, copy all, and then fixup */
*new = *vma;
INIT_LIST_HEAD(&new->anon_vma_chain);
//When new_below is 1, vma becomes the upper half and new the lower half
if (new_below)
new->vm_end = addr;
else {
new->vm_start = addr;
new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
}
pol = mpol_dup(vma_policy(vma));
if (IS_ERR(pol)) {
err = PTR_ERR(pol);
goto out_free_vma;
}
vma_set_policy(new, pol);
if (anon_vma_clone(new, vma))
goto out_free_mpol;
if (new->vm_file) {
get_file(new->vm_file);
if (vma->vm_flags & VM_EXECUTABLE)
added_exe_file_vma(mm);
}
//Call the open method if one is defined
if (new->vm_ops && new->vm_ops->open)
new->vm_ops->open(new);
//After the initialization above, vma_adjust() trims vma's boundary and inserts new
if (new_below)
err = vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff +
((addr - new->vm_start) >> PAGE_SHIFT), new);
else
err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
/* Success. */
if (!err)
return 0;
/* Clean everything up if vma_adjust failed. */
//If the adjustment failed, clean up
if (new->vm_ops && new->vm_ops->close)
new->vm_ops->close(new);
if (new->vm_file) {
if (vma->vm_flags & VM_EXECUTABLE)
removed_exe_file_vma(mm);
fput(new->vm_file);
}
unlink_anon_vmas(new);
out_free_mpol:
mpol_put(pol);
out_free_vma:
kmem_cache_free(vm_area_cachep, new);
out_err:
return err;
}
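To make the new_below convention concrete, here is a stand-alone sketch (my own, with plain intervals instead of real vm_area_structs) of the net effect of __split_vma() plus vma_adjust():

#include <stdio.h>

struct range { unsigned long start, end; };

/* Split [vma->start, vma->end) at addr.
 * new_below == 1: new takes the lower half, vma keeps [addr, end)
 * new_below == 0: new takes the upper half, vma keeps [start, addr) */
static void split_range(struct range *vma, struct range *new,
                        unsigned long addr, int new_below)
{
        *new = *vma;                    /* copy everything, then fix up, like *new = *vma */
        if (new_below) {
                new->end = addr;
                vma->start = addr;      /* what vma_adjust(vma, addr, vm_end, ...) does */
        } else {
                new->start = addr;
                vma->end = addr;        /* what vma_adjust(vma, vm_start, addr, ...) does */
        }
}

int main(void)
{
        struct range vma = { 0x8000, 0x10000 }, new;

        split_range(&vma, &new, 0xa000, 0);     /* new is the upper half */
        printf("vma [%#lx, %#lx)  new [%#lx, %#lx)\n",
               vma.start, vma.end, new.start, new.end);
        return 0;
}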
4.1.1.1.1 vma_adjust
Moving on to vma_adjust(): it adjusts a VMA's start and end boundaries and inserts the new VMA into the process's VMA list, among other things. Its prototype is:
/*
* We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that
* is already present in an i_mmap tree without adjusting the tree.
* The following helper function should be used when such adjustments
* are necessary. The "insert" vma (if any) is to be inserted
* before we drop the necessary locks.
*/
int vma_adjust(struct vm_area_struct *vma, unsigned long start,
unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
4.1.1.2 detach_vmas_to_be_unmapped
The second function to analyze is detach_vmas_to_be_unmapped().
It chains together the VMAs to be deleted and, at the same time, unlinks them from mm.
//Parameters:
/*
mm: the process's memory descriptor
vma: the first VMA to delete
prev: the VMA immediately before vma
end: the end address
*/
/*
* Create a list of vma's touched by the unmap, removing them from the mm's
* vma list as we go..
*/
static void
detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *prev, unsigned long end)
{
struct vm_area_struct **insertion_point;
struct vm_area_struct *tail_vma = NULL;
unsigned long addr;
insertion_point = (prev ? &prev->vm_next : &mm->mmap);
vma->vm_prev = NULL;
do {
//Remove the vma from the red-black tree
rb_erase(&vma->vm_rb, &mm->mm_rb);
//Update the VMA count
mm->map_count--;
tail_vma = vma;
vma = vma->vm_next;
} while (vma && vma->vm_start < end);
//Splice the detached VMAs out of the list
*insertion_point = vma;
if (vma)
vma->vm_prev = prev;
//NULL-terminate the detached chain
tail_vma->vm_next = NULL;
if (mm->unmap_area == arch_unmap_area)
addr = prev ? prev->vm_end : mm->mmap_base;
else
addr = vma ? vma->vm_start : mm->mmap_base;
mm->unmap_area(mm, addr);
//The removals invalidated mmap_cache; clear it
mm->mmap_cache = NULL; /* Kill the cache. */
}
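The list surgery above is easy to get wrong, so here is a minimal user-space sketch (hypothetical node type, not kernel code) of the same pattern: walk to the last node inside the range, splice the run out through the insertion point, and NULL-terminate the detached chain.

#include <stdio.h>

struct node { unsigned long start; struct node *next; };

/* Unlink every node with start < end from the list reached through
 * *insertion_point, and return the detached chain (NULL-terminated). */
static struct node *detach(struct node **insertion_point, unsigned long end)
{
        struct node *vma = *insertion_point, *detached = vma, *tail = NULL;

        while (vma && vma->start < end) {
                tail = vma;             /* remember the last detached node */
                vma = vma->next;
        }
        *insertion_point = vma;         /* list now bypasses the detached run */
        if (tail)
                tail->next = NULL;      /* terminate the detached chain */
        return tail ? detached : NULL;
}

int main(void)
{
        struct node c = { 0x3000, NULL }, b = { 0x2000, &c }, a = { 0x1000, &b };
        struct node *head = &a;
        struct node *gone = detach(&head, 0x3000);      /* detaches a and b */

        for (struct node *n = gone; n; n = n->next)
                printf("detached %#lx\n", n->start);
        printf("list head now %#lx\n", head ? head->start : 0UL);
        return 0;
}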
4.1.1.3 remove_vma_list
The next function to analyze is remove_vma_list().
It disposes of the chain of detached VMAs. The code is shown below:
//Parameters:
//mm: the process's memory descriptor
//vma: the head node of the chain to delete
/*
* Ok - we have the memory areas we should free on the vma list,
* so release them, and do the vma updates.
*
* Called with the mm semaphore held.
*/
static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
{
/* Update high watermark before we lower total_vm */
//update_hiwater_vm() records the peak before total_vm is lowered
update_hiwater_vm(mm);
do {
long nrpages = vma_pages(vma);
mm->total_vm -= nrpages;
vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
vma = remove_vma(vma);
} while (vma);
validate_mm(mm);
}
4.1.1.3.1 update_hiwater_vm
update_hiwater_vm() is defined in mm.h as:
static inline void update_hiwater_vm(struct mm_struct *mm)
{
if (mm->hiwater_vm < mm->total_vm)
mm->hiwater_vm = mm->total_vm;
}
4.1.1.3.2 vma_pages
vma_pages() is a small helper that returns the number of pages a VMA spans:
static inline unsigned long vma_pages(struct vm_area_struct *vma)
{
return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
}
4.1.1.4 unmap_region
unmap_region() is the core of the whole shrink process: it updates the affected page-table entries and frees the mapped page frames.
The code is as follows:
/*
* Get rid of page table information in the indicated region.
*
* Called with the mm semaphore held.
*/
static void unmap_region(struct mm_struct *mm,
struct vm_area_struct *vma, struct vm_area_struct *prev,
unsigned long start, unsigned long end)
{
struct vm_area_struct *next = prev? prev->vm_next: mm->mmap;
struct mmu_gather *tlb;
unsigned long nr_accounted = 0;
lru_add_drain();
tlb = tlb_gather_mmu(mm, 0);
update_hiwater_rss(mm);
//Tear down the actual VMA mappings
unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
vm_unacct_memory(nr_accounted);
//Removing mappings can leave whole page tables empty; reclaim the space those entries occupy
free_pgtables(tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
next? next->vm_start: 0);
tlb_finish_mmu(tlb, start, end);
}
4.1.1.4.1 unmap_vmas
unmap_vmas() releases the pages mapped by the PTEs. The code is as follows:
//Parameters:
//mm: process descriptor; vma: the first VMA to delete; start_addr: start address of the range to delete
//end_addr: end address of the range; details: NULL on this call path
/**
* unmap_vmas - unmap a range of memory covered by a list of vma's
* @tlbp: address of the caller's struct mmu_gather
* @vma: the starting vma
* @start_addr: virtual address at which to start unmapping
* @end_addr: virtual address at which to end unmapping
* @nr_accounted: Place number of unmapped pages in vm-accountable vma's here
* @details: details of nonlinear truncation or shared cache invalidation
*
* Returns the end address of the unmapping (restart addr if interrupted).
*
* Unmap all pages in the vma list.
*
* We aim to not hold locks for too long (for scheduling latency reasons).
* So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to
* return the ending mmu_gather to the caller.
*
* Only addresses between `start' and `end' will be unmapped.
*
* The VMA list must be sorted in ascending virtual address order.
*
* unmap_vmas() assumes that the caller will flush the whole unmapped address
* range after unmap_vmas() returns. So the only responsibility here is to
* ensure that any thus-far unmapped pages are flushed before unmap_vmas()
* drops the lock and schedules.
*/
unsigned long unmap_vmas(struct mmu_gather **tlbp,
struct vm_area_struct *vma, unsigned long start_addr,
unsigned long end_addr, unsigned long *nr_accounted,
struct zap_details *details)
{
long zap_work = ZAP_BLOCK_SIZE;
unsigned long tlb_start = 0; /* For tlb_finish_mmu */
int tlb_start_valid = 0;
unsigned long start = start_addr;
spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
int fullmm = (*tlbp)->fullmm;
struct mm_struct *mm = vma->vm_mm;
mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
//Walk the list of VMAs to be deleted
for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
unsigned long end;
//Clamp the unmap range to this VMA's start and end
start = max(vma->vm_start, start_addr);
if (start >= vma->vm_end)
continue;
end = min(vma->vm_end, end_addr);
if (end <= vma->vm_start)
continue;
if (vma->vm_flags & VM_ACCOUNT)
*nr_accounted += (end - start) >> PAGE_SHIFT;
if (unlikely(is_pfn_mapping(vma)))
untrack_pfn_vma(vma, 0, 0);
//The while loop unmaps every page frame mapped in [start, end), doing at most ZAP_BLOCK_SIZE of work per pass
while (start != end) {
if (!tlb_start_valid) {
tlb_start = start;
tlb_start_valid = 1;
}
//Without hugetlb support compiled in, is_vm_hugetlb_page() is always false
if (unlikely(is_vm_hugetlb_page(vma))) {
/*
* It is undesirable to test vma->vm_file as it
* should be non-null for valid hugetlb area.
* However, vm_file will be NULL in the error
* cleanup path of do_mmap_pgoff. When
* hugetlbfs ->mmap method fails,
* do_mmap_pgoff() nullifies vma->vm_file
* before calling this function to clean up.
* Since no pte has actually been setup, it is
* safe to do nothing in this case.
*/
if (vma->vm_file) {
unmap_hugepage_range(vma, start, end, NULL);
zap_work -= (end - start) /
pages_per_huge_page(hstate_vma(vma));
}
start = end;
} else
start = unmap_page_range(*tlbp, vma,
start, end, &zap_work, details);
if (zap_work > 0) {
BUG_ON(start != end);
break;
}
tlb_finish_mmu(*tlbp, tlb_start, start);
if (need_resched() ||
(i_mmap_lock && spin_needbreak(i_mmap_lock))) {
if (i_mmap_lock) {
*tlbp = NULL;
goto out;
}
cond_resched();
}
*tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
tlb_start_valid = 0;
zap_work = ZAP_BLOCK_SIZE;
}
}
out:
mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
return start; /* which is now the end (or restart) address */
}
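The ZAP_BLOCK_SIZE bookkeeping implements a general latency-bounding pattern: do at most a fixed budget of work, then give the scheduler (and any contended lock) a chance before continuing. A stripped-down user-space sketch of that pattern (sched_yield() standing in for cond_resched(); the constant is my own):

#include <sched.h>
#include <stdio.h>

#define ZAP_BUDGET 1024                 /* stand-in for ZAP_BLOCK_SIZE */

static void process_one(long *remaining) { (*remaining)--; }

int main(void)
{
        long items = 5000;

        while (items > 0) {
                long budget = ZAP_BUDGET;
                while (items > 0 && budget > 0) {
                        process_one(&items);
                        budget--;
                }
                sched_yield();          /* analogous to cond_resched() between batches */
        }
        printf("done\n");
        return 0;
}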
4.1.1.4.1.1 unmap_page_range
Stepping into unmap_page_range():
static unsigned long unmap_page_range(struct mmu_gather *tlb,
struct vm_area_struct *vma,
unsigned long addr, unsigned long end,
long *zap_work, struct zap_details *details)
{
pgd_t *pgd;
unsigned long next;
if (details && !details->check_mapping && !details->nonlinear_vma)
details = NULL;
BUG_ON(addr >= end);
mem_cgroup_uncharge_start();
tlb_start_vma(tlb, vma);
//Get the page global directory entry
pgd = pgd_offset(vma->vm_mm, addr);
//Descend from each pgd entry and clear the mappings below it
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd)) {
(*zap_work)--;
continue;
}
next = zap_pud_range(tlb, vma, pgd, addr, next,
zap_work, details);
} while (pgd++, addr = next, (addr != end && *zap_work > 0));
tlb_end_vma(tlb, vma);
mem_cgroup_uncharge_end();
return addr;
}
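The do { } while loop above is driven by pgd_addr_end(), and the same macro shape recurs at every level below. This sketch (table size assumed; the `boundary - 1 < end - 1` trick also handles an end that wraps to 0) shows how a range gets carved into table-aligned chunks:

#include <stdio.h>

#define TBL_SHIFT 21UL                  /* assume one table maps 2 MiB */
#define TBL_SIZE  (1UL << TBL_SHIFT)
#define TBL_MASK  (~(TBL_SIZE - 1))

static unsigned long addr_end(unsigned long addr, unsigned long end)
{
        unsigned long boundary = (addr + TBL_SIZE) & TBL_MASK;
        return boundary - 1 < end - 1 ? boundary : end;
}

int main(void)
{
        unsigned long addr = 0x1ff000, end = 0x601000, next;

        do {
                next = addr_end(addr, end);     /* stop at the next table boundary */
                printf("chunk [%#lx, %#lx)\n", addr, next);
        } while (addr = next, addr != end);
        return 0;
}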
4.1.1.4.1.1.1 zap_pud_range
Stepping into zap_pud_range():
static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pgd_t *pgd,
unsigned long addr, unsigned long end,
long *zap_work, struct zap_details *details)
{
pud_t *pud;
unsigned long next;
pud = pud_offset(pgd, addr);
do {
next = pud_addr_end(addr, end);
if (pud_none_or_clear_bad(pud)) {
(*zap_work)--;
continue;
}
next = zap_pmd_range(tlb, vma, pud, addr, next,
zap_work, details);
} while (pud++, addr = next, (addr != end && *zap_work > 0));
return addr;
}
4.1.1.4.1.1.1.1 zap_pmd_range
On to zap_pmd_range():
static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pud_t *pud,
unsigned long addr, unsigned long end,
long *zap_work, struct zap_details *details)
{
pmd_t *pmd;
unsigned long next;
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
if (pmd_none_or_clear_bad(pmd)) {
(*zap_work)--;
continue;
}
next = zap_pte_range(tlb, vma, pmd, addr, next,
zap_work, details);
} while (pmd++, addr = next, (addr != end && *zap_work > 0));
return addr;
}
4.1.1.4.1.1.1.1.1 zap_pte_range
And finally zap_pte_range():
static unsigned long zap_pte_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end,
long *zap_work, struct zap_details *details)
{
struct mm_struct *mm = tlb->mm;
pte_t *pte;
spinlock_t *ptl;
int rss[NR_MM_COUNTERS];
init_rss_vec(rss);
pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
arch_enter_lazy_mmu_mode();
do {
pte_t ptent = *pte;
//The pte maps no page
if (pte_none(ptent)) {
(*zap_work)--;
continue;
}
(*zap_work) -= PAGE_SIZE;
//The corresponding page is present in main memory
if (pte_present(ptent)) {
struct page *page;
page = vm_normal_page(vma, addr, ptent);
if (unlikely(details) && page) {
/*
* unmap_shared_mapping_pages() wants to
* invalidate cache without truncating:
* unmap shared but keep private pages.
*/
if (details->check_mapping &&
details->check_mapping != page->mapping)
continue;
/*
* Each page->index must be checked when
* invalidating or truncating nonlinear.
*/
if (details->nonlinear_vma &&
(page->index < details->first_index ||
page->index > details->last_index))
continue;
}
ptent = ptep_get_and_clear_full(mm, addr, pte,
tlb->fullmm);
tlb_remove_tlb_entry(tlb, pte, addr);
if (unlikely(!page))
continue;
if (unlikely(details) && details->nonlinear_vma
&& linear_page_index(details->nonlinear_vma,
addr) != page->index)
set_pte_at(mm, addr, pte,
pgoff_to_pte(page->index));
if (PageAnon(page))
rss[MM_ANONPAGES]--;
else {
if (pte_dirty(ptent))
set_page_dirty(page);
if (pte_young(ptent) &&
likely(!VM_SequentialReadHint(vma)))
mark_page_accessed(page);
rss[MM_FILEPAGES]--;
}
page_remove_rmap(page);
if (unlikely(page_mapcount(page) < 0))
print_bad_pte(vma, addr, ptent, page);
tlb_remove_page(tlb, page);
continue;
}
/*
* If details->check_mapping, we leave swap entries;
* if details->nonlinear_vma, we leave file entries.
*/
if (unlikely(details))
continue;
if (pte_file(ptent)) {
if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
print_bad_pte(vma, addr, ptent, NULL);
} else {
swp_entry_t entry = pte_to_swp_entry(ptent);
if (!non_swap_entry(entry))
rss[MM_SWAPENTS]--;
if (unlikely(!free_swap_and_cache(entry)))
print_bad_pte(vma, addr, ptent, NULL);
}
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
} while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
add_mm_rss_vec(mm, rss);
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(pte - 1, ptl);
return addr;
}
4.1.1.4.2 free_pgtables
The analysis above shows how, starting from a linear address, the kernel walks from the pgd down to the pte and frees the mapped pages. Note that up to this point only the page frames mapped by the PTEs have been freed, which can leave many page tables with nothing mapped at all; the space those tables occupy can itself be reclaimed. That is done in free_pgtables(). The code is as follows:
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
unsigned long floor, unsigned long ceiling)
{
while (vma) {
struct vm_area_struct *next = vma->vm_next;
unsigned long addr = vma->vm_start; // addr gets the start address of the region
/*
* Hide vma from rmap and truncate_pagecache before freeing
* pgtables
*/
unlink_anon_vmas(vma);
unlink_file_vma(vma);
if (is_vm_hugetlb_page(vma)) {
hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
floor, next? next->vm_start: ceiling);
} else {
/*
* Optimization: gather nearby vmas into one call down
*/
while (next && next->vm_start <= vma->vm_end + PMD_SIZE
&& !is_vm_hugetlb_page(next)) {
vma = next;
next = vma->vm_next;
unlink_anon_vmas(vma);
unlink_file_vma(vma);
}
free_pgd_range(tlb, addr, vma->vm_end,
floor, next? next->vm_start: ceiling);
}
vma = next;
}
}
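A rough calculation shows why reclaiming empty page tables is worthwhile (assuming x86-64-style 4 KiB pages with 8-byte entries; the article does not state an architecture): each page-table page pins 4 KiB of kernel memory while covering up to 2 MiB of user address space, so a large munmap can strand many of them.

#include <stdio.h>

int main(void)
{
        unsigned long page_size = 4096;                 /* assumed page size */
        unsigned long entries = page_size / 8;          /* 8-byte PTEs: 512 */
        unsigned long mapped = entries * page_size;     /* bytes one table covers */

        printf("one %lu-byte PTE page maps %lu MiB\n",
               page_size, mapped >> 20);                /* prints 2 MiB */
        return 0;
}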
What does prev point to?
When this function is called, which region's VMA does prev point to?
At the start: (figure from the original post, not preserved)
After detach_vmas_to_be_unmapped: (figure from the original post, not preserved)
From the figures one can see that in free_pgtables() the linear addresses operated on are exactly the hole between prev and prev->vm_next.