/*
 * this is really a simplified "do_mmap". it only handles
 * anonymous maps. eventually we may be able to do some
 * brk-specific accounting here.
 */
static int do_brk_flags(unsigned long addr, unsigned long len,
                        unsigned long flags, struct list_head *uf)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma, *prev;
        struct rb_node **rb_link, *rb_parent;
        ......
        /*
         * Clear old maps. this also does some error checking for us
         */
        while (find_vma_links(mm, addr, addr + len, &prev, &rb_link,
                              &rb_parent)) {
                if (do_munmap(mm, addr, len, uf))
                        return -ENOMEM;
        }
        ......
        /* Can we just expand an old private anonymous mapping? */
        vma = vma_merge(mm, prev, addr, addr + len, flags,
                        NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX);
        if (vma)
                goto out;

        /*
         * create a vma struct for an anonymous mapping
         */
        vma = vm_area_alloc(mm);
        if (!vma) {
                vm_unacct_memory(len >> PAGE_SHIFT);
                return -ENOMEM;
        }

        vma_set_anonymous(vma);
        vma->vm_start = addr;
        vma->vm_end = addr + len;
        vma->vm_pgoff = pgoff;
        vma->vm_flags = flags;
        vma->vm_page_prot = vm_get_page_prot(flags);
        vma_link(mm, vma, prev, rb_link, rb_parent);
out:
        perf_event_mmap(vma);
        mm->total_vm += len >> PAGE_SHIFT;
        mm->data_vm += len >> PAGE_SHIFT;
        if (flags & VM_LOCKED)
                mm->locked_vm += (len >> PAGE_SHIFT);
        vma->vm_flags |= VM_SOFTDIRTY;
        return 0;
}
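For context, this is the path that ultimately services a heap grown via brk(2). A minimal userspace sketch of my own (not from the kernel tree) that exercises it:

/* Hypothetical illustration: grow the heap with sbrk() and touch the new
 * bytes; the kernel side of this request is the do_brk_flags() path above. */
#define _DEFAULT_SOURCE
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        void *old_brk = sbrk(0);          /* current program break */
        if (sbrk(4096) == (void *)-1) {   /* ask the kernel for one more page */
                perror("sbrk");
                return 1;
        }
        memset(old_brk, 0xab, 4096);      /* first touch faults the page in */
        printf("heap grew from %p\n", old_brk);
        return 0;
}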
unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
                              unsigned long prot, unsigned long flags,
                              unsigned long fd, unsigned long pgoff)
{
        struct file *file = NULL;
        unsigned long retval;

        if (!(flags & MAP_ANONYMOUS)) {
                audit_mmap_fd(fd, flags);
                file = fget(fd);
                if (!file)
                        return -EBADF;
                if (is_file_hugepages(file))
                        len = ALIGN(len, huge_page_size(hstate_file(file)));
                retval = -EINVAL;
                if (unlikely(flags & MAP_HUGETLB && !is_file_hugepages(file)))
                        goto out_fput;
        } else if (flags & MAP_HUGETLB) {
                struct user_struct *user = NULL;
                struct hstate *hs;

                hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
                if (!hs)
                        return -EINVAL;

                len = ALIGN(len, huge_page_size(hs));
                /*
                 * VM_NORESERVE is used because the reservations will be
                 * taken when vm_ops->mmap() is called
                 * A dummy user value is used because we are not locking
                 * memory so no accounting is necessary
                 */
                file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
                                VM_NORESERVE,
                                &user, HUGETLB_ANONHUGE_INODE,
                                (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
                if (IS_ERR(file))
                        return PTR_ERR(file);
        }

        flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);

        retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
out_fput:
        if (file)
                fput(file);
        return retval;
}
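The two branches above correspond directly to how mmap(2) is called from userspace. A small sketch of my own showing both cases (illustration only, not part of the code being discussed):

/* The MAP_ANONYMOUS call takes the anonymous branch of ksys_mmap_pgoff();
 * the fd-backed call takes the fget() branch. */
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        /* Anonymous, private mapping: no file, contents start zero-filled. */
        char *anon = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        /* File-backed mapping: the fd is looked up with fget() in the kernel. */
        int fd = open("/etc/hostname", O_RDONLY);
        char *filemap = mmap(NULL, 4096, PROT_READ, MAP_PRIVATE, fd, 0);

        if (anon == MAP_FAILED || filemap == MAP_FAILED) {
                perror("mmap");
                return 1;
        }
        anon[0] = 'x';
        printf("first byte of file: %c\n", filemap[0]);

        munmap(anon, 4096);
        munmap(filemap, 4096);
        close(fd);
        return 0;
}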
/*
 * This routine handles page faults. It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 */
asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
                                        unsigned long error_code,
                                        unsigned long address)
{
        ......
        /*
         * We fault-in kernel-space virtual memory on-demand. The
         * 'reference' page table is init_mm.pgd.
         *
         * NOTE! We MUST NOT take any locks for this case. We may
         * be in an interrupt or a critical region, and should
         * only copy the information from the master page table,
         * nothing more.
         */
        if (unlikely(fault_in_kernel_space(address))) {
                if (vmalloc_fault(address) >= 0)
                        return;
                if (notify_page_fault(regs, vec))
                        return;

                bad_area_nosemaphore(regs, error_code, address);
                return;
        }
        ......
        vma = find_vma(mm, address);
        ......
        /*
         * If for any reason at all we couldn't handle the fault,
         * make sure we exit gracefully rather than endlessly redo
         * the fault.
         */
        fault = handle_mm_fault(vma, address, flags);
        ......
}
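One way to watch this path at work from userspace is to count minor faults while touching freshly mapped pages. The sketch below is my own illustration (it assumes 4 KiB pages), not part of the fault handler itself:

/* Each first touch of an anonymous page goes through
 * do_page_fault() -> handle_mm_fault() as shown above. */
#include <stdio.h>
#include <sys/mman.h>
#include <sys/resource.h>

static long minor_faults(void)
{
        struct rusage ru;
        getrusage(RUSAGE_SELF, &ru);
        return ru.ru_minflt;
}

int main(void)
{
        size_t len = 64 * 4096;
        char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED)
                return 1;

        long before = minor_faults();
        for (size_t i = 0; i < len; i += 4096)
                p[i] = 1;                 /* first write faults each page in */
        printf("minor faults taken: %ld\n", minor_faults() - before);

        munmap(p, len);
        return 0;
}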
/*
 * These routines also need to handle stuff like marking pages dirty
 * and/or accessed for architectures that don't do it in hardware (most
 * RISC architectures). The early dirtying is also good on the i386.
 *
 * There is also a hook called "update_mmu_cache()" that architectures
 * with external mmu caches can use to update those (ie the Sparc or
 * PowerPC hashed page tables that act as extended TLBs).
 *
 * We enter with non-exclusive mmap_sem (to exclude vma changes, but allow
 * concurrent faults).
 *
 * The mmap_sem may have been released depending on flags and our return value.
 * See filemap_fault() and __lock_page_or_retry().
 */
static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
{
        pte_t entry;
        ......
        /*
         * A regular pmd is established and it can't morph into a huge
         * pmd from under us anymore at this point because we hold the
         * mmap_sem read mode and khugepaged takes it in write mode.
         * So now it's safe to run pte_offset_map().
         */
        vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
        vmf->orig_pte = *vmf->pte;
        ......
        if (!vmf->pte) {
                if (vma_is_anonymous(vmf->vma))
                        return do_anonymous_page(vmf);
                else
                        return do_fault(vmf);
        }

        if (!pte_present(vmf->orig_pte))
                return do_swap_page(vmf);
        ......
}
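The do_swap_page() case can be provoked deliberately on newer kernels. The following is purely my own sketch: it assumes MADV_PAGEOUT (Linux 5.4+, newer than some of the code quoted here), and whether the page is actually reclaimed depends on the kernel and on swap being available:

/* Write to an anonymous page (do_anonymous_page), hint the kernel to page
 * it out, then read it back; if it was reclaimed, the read is resolved
 * through do_swap_page() in the dispatch above. */
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
        char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED)
                return 1;

        p[0] = 42;                        /* first write: do_anonymous_page() */
#ifdef MADV_PAGEOUT
        madvise(p, 4096, MADV_PAGEOUT);   /* hint: reclaim this page */
#endif
        printf("%d\n", p[0]);             /* may come back via do_swap_page() */

        munmap(p, 4096);
        return 0;
}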
/*
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with mmap_sem still held, but pte unmapped and unlocked.
 */
static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        struct mem_cgroup *memcg;
        struct page *page;
        vm_fault_t ret = 0;
        pte_t entry;
        ......
        /*
         * Use pte_alloc() instead of pte_alloc_map(). We can't run
         * pte_offset_map() on pmds where a huge pmd might be created
         * from a different thread.
         *
         * pte_alloc_map() is safe to use under down_write(mmap_sem) or when
         * parallel threads are excluded by other means.
         *
         * Here we only have down_read(mmap_sem).
         */
        if (pte_alloc(vma->vm_mm, vmf->pmd))
                return VM_FAULT_OOM;
        ......
        page = alloc_zeroed_user_highpage_movable(vma, vmf->address);
        ......
        entry = mk_pte(page, vma->vm_page_prot);
        if (vma->vm_flags & VM_WRITE)
                entry = pte_mkwrite(pte_mkdirty(entry));

        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
                        &vmf->ptl);
        ......
        set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
        ......
}

#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \
        alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
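That __GFP_ZERO allocation is why freshly faulted anonymous memory always reads back as zeroes in userspace. A small sketch of my own (assuming 4 KiB pages) that checks this:

/* Anonymous memory is handed out zero-filled, matching the
 * alloc_zeroed_user_highpage_movable() call above. */
#include <assert.h>
#include <sys/mman.h>

int main(void)
{
        char *p = mmap(NULL, 2 * 4096, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED)
                return 1;

        p[0] = 'x';                /* write fault: new zeroed page for page 0 */
        assert(p[1] == 0);         /* rest of that page is still zero */
        assert(p[4096] == 0);      /* untouched page also reads as zero */

        munmap(p, 2 * 4096);
        return 0;
}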
static noinline int vmalloc_fault(unsigned long address)
{
        unsigned long pgd_paddr;
        pmd_t *pmd_k;
        pte_t *pte_k;

        /* Make sure we are in vmalloc area: */
        if (!(address >= VMALLOC_START && address < VMALLOC_END))
                return -1;

        /*
         * Synchronize this task's top level page-table
         * with the 'reference' page table.
         *
         * Do _not_ use "current" here. We might be inside
         * an interrupt in the middle of a task switch..
         */
        pgd_paddr = read_cr3_pa();
        pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
        if (!pmd_k)
                return -1;

        pte_k = pte_offset_kernel(pmd_k, address);
        if (!pte_present(*pte_k))
                return -1;

        return 0;
}
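vmalloc_fault() exists because vmalloc mappings are created in init_mm's reference page table and copied into each task's page tables lazily, on the first faulting access. A minimal, hypothetical module sketch of my own that allocates such a buffer (illustration only):

/* The buffer lives in the vmalloc area (VMALLOC_START..VMALLOC_END); the
 * first access from a context whose top-level page table has not yet seen
 * the mapping is repaired by the vmalloc_fault() path above. */
#include <linux/init.h>
#include <linux/module.h>
#include <linux/string.h>
#include <linux/vmalloc.h>

static void *buf;

static int __init vmalloc_demo_init(void)
{
        buf = vmalloc(1 << 20);           /* 1 MiB, virtually contiguous */
        if (!buf)
                return -ENOMEM;
        memset(buf, 0, 1 << 20);          /* touch it so the mapping is used */
        pr_info("vmalloc_demo: buffer at %p\n", buf);
        return 0;
}

static void __exit vmalloc_demo_exit(void)
{
        vfree(buf);
}

module_init(vmalloc_demo_init);
module_exit(vmalloc_demo_exit);
MODULE_LICENSE("GPL");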