Linux内核攻击:Punchhole(2025Backdoorskernel复现)

admin 2026-01-30 18:17:48 网络安全文章 来源:ZONE.CI 全球网 0 阅读模式

文章总结: 本文介绍在禁用userfault/Fuse环境下,利用LinuxfallocatePunchHole特性阻塞copy_from_user的攻击方法。通过shmem文件制造空洞触发缺页中断阻塞线程,为UAF/DoubleFree制造竞争窗口。文章详析了fallocate与缺页处理内核源码,并分享了跨页布局实现任意offset读写的技巧,对CTF复现及内核安全研究具有重要参考价值。 综合评分: 90 文章分类: 二进制安全,CTF,漏洞分析


cover_image

Linux 内核攻击:Punch hole (2025 Backdoor skernel 复现)

Elenia Elenia

看雪学苑

2026年1月29日 18:00 上海

打 Backdoor 的 skernel 时,在 Tplus 大佬那里学习了一个在不能使用 userfault 和 Fuse 的情况下完成对地址访问的阻塞的攻击方法,个人觉得非常好用。限于篇幅,本篇文章主要聚焦于 punch hole 的原理。(当然我也不能确定 punch hole 是预期打法)

01

环境说明

  • 本文中关于 Linux 源码的分析均是基于: Linux 6.12.32 版本
  • 题目内核版本: Linux 6.18.0

02

题目

00000000 struct __fixed struct_3 // sizeof=0x18
00000000 {                                       // XREF: module_ioctl/r
00000000     int index;                          // XREF: module_ioctl+25/w
00000000                                         // module_ioctl+60/r ...
00000004     int field_4;
00000008     __int64 length;                     // XREF: module_ioctl+2E/w
00000008                                         // module_ioctl+6B/r ...
00000010     __int64 buf;                        // XREF: module_ioctl+37/w
00000010                                         // module_ioctl+169/r ...
00000018 };
/*
 * Hex-Rays pseudocode of the challenge module's ioctl handler.
 *
 * a2 is the ioctl command, a3 the user pointer to a 0x18-byte request
 * (struct_3: index, length, buf).  The request is validated while holding
 * skernel_mutex (index must be 0..7, length must be 1..64), but the mutex
 * is dropped BEFORE the command is dispatched, so the three commands below
 * run without any locking.
 *
 *   322376503 (0x13371337): allocate an object into allocated_objects[index]
 *   322376504 (0x13371338): free allocated_objects[index]
 *   322376505 (0x13371339): copy `length` user bytes into the object
 *
 * Returns 0 on success, -14 if the request cannot be copied in, -22 on a
 * bad index/length, -1 on a slot-state mismatch or failed write; unknown
 * commands fall through to module_ioctl_cold().
 */
__int64 __fastcall module_ioctl(__int64 a1, int a2, __int64 a3)
{
  __int64 v4; // rbx
  __int64 v5; // rax
  __int64 v7; // rdx
  __int64 v8; // rdi
  struct_3 v9; // [rsp+8h] [rbp-70h] BYREF
  _BYTE v10[64]; // [rsp+20h] [rbp-58h] BYREF
  unsigned __int64 v11; // [rsp+60h] [rbp-18h]

  v11 = __readgsqword((unsigned int)&_ref_stack_chk_guard);
  memset(&v9, 0, sizeof(v9));
  mutex_lock(&skernel_mutex);
  if ( copy_from_user(&v9, a3, 24) )
  {
    v4 = -14;
    goto LABEL_10;
  }
  if ( v9.index > 7u )
  {
    v4 = -22;
    goto LABEL_10;
  }
  v4 = -22;
  // Unsigned compare of (length - 1) rejects both 0 and anything above 64.
  if ( (unsigned __int64)(v9.length - 1) > 0x3F )
  {
LABEL_10:
    mutex_unlock(&skernel_mutex);
    return v4;
  }
  // The lock only covers validation; everything below is unlocked.
  mutex_unlock(&skernel_mutex);
  switch ( a2 )
  {
    case 322376504:
      memset(v10, 0, sizeof(v10));
      v7 = allocated_objects[v9.index];
      if ( !v7 )
        return -1;
      v4 = 0;
      kfree(v7);
      // Race window: the object is already freed, yet the slot still holds
      // the stale pointer until this user copy returns (its result is
      // ignored).  Stalling the copy on a page fault keeps the window open.
      copy_from_user(v10, v9.buf, 64);
      allocated_objects[v9.index] = 0;
      return v4;
    case 322376505:
      v8 = allocated_objects[v9.index];
      if ( !v8 )
        return -1;
      if ( v9.length > 0x7FFFFFFFuLL )
        BUG();
      // Writes user data straight into whatever the slot points at.
      return -(copy_from_user(v8, v9.buf, v9.length) != 0);
    case 322376503:
      if ( allocated_objects[v9.index] )
        return -1;
      v4 = 0;
      // NOTE(review): 3520 == 0xDC0, which looks like GFP_KERNEL | __GFP_ZERO,
      // and the last argument (64) looks like the object size -- confirm
      // against the target kernel's headers.
      v5 = _kmalloc_cache_noprof(kmalloc_caches[6], 3520, 64);
      allocated_objects[v9.index] = v5;
      return v4;
  }
  return module_ioctl_cold();
}

可以看见,free 之后虽然对数组项进行了置空,但是在置空之前还有一次 copy_from_user(v10, v9.buf, 64);所以我们可以通过类似于 userfault 的方式拉长这次 copy_from_user 的耗时,从而让已经 free 的 obj 指针残留在数组中。在这期间我们可以通过 write 实现 UAF,或者通过再次 free 实现 double free。

03

Punch hole

这题目环境并不支持 userfault 和 fuse。所以我们可以用 https://starlabs.sg/blog/2023/07-a-new-method-for-container-escape-using-file-based-dirtycred/ 这篇文章中提及的 punch hole 的方式,替代这两种方式,实现条件竞争。当然 punch hole 也不能完美替代这两种方案,它本质上只是延长了访问的时间,而不能像前两者那样任意控制阻塞的时长。

攻击方式其实就是用 fallocate 把 buf 地址对应的文件页丢弃(打洞),制造内存空洞,从而让 copy_from_user 访问这个地址的时候发生缺页中断,再去处理对应内容。由于 fallocate 在打洞期间会通过 inode->i_private 标记正在打洞的范围,落在该范围内的缺页处理会进入等待队列,直到打洞完成才被唤醒,所以后续读写会等待打洞完成。(后文主要分析 fallocate 打洞之后,访问这段内存为什么会被阻塞)

04

核心数据结构

/*
 * Describes a fallocate operation in flight on a shmem inode.  During a
 * hole punch a pointer to this structure is stored in inode->i_private so
 * that page faults can tell that the range is being punched.
 */
struct shmem_falloc {
	wait_queue_head_t *waitq;      // faults into the hole wait here until the punch ends
	pgoff_t start;                 // first page index of the range being fallocated/punched
	pgoff_t next;                  // page index just past the range (next page to handle)
	pgoff_t nr_falloced;           // how many new pages have been fallocated (plain fallocate)
	pgoff_t nr_unswapped;          // how often writepage refused to swap out (plain fallocate)
};

#

05

fallocate

fallocate() 用于预分配或释放文件空间。

  • Mode: FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE (后文主要以这个调用链为主)

  • FALLOC_FL_PUNCH_HOLE:在文件中打洞,释放指定范围的磁盘块

  • FALLOC_FL_KEEP_SIZE:保持文件大小不变(不改变 inode->i_size)

/* fallocate(2) system call entry: forwards its four arguments unchanged to ksys_fallocate(). */
SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len)
{
	return ksys_fallocate(fd, mode, offset, len);
}

流程

  1. 设置阶段:创建等待队列,设置 inode->i_private 标记 (这样后续只需要检查i_private了)
  2. 取消映射:unmap_mapping_range() 取消所有 VMA 中的映射
  3. 截断缓存:shmem_truncate_range() → shmem_undo_range() 删除页面缓存和交换条目
  4. 清理阶段:清除标记,唤醒等待的 page fault 线程

vfs_fallocate

  • 模式验证:FALLOC_FL_PUNCH_HOLE 必须与 FALLOC_FL_KEEP_SIZE 一起使用
  • 权限检查:需要写权限(FMODE_WRITE)
  • 文件类型:必须是普通文件或块设备
  • 最终调用:file->f_op->fallocate(file, mode, offset, len) 会有几种情况。
/*
 * vfs_fallocate - filesystem-independent part of fallocate(2).
 * @file:   open file to operate on (must be opened for writing)
 * @mode:   FALLOC_FL_* mode bits
 * @offset: start of the byte range, must be >= 0
 * @len:    length of the byte range, must be > 0
 *
 * Validates the mode combination, permissions, file type and range, then
 * hands the request to the filesystem's ->fallocate() method.
 *
 * Returns 0 on success or a negative errno.
 */
int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
{
	struct inode *inode = file_inode(file);
	long ret;
	loff_t sum;

	if (offset < 0 || len <= 0)
		return -EINVAL;

	if (mode & ~(FALLOC_FL_MODE_MASK | FALLOC_FL_KEEP_SIZE))
		return -EOPNOTSUPP;

	/*
	 * Modes are exclusive, even if that is not obvious from the encoding
	 * as bit masks and the mix with the flag in the same namespace.
	 *
	 * To make things even more complicated, FALLOC_FL_ALLOCATE_RANGE is
	 * encoded as no bit set.
	 */
	switch (mode & FALLOC_FL_MODE_MASK) {
	case FALLOC_FL_ALLOCATE_RANGE:
	case FALLOC_FL_UNSHARE_RANGE:
	case FALLOC_FL_ZERO_RANGE:
		break;
	case FALLOC_FL_PUNCH_HOLE:
		/* Punching a hole is only allowed together with KEEP_SIZE. */
		if (!(mode & FALLOC_FL_KEEP_SIZE))
			return -EOPNOTSUPP;
		break;
	case FALLOC_FL_COLLAPSE_RANGE:
	case FALLOC_FL_INSERT_RANGE:
		if (mode & FALLOC_FL_KEEP_SIZE)
			return -EOPNOTSUPP;
		break;
	default:
		return -EOPNOTSUPP;
	}

	if (!(file->f_mode & FMODE_WRITE))
		return -EBADF;

	/*
	 * On append-only files only space preallocation is supported.
	 */
	if ((mode & ~FALLOC_FL_KEEP_SIZE) && IS_APPEND(inode))
		return -EPERM;

	if (IS_IMMUTABLE(inode))
		return -EPERM;

	/*
	 * We cannot allow any fallocate operation on an active swapfile
	 */
	if (IS_SWAPFILE(inode))
		return -ETXTBSY;

	/*
	 * Revalidate the write permissions, in case security policy has
	 * changed since the files were opened.
	 */
	ret = security_file_permission(file, MAY_WRITE);
	if (ret)
		return ret;

	ret = fsnotify_file_area_perm(file, MAY_WRITE, &offset, len);
	if (ret)
		return ret;

	if (S_ISFIFO(inode->i_mode))
		return -ESPIPE;

	if (S_ISDIR(inode->i_mode))
		return -EISDIR;

	if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))
		return -ENODEV;

	/* Check for wraparound */
	if (check_add_overflow(offset, len, &sum))
		return -EFBIG;

	if (sum > inode->i_sb->s_maxbytes)
		return -EFBIG;

	if (!file->f_op->fallocate)
		return -EOPNOTSUPP;

	file_start_write(file);
	/* Finally dispatch to the filesystem-specific implementation. */
	ret = file->f_op->fallocate(file, mode, offset, len);

	/*
	 * Create inotify and fanotify events.
	 *
	 * To keep the logic simple always create events if fallocate succeeds.
	 * This implies that events are even created if the file size remains
	 * unchanged, e.g. when using flag FALLOC_FL_KEEP_SIZE.
	 */
	if (ret == 0)
		fsnotify_modify(file);

	file_end_write(file);
	return ret;
}
EXPORT_SYMBOL_GPL(vfs_fallocate);

ext4 文件 实现

也就是我们正常创建的文件.

ext4_fallocate (ext4文件系统实现)

具体以不同文件系统的 file->f_op->fallocate 实现为准,这里用 ext4 文件系统举例子。

  • 检查加密文件系统限制
  • 验证支持的模式
  • 最后调用 ext4_punch_hole
/*
&nbsp;* preallocate space for a file. This implements ext4's fallocate file
&nbsp;* operation, which gets called from sys_fallocate system call.
&nbsp;* For block-mapped files, posix_fallocate should fall back to the method
&nbsp;* of writing zeroes to the required new blocks (the same behavior which is
&nbsp;* expected for file systems which do not support fallocate() system call).
&nbsp;*/
longext4_fallocate(struct&nbsp;file *file,&nbsp;int&nbsp;mode,&nbsp;loff_t&nbsp;offset,&nbsp;loff_t&nbsp;len){
struct&nbsp;inode&nbsp;*inode =&nbsp;file_inode(file);
loff_t&nbsp;new_size =&nbsp;0;
unsignedint&nbsp;max_blocks;
int&nbsp;ret =&nbsp;0;
int&nbsp;flags;
ext4_lblk_t&nbsp;lblk;
unsignedint&nbsp;blkbits = inode->i_blkbits;

/*
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* Encrypted inodes can't handle collapse range or insert
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* range since we would need to re-encrypt blocks with a
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* different IV or XTS tweak (which are based on the logical
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* block number).
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;*/
if&nbsp;(IS_ENCRYPTED(inode) &&
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE)))
return&nbsp;-EOPNOTSUPP;

/* Return error if mode is not supported */
if&nbsp;(mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;FALLOC_FL_INSERT_RANGE))
return&nbsp;-EOPNOTSUPP;

inode_lock(inode);
&nbsp; &nbsp; &nbsp; &nbsp; ret =&nbsp;ext4_convert_inline_data(inode);
inode_unlock(inode);
if&nbsp;(ret)
goto&nbsp;exit;

if&nbsp;(mode & FALLOC_FL_PUNCH_HOLE) {
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; ret =&nbsp;ext4_punch_hole(file, offset, len);
goto&nbsp;exit;
&nbsp; &nbsp; &nbsp; &nbsp; }

if&nbsp;(mode & FALLOC_FL_COLLAPSE_RANGE) {
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; ret =&nbsp;ext4_collapse_range(file, offset, len);
goto&nbsp;exit;
&nbsp; &nbsp; &nbsp; &nbsp; }

if&nbsp;(mode & FALLOC_FL_INSERT_RANGE) {
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; ret =&nbsp;ext4_insert_range(file, offset, len);
goto&nbsp;exit;
&nbsp; &nbsp; &nbsp; &nbsp; }

if&nbsp;(mode & FALLOC_FL_ZERO_RANGE) {
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; ret =&nbsp;ext4_zero_range(file, offset, len, mode);
goto&nbsp;exit;
&nbsp; &nbsp; &nbsp; &nbsp; }
trace_ext4_fallocate_enter(inode, offset, len, mode);
&nbsp; &nbsp; &nbsp; &nbsp; lblk = offset >> blkbits;

&nbsp; &nbsp; &nbsp; &nbsp; max_blocks =&nbsp;EXT4_MAX_BLOCKS(len, offset, blkbits);
&nbsp; &nbsp; &nbsp; &nbsp; flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;

inode_lock(inode);

/*
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* We only support preallocation for extent-based files only
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;*/
if&nbsp;(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; ret = -EOPNOTSUPP;
goto&nbsp;out;
&nbsp; &nbsp; &nbsp; &nbsp; }

if&nbsp;(!(mode & FALLOC_FL_KEEP_SIZE) &&
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; (offset + len > inode->i_size ||
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;offset + len >&nbsp;EXT4_I(inode)->i_disksize)) {
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; new_size = offset + len;
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; ret =&nbsp;inode_newsize_ok(inode, new_size);
if&nbsp;(ret)
goto&nbsp;out;
&nbsp; &nbsp; &nbsp; &nbsp; }

/* Wait all existing dio workers, newcomers will block on i_rwsem */
inode_dio_wait(inode);

&nbsp; &nbsp; &nbsp; &nbsp; ret =&nbsp;file_modified(file);
if&nbsp;(ret)
goto&nbsp;out;

&nbsp; &nbsp; &nbsp; &nbsp; ret =&nbsp;ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags);
if&nbsp;(ret)
goto&nbsp;out;

if&nbsp;(file->f_flags & O_SYNC &&&nbsp;EXT4_SB(inode->i_sb)->s_journal) {
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; ret =&nbsp;ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal,
EXT4_I(inode)->i_sync_tid);
&nbsp; &nbsp; &nbsp; &nbsp; }
out:
inode_unlock(inode);
trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
exit:
return&nbsp;ret;
}

ext4_punch_hole(核心函数)

  • 边界检查

  • 如果 offset >= inode->i_size,直接返回

  • 如果超出文件大小,调整长度

  • 对齐处理

  • 将偏移量和长度对齐到块边界(round_up/down)

  • 清除页面缓存

intext4_punch_hole(struct&nbsp;file *file,&nbsp;loff_t&nbsp;offset,&nbsp;loff_t&nbsp;length){
struct&nbsp;inode&nbsp;*inode =&nbsp;file_inode(file);
struct&nbsp;super_block&nbsp;*sb = inode->i_sb;
ext4_lblk_t&nbsp;first_block, stop_block;
struct&nbsp;address_space&nbsp;*mapping = inode->i_mapping;
loff_t&nbsp;first_block_offset, last_block_offset, max_length;
struct&nbsp;ext4_sb_info&nbsp;*sbi =&nbsp;EXT4_SB(inode->i_sb);
handle_t&nbsp;*handle;
unsignedint&nbsp;credits;
int&nbsp;ret =&nbsp;0, ret2 =&nbsp;0;

trace_ext4_punch_hole(inode, offset, length,&nbsp;0);

inode_lock(inode);

/* No need to punch hole beyond i_size */
if&nbsp;(offset >= inode->i_size)
goto&nbsp;out_mutex;

/*
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* If the hole extends beyond i_size, set the hole
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* to end after the page that contains i_size
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;*/
if&nbsp;(offset + length > inode->i_size) {
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; length = inode->i_size +
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;PAGE_SIZE - (inode->i_size & (PAGE_SIZE -&nbsp;1)) -
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;offset;
&nbsp; &nbsp; &nbsp; &nbsp; }

/*
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* For punch hole the length + offset needs to be within one block
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* before last range. Adjust the length if it goes beyond that limit.
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;*/
&nbsp; &nbsp; &nbsp; &nbsp; max_length = sbi->s_bitmap_maxbytes - inode->i_sb->s_blocksize;
if&nbsp;(offset + length > max_length)
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; length = max_length - offset;

if&nbsp;(offset & (sb->s_blocksize -&nbsp;1) ||
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; (offset + length) & (sb->s_blocksize -&nbsp;1)) {
/*
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* Attach jinode to inode for jbd2 if we do any zeroing of
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* partial block
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;*/
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; ret =&nbsp;ext4_inode_attach_jinode(inode);
if&nbsp;(ret <&nbsp;0)
goto&nbsp;out_mutex;

&nbsp; &nbsp; &nbsp; &nbsp; }
// 等待所有直接 I/O 完成
/* Wait all existing dio workers, newcomers will block on i_rwsem */
inode_dio_wait(inode);

&nbsp; &nbsp; &nbsp; &nbsp; ret =&nbsp;file_modified(file);
if&nbsp;(ret)
goto&nbsp;out_mutex;

/*
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* Prevent page faults from reinstantiating pages we have released from
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* page cache.
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;*/
filemap_invalidate_lock(mapping);

&nbsp; &nbsp; &nbsp; &nbsp; ret =&nbsp;ext4_break_layouts(inode);
if&nbsp;(ret)
goto&nbsp;out_dio;

&nbsp; &nbsp; &nbsp; &nbsp; first_block_offset =&nbsp;round_up(offset, sb->s_blocksize);
&nbsp; &nbsp; &nbsp; &nbsp; last_block_offset =&nbsp;round_down((offset + length), sb->s_blocksize) -&nbsp;1;

/* Now release the pages and zero block aligned part of pages*/
if&nbsp;(last_block_offset > first_block_offset) {
// 释放指定范围内的页面缓存
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; ret =&nbsp;ext4_update_disksize_before_punch(inode, offset, length);
if&nbsp;(ret)
goto&nbsp;out_dio;
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; ret =&nbsp;ext4_truncate_page_cache_block_range(inode,
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; first_block_offset, last_block_offset +&nbsp;1);
if&nbsp;(ret)
goto&nbsp;out_dio;
&nbsp; &nbsp; &nbsp; &nbsp; }

if&nbsp;(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; credits =&nbsp;ext4_writepage_trans_blocks(inode);
else
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; credits =&nbsp;ext4_blocks_for_truncate(inode);
&nbsp; &nbsp; &nbsp; &nbsp; handle =&nbsp;ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
if&nbsp;(IS_ERR(handle)) {
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; ret =&nbsp;PTR_ERR(handle);
ext4_std_error(sb, ret);
goto&nbsp;out_dio;
&nbsp; &nbsp; &nbsp; &nbsp; }
// 零化部分块
&nbsp; &nbsp; &nbsp; &nbsp; ret =&nbsp;ext4_zero_partial_blocks(handle, inode, offset,
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;length);
if&nbsp;(ret)
goto&nbsp;out_stop;

&nbsp; &nbsp; &nbsp; &nbsp; first_block = (offset + sb->s_blocksize -&nbsp;1) >>
EXT4_BLOCK_SIZE_BITS(sb);
&nbsp; &nbsp; &nbsp; &nbsp; stop_block = (offset + length) >>&nbsp;EXT4_BLOCK_SIZE_BITS(sb);

/* If there are blocks to remove, do it */
if&nbsp;(stop_block > first_block) {
ext4_lblk_t&nbsp;hole_len = stop_block - first_block;

down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode);

ext4_es_remove_extent(inode, first_block, hole_len);
// 删除块 从 extent 树或间接块中删除块
if&nbsp;(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; ret =&nbsp;ext4_ext_remove_space(inode, first_block,
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; stop_block -&nbsp;1);
else
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; ret =&nbsp;ext4_ind_remove_space(handle, inode, first_block,
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; stop_block);
// 标记为 HOLE 在 extent 状态树中标记为 EXTENT_STATUS_HOLE
ext4_es_insert_extent(inode, first_block, hole_len, ~0,
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; EXTENT_STATUS_HOLE,&nbsp;0);
up_write(&EXT4_I(inode)->i_data_sem);
&nbsp; &nbsp; &nbsp; &nbsp; }
ext4_fc_track_range(handle, inode, first_block, stop_block);
if&nbsp;(IS_SYNC(inode))
ext4_handle_sync(handle);

inode_set_mtime_to_ts(inode,&nbsp;inode_set_ctime_current(inode));
&nbsp; &nbsp; &nbsp; &nbsp; ret2 =&nbsp;ext4_mark_inode_dirty(handle, inode);
if&nbsp;(unlikely(ret2))
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; ret = ret2;
if&nbsp;(ret >=&nbsp;0)
ext4_update_inode_fsync_trans(handle, inode,&nbsp;1);
out_stop:
ext4_journal_stop(handle);
out_dio:
filemap_invalidate_unlock(mapping);
out_mutex:
inode_unlock(inode);
return&nbsp;ret;
}

shmem/tmpfs 文件 实现

tmpfs 文件是指位于 tmpfs 挂载点上的文件(例如 /dev/shm 下的文件;/tmp 只有在以 tmpfs 方式挂载时才属于这一类)

Shmem 文件这里指 memfd_create 创建的 fd 所对应的文件

流程图

用户空间: fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, len)
&nbsp; &nbsp; ↓
系统调用: SYSCALL_DEFINE4(fallocate, ...)
&nbsp; &nbsp; ↓
ksys_fallocate()
&nbsp; &nbsp; ↓
vfs_fallocate()
&nbsp; &nbsp; ↓
file->f_op->fallocate() &nbsp;[对于 shmem 文件,指向 shmem_fallocate]
&nbsp; &nbsp; ↓
┌─────────────────────────────────────────────────────────────┐
│ shmem_fallocate() [mm/shmem.c:3376] &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; │
│ &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; │
│&nbsp;if&nbsp;(mode & FALLOC_FL_PUNCH_HOLE) { &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; │
│ &nbsp; ├─ 1. 设置等待队列和标记 &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;│
│ &nbsp; │ &nbsp; └─ inode->i_private = &shmem_falloc &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;│
│ &nbsp; │ &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; │
│ &nbsp; ├─ 2. unmap_mapping_range() [mm/memory.c:3857] &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;│
│ &nbsp; │ &nbsp; └─ unmap_mapping_pages() [mm/memory.c:3821] &nbsp; &nbsp; &nbsp; &nbsp; │
│ &nbsp; │ &nbsp; &nbsp; &nbsp; └─ unmap_mapping_range_tree() &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; │
│ &nbsp; │ &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; └─ unmap_mapping_range_vma() &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; │
│ &nbsp; │ &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; └─ zap_page_range_single() &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; │
│ &nbsp; │ &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; └─ 取消映射所有 VMA 中的页面 &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;│
│ &nbsp; │ &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; │
│ &nbsp; ├─ 3. shmem_truncate_range() [mm/shmem.c:1146] &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;│
│ &nbsp; │ &nbsp; └─ shmem_undo_range() [mm/shmem.c:995] &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;│
│ &nbsp; │ &nbsp; &nbsp; &nbsp; ├─ find_lock_entries() [mm/filemap.c] &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;│
│ &nbsp; │ &nbsp; &nbsp; &nbsp; │ &nbsp; └─ 查找并锁定页面缓存中的 folio &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;│
│ &nbsp; │ &nbsp; &nbsp; &nbsp; ├─ truncate_inode_folio() [mm/truncate.c] &nbsp; &nbsp; &nbsp; │
│ &nbsp; │ &nbsp; &nbsp; &nbsp; │ &nbsp; └─ 截断完整的 folio &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;│
│ &nbsp; │ &nbsp; &nbsp; &nbsp; ├─ truncate_inode_partial_folio() [mm/truncate.c]│
│ &nbsp; │ &nbsp; &nbsp; &nbsp; │ &nbsp; └─ 截断部分 folio(处理跨页边界) &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; │
│ &nbsp; │ &nbsp; &nbsp; &nbsp; ├─ shmem_free_swap() &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; │
│ &nbsp; │ &nbsp; &nbsp; &nbsp; │ &nbsp; └─ 释放交换空间条目 &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;│
│ &nbsp; │ &nbsp; &nbsp; &nbsp; └─ shmem_recalc_inode() &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;│
│ &nbsp; │ &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; └─ 重新计算 inode 的统计信息 &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;│
│ &nbsp; │ &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; │
│ &nbsp; └─ 4. 清除标记并唤醒等待线程 &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; │
│ &nbsp; &nbsp; &nbsp; ├─ inode->i_private = NULL &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; │
│ &nbsp; &nbsp; &nbsp; └─ wake_up_all(&shmem_falloc_waitq) &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; │
└─────────────────────────────────────────────────────────────┘

shmem_fallocate (设置等待队列)

  • DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq) 在栈上创建等待队列
  • inode->i_private = &shmem_falloc 标记正在打洞,其他线程可通过此字段检测
  • shmem_falloc.start 和 shmem_falloc.next 记录打洞的页范围
  • 完成后 wake_up_all() 唤醒等待线程
staticlongshmem_fallocate(struct&nbsp;file *file,&nbsp;int&nbsp;mode,&nbsp;loff_t&nbsp;offset,
loff_t&nbsp;len){
struct&nbsp;inode&nbsp;*inode =&nbsp;file_inode(file);
struct&nbsp;shmem_sb_info&nbsp;*sbinfo =&nbsp;SHMEM_SB(inode->i_sb);
struct&nbsp;shmem_inode_info&nbsp;*info =&nbsp;SHMEM_I(inode);
struct&nbsp;shmem_falloc&nbsp;shmem_falloc;
pgoff_t&nbsp;start, index, end, undo_fallocend;
int&nbsp;error;

// 检查模式:只支持 KEEP_SIZE 和 PUNCH_HOLE
if&nbsp;(mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
return&nbsp;-EOPNOTSUPP;

// 获取 inode 锁(防止并发修改)
inode_lock(inode);

if&nbsp;(mode & FALLOC_FL_PUNCH_HOLE) {
struct&nbsp;address_space&nbsp;*mapping = file->f_mapping;
// 对齐到页边界:向上取整起始位置,向下取整结束位置
loff_t&nbsp;unmap_start =&nbsp;round_up(offset, PAGE_SIZE);
loff_t&nbsp;unmap_end =&nbsp;round_down(offset + len, PAGE_SIZE) -&nbsp;1;
// 在栈上声明等待队列头(用于同步打洞和 page fault)
DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq);

// 检查文件密封:如果文件被密封为只写,不允许打洞
if&nbsp;(info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; error = -EPERM;
goto&nbsp;out;
&nbsp; &nbsp; &nbsp; &nbsp; }

// 步骤1: 设置 shmem_falloc 结构
&nbsp; &nbsp; &nbsp; &nbsp; shmem_falloc.waitq = &shmem_falloc_waitq; &nbsp;// 等待队列
&nbsp; &nbsp; &nbsp; &nbsp; shmem_falloc.start = (u64)unmap_start >> PAGE_SHIFT; &nbsp;// 起始页号
&nbsp; &nbsp; &nbsp; &nbsp; shmem_falloc.next = (unmap_end +&nbsp;1) >> PAGE_SHIFT; &nbsp; &nbsp;// 结束页号+1

// 步骤2: 将 shmem_falloc 保存到 inode->i_private
// 这样 shmem_fault() 可以检测到正在打洞
spin_lock(&inode->i_lock);
&nbsp; &nbsp; &nbsp; &nbsp; inode->i_private = &shmem_falloc;
spin_unlock(&inode->i_lock);

// 步骤3: 取消映射 VMA 中的页面(如果有多个进程映射)
if&nbsp;((u64)unmap_end > (u64)unmap_start)
unmap_mapping_range(mapping, unmap_start,
1&nbsp;+ unmap_end - unmap_start,&nbsp;0);

// 步骤4: 截断页面缓存和交换空间
shmem_truncate_range(inode, offset, offset + len -&nbsp;1);
/* No need to unmap again: hole-punching leaves COWed pages */

// 步骤5: 清除标记并唤醒所有等待的线程
spin_lock(&inode->i_lock);
&nbsp; &nbsp; &nbsp; &nbsp; inode->i_private =&nbsp;NULL; &nbsp;// 清除标记
wake_up_all(&shmem_falloc_waitq); &nbsp;// 唤醒等待的 page fault 线程
WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.head));
spin_unlock(&inode->i_lock);
&nbsp; &nbsp; &nbsp; &nbsp; error =&nbsp;0;
goto&nbsp;out;
&nbsp; &nbsp; }

unmap_mapping_range

  • 将字节范围转换为页号范围
  • 调用 unmap_mapping_pages() 执行取消映射
voidunmap_mapping_range(struct&nbsp;address_space *mapping,
loff_tconst&nbsp;holebegin,&nbsp;loff_tconst&nbsp;holelen,&nbsp;int&nbsp;even_cows){
// 将字节偏移转换为页号
pgoff_t&nbsp;hba = (pgoff_t)(holebegin) >> PAGE_SHIFT;
pgoff_t&nbsp;hlen = ((pgoff_t)(holelen) + PAGE_SIZE -&nbsp;1) >> PAGE_SHIFT;

// 检查溢出
if&nbsp;(sizeof(holelen) >&nbsp;sizeof(hlen)) {
longlong&nbsp;holeend =
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; (holebegin + holelen + PAGE_SIZE -&nbsp;1) >> PAGE_SHIFT;
if&nbsp;(holeend & ~(longlong)ULONG_MAX)
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; hlen = ULONG_MAX - hba +&nbsp;1;
&nbsp; &nbsp; }

// 调用实际取消映射函数
unmap_mapping_pages(mapping, hba, hlen, even_cows);
}

#

unmap_mapping_pages(取消映射)
  • 遍历 i_mmap 树(所有映射该文件的 VMA)
  • 对每个 VMA 调用 unmap_mapping_range_vma() 取消映射
voidunmap_mapping_pages(struct&nbsp;address_space *mapping,&nbsp;pgoff_t&nbsp;start,
pgoff_t&nbsp;nr,&nbsp;bool&nbsp;even_cows){
struct&nbsp;zap_details&nbsp;details = { };
pgoff_t&nbsp;first_index = start;
pgoff_t&nbsp;last_index = start + nr -&nbsp;1;

&nbsp; &nbsp; details.even_cows = even_cows; &nbsp;// 是否取消映射 COW 页面
if&nbsp;(last_index < first_index)
&nbsp; &nbsp; &nbsp; &nbsp; last_index = ULONG_MAX;

// 获取映射锁(读锁,允许多个读者)
i_mmap_lock_read(mapping);
// 如果 i_mmap 树不为空,遍历并取消映射
if&nbsp;(unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
unmap_mapping_range_tree(&mapping->i_mmap, first_index,
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;last_index, &details);
i_mmap_unlock_read(mapping);
}

shmem_truncate_range(截断范围)

文件系统上下文中,“截断”指从页面缓存(page cache)中删除指定范围的页面(folio),使页面缓存与文件状态一致。

  • 调用 shmem_undo_range() 从页面缓存中删除范围内的 folio,并释放对应的交换条目
  • 更新 inode 的 mtime/ctime,并增加 i_version
voidshmem_truncate_range(struct&nbsp;inode *inode,&nbsp;loff_t&nbsp;lstart,&nbsp;loff_t&nbsp;lend){
// 调用核心实现函数
shmem_undo_range(inode, lstart, lend,&nbsp;false);
// 更新修改时间和变更时间
inode_set_mtime_to_ts(inode,&nbsp;inode_set_ctime_current(inode));
// 增加版本号(用于缓存失效)
inode_inc_iversion(inode);
}
截断前:
┌─────────────────────────────────────────┐
│ &nbsp;address_space->i_pages&nbsp;(XArray) &nbsp; &nbsp; &nbsp; │
│ &nbsp;┌─────┬─────┬─────┬─────┬─────┐ &nbsp; &nbsp; &nbsp;│
│ &nbsp;│ &nbsp;0&nbsp; │ &nbsp;1&nbsp; │ &nbsp;2&nbsp; │ &nbsp;3&nbsp; │ &nbsp;4&nbsp; │ &nbsp; &nbsp; &nbsp;│
│ &nbsp;└─────┴─────┴─────┴─────┴─────┘ &nbsp; &nbsp; &nbsp;│
│ &nbsp; &nbsp;│ &nbsp; &nbsp; │ &nbsp; &nbsp; │ &nbsp; &nbsp; │ &nbsp; &nbsp; │ &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; │
│ &nbsp; &nbsp;▼ &nbsp; &nbsp; ▼ &nbsp; &nbsp; ▼ &nbsp; &nbsp; ▼ &nbsp; &nbsp; ▼ &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; │
│ &nbsp;folio &nbsp;folio folio folio folio &nbsp; &nbsp; &nbsp; &nbsp;│
│ &nbsp; &nbsp;A &nbsp; &nbsp; B &nbsp; &nbsp; C &nbsp; &nbsp; D &nbsp; &nbsp; E &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;│
└─────────────────────────────────────────┘

执行: truncate_inode_pages_range(mapping, 4096, 16383)
范围: [4K, 16K) = 页号 [1, 4)

截断后:
┌─────────────────────────────────────────┐
│ &nbsp;address_space->i_pages&nbsp;(XArray) &nbsp; &nbsp; &nbsp; │
│ &nbsp;┌─────┬─────┬─────┬─────┬─────┐ &nbsp; &nbsp; &nbsp;│
│ &nbsp;│ &nbsp;0&nbsp; │NULL&nbsp;│NULL&nbsp;│NULL&nbsp;│ &nbsp;4&nbsp; │ &nbsp; &nbsp; &nbsp;│
│ &nbsp;└─────┴─────┴─────┴─────┴─────┘ &nbsp; &nbsp; &nbsp;│
│ &nbsp; &nbsp;│ &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; │ &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; │ &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; │
│ &nbsp; &nbsp;▼ &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; │ &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; │ &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; │
│ &nbsp;folio &nbsp; &nbsp; &nbsp; &nbsp; │ &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; │ &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; │
│ &nbsp; &nbsp;A &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; │ &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; │ &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; │
│ &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;│ &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; │ &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; │
│ &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;[已删除] &nbsp; &nbsp;[已删除] &nbsp; &nbsp; &nbsp; &nbsp;│
│ &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;folio B &nbsp; &nbsp;folio C &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;│
│ &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;folio D &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;│
└─────────────────────────────────────────┘

操作:
1.&nbsp;folio B, C, D 从 XArray 中删除(xas_store(&xas,&nbsp;NULL))
2.&nbsp;folio->mapping =&nbsp;NULL(断开关联)
3.&nbsp;mapping->nrpages -=&nbsp;3(更新统计)
4.&nbsp;folio 引用计数减&nbsp;1,如果为&nbsp;0&nbsp;则释放到伙伴系统

#

unmap_mapping_pages
voidunmap_mapping_pages(struct&nbsp;address_space *mapping,&nbsp;pgoff_t&nbsp;start,
pgoff_t&nbsp;nr,&nbsp;bool&nbsp;even_cows){
struct&nbsp;zap_details&nbsp;details = { };
pgoff_t&nbsp;first_index = start;
pgoff_t&nbsp;last_index = start + nr -&nbsp;1;

&nbsp; &nbsp; details.even_cows = even_cows; &nbsp;// 是否取消映射 COW 页面
if&nbsp;(last_index < first_index)
&nbsp; &nbsp; &nbsp; &nbsp; last_index = ULONG_MAX;

// 获取映射锁(读锁,允许多个读者)
i_mmap_lock_read(mapping);
// 如果 i_mmap 树不为空,遍历并取消映射
if&nbsp;(unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
unmap_mapping_range_tree(&mapping->i_mmap, first_index,
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;last_index, &details);
i_mmap_unlock_read(mapping);
}

#

shmem_undo_range (核心截断实现)
  • 第一阶段:批量查找并截断完整 folio
  • 第二阶段:处理部分页(跨页边界)
  • 第三阶段:处理剩余完整 folio
  • 释放交换空间并更新统计
staticvoidshmem_undo_range(struct&nbsp;inode *inode,&nbsp;loff_t&nbsp;lstart,&nbsp;loff_t&nbsp;lend,
bool&nbsp;unfalloc){
struct&nbsp;address_space&nbsp;*mapping = inode->i_mapping;
struct&nbsp;shmem_inode_info&nbsp;*info =&nbsp;SHMEM_I(inode);
// 将字节偏移转换为页号
pgoff_t&nbsp;start = (lstart + PAGE_SIZE -&nbsp;1) >> PAGE_SHIFT;
pgoff_t&nbsp;end = (lend +&nbsp;1) >> PAGE_SHIFT;
struct&nbsp;folio_batch&nbsp;fbatch;
pgoff_t&nbsp;indices[PAGEVEC_SIZE];
struct&nbsp;folio&nbsp;*folio;
bool&nbsp;same_folio;
long&nbsp;nr_swaps_freed =&nbsp;0;
pgoff_t&nbsp;index;
int&nbsp;i;

if&nbsp;(lend ==&nbsp;-1)
&nbsp; &nbsp; &nbsp; &nbsp; end =&nbsp;-1; &nbsp;&nbsp;/* unsigned, so actually very big */

// 如果是在撤销失败的 fallocate,更新 fallocend
if&nbsp;(info->fallocend > start && info->fallocend <= end && !unfalloc)
&nbsp; &nbsp; &nbsp; &nbsp; info->fallocend = start;

// 初始化 folio 批次
folio_batch_init(&fbatch);
&nbsp; &nbsp; index = start;

// 第一阶段:查找并锁定范围内的所有 folio
while&nbsp;(index < end &&&nbsp;find_lock_entries(mapping, &index, end -&nbsp;1,
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &fbatch, indices)) {
for&nbsp;(i =&nbsp;0; i <&nbsp;folio_batch_count(&fbatch); i++) {
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; folio = fbatch.folios[i];

// 如果是交换条目(xa_is_value)
if&nbsp;(xa_is_value(folio)) {
if&nbsp;(unfalloc)
continue;
// 释放交换空间
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; nr_swaps_freed +=&nbsp;shmem_free_swap(mapping,
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; indices[i], folio);
continue;
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; }

// 如果是普通 folio,截断它
if&nbsp;(!unfalloc || !folio_test_uptodate(folio))
truncate_inode_folio(mapping, folio);
folio_unlock(folio);
&nbsp; &nbsp; &nbsp; &nbsp; }
folio_batch_remove_exceptionals(&fbatch);
folio_batch_release(&fbatch);
cond_resched(); &nbsp;// 让出 CPU,避免长时间占用
&nbsp; &nbsp; }

// 第二阶段:处理部分页(跨页边界的情况)
if&nbsp;(unfalloc)
goto&nbsp;whole_folios;

&nbsp; &nbsp; same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT);

// 处理起始页的部分
&nbsp; &nbsp; folio =&nbsp;shmem_get_partial_folio(inode, lstart >> PAGE_SHIFT);
if&nbsp;(folio) {
&nbsp; &nbsp; &nbsp; &nbsp; same_folio = lend <&nbsp;folio_pos(folio) +&nbsp;folio_size(folio);
folio_mark_dirty(folio);
// 截断部分 folio
if&nbsp;(!truncate_inode_partial_folio(folio, lstart, lend)) {
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; start =&nbsp;folio_next_index(folio);
if&nbsp;(same_folio)
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; end = folio->index;
&nbsp; &nbsp; &nbsp; &nbsp; }
folio_unlock(folio);
folio_put(folio);
&nbsp; &nbsp; &nbsp; &nbsp; folio =&nbsp;NULL;
&nbsp; &nbsp; }

// 处理结束页的部分(如果与起始页不同)
if&nbsp;(!same_folio)
&nbsp; &nbsp; &nbsp; &nbsp; folio =&nbsp;shmem_get_partial_folio(inode, lend >> PAGE_SHIFT);
if&nbsp;(folio) {
folio_mark_dirty(folio);
if&nbsp;(!truncate_inode_partial_folio(folio, lstart, lend))
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; end = folio->index;
folio_unlock(folio);
folio_put(folio);
&nbsp; &nbsp; }

whole_folios:
// 第三阶段:处理完整的 folio
&nbsp; &nbsp; index = start;
while&nbsp;(index < end) {
cond_resched();

// 查找范围内的 folio
if&nbsp;(!find_get_entries(mapping, &index, end -&nbsp;1, &fbatch,
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; indices)) {
/* If all gone or hole-punch or unfalloc, we're done */
if&nbsp;(index == start || end !=&nbsp;-1)
break;
/* But if truncating, restart to make sure all gone */
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; index = start;
continue;
&nbsp; &nbsp; &nbsp; &nbsp; }

for&nbsp;(i =&nbsp;0; i <&nbsp;folio_batch_count(&fbatch); i++) {
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; folio = fbatch.folios[i];

// 处理交换条目
if&nbsp;(xa_is_value(folio)) {
long&nbsp;swaps_freed;

if&nbsp;(unfalloc)
continue;
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; swaps_freed =&nbsp;shmem_free_swap(mapping, indices[i], folio);
if&nbsp;(!swaps_freed) {
/* Swap was replaced by page: retry */
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; index = indices[i];
break;
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; }
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; nr_swaps_freed += swaps_freed;
continue;
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; }

folio_lock(folio);

if&nbsp;(!unfalloc || !folio_test_uptodate(folio)) {
if&nbsp;(folio_mapping(folio) != mapping) {
/* Page was replaced by swap: retry */
folio_unlock(folio);
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; index = indices[i];
break;
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; }
VM_BUG_ON_FOLIO(folio_test_writeback(folio),
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; folio);

// 根据 folio 大小选择不同的截断方式
if&nbsp;(!folio_test_large(folio)) {
// 普通页:直接截断
truncate_inode_folio(mapping, folio);
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; }&nbsp;else&nbsp;if&nbsp;(truncate_inode_partial_folio(folio, lstart, lend)) {
// 大页:部分截断,可能需要分割
/*
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* If we split a page, reset the loop so
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* that we pick up the new sub pages.
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;*/
if&nbsp;(!folio_test_large(folio)) {
folio_unlock(folio);
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; index = start; &nbsp;// 重置索引,重新处理
break;
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; }
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; }
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; }
folio_unlock(folio);
&nbsp; &nbsp; &nbsp; &nbsp; }
folio_batch_remove_exceptionals(&fbatch);
folio_batch_release(&fbatch);
&nbsp; &nbsp; }

// 重新计算 inode 的统计信息(释放的交换空间)
shmem_recalc_inode(inode,&nbsp;0, -nr_swaps_freed);
}

#

06

访问空洞地址

我们主要研究 shmem 的情况,就用题目中的 copy_from_user 举例子

  • 文件系统(如 ext4)通过检查元数据,发现该区域没有分配物理块
  • 不会发起真正的磁盘I/O去读取数据
  • 直接返回全零(0x00)给调用者

流程图

打洞期间

用户空间: copy_from_user(dst, src, size)
&nbsp; &nbsp; ↓
内核空间: copy_from_user() [include/linux/uaccess.h:205]
&nbsp; &nbsp; ↓
&nbsp; &nbsp; _copy_from_user() / _inline_copy_from_user()
&nbsp; &nbsp; ↓
&nbsp; &nbsp; raw_copy_from_user() [arch/x86/include/asm/uaccess_64.h:139]
&nbsp; &nbsp; ↓
&nbsp; &nbsp; copy_user_generic() [汇编实现,使用 rep movsb]
&nbsp; &nbsp; ↓
&nbsp; &nbsp; [CPU 触发&nbsp;#PF 异常,因为访问了未映射的页面]
&nbsp; &nbsp; ↓
中断处理: exc_page_fault() [arch/x86/mm/fault.c:1493]
&nbsp; &nbsp; ↓
&nbsp; &nbsp; handle_page_fault() [arch/x86/mm/fault.c:1469]
&nbsp; &nbsp; ↓
&nbsp; &nbsp; do_user_addr_fault() [arch/x86/mm/fault.c:1211]
&nbsp; &nbsp; ↓
&nbsp; &nbsp; handle_mm_fault() [mm/memory.c:6046]
&nbsp; &nbsp; ↓
&nbsp; &nbsp; __handle_mm_fault() [mm/memory.c:5820]
&nbsp; &nbsp; ↓
&nbsp; &nbsp; handle_pte_fault() [mm/memory.c:5736]
&nbsp; &nbsp; ↓
&nbsp; &nbsp; do_pte_missing() [mm/memory.c:3959]
&nbsp; &nbsp; ↓
&nbsp; &nbsp; do_fault() [mm/memory.c:5405]
&nbsp; &nbsp; ↓
&nbsp; &nbsp; do_read_fault() [mm/memory.c:5281] &nbsp;(因为是读操作)
&nbsp; &nbsp; ↓
&nbsp; &nbsp; __do_fault() [mm/memory.c:4854]
&nbsp; &nbsp; ↓
&nbsp; &nbsp; vma->vm_ops->fault() &nbsp;[对于 shmem 文件,指向 shmem_fault]
&nbsp; &nbsp; ↓
&nbsp; &nbsp; shmem_fault() [mm/shmem.c:2515]
&nbsp; &nbsp; ↓
&nbsp; &nbsp; shmem_falloc_wait() [mm/shmem.c:2474] &nbsp;← 关键:检查是否正在打洞
&nbsp; &nbsp; ↓
&nbsp; &nbsp; schedule() &nbsp;← 线程被阻塞,等待打洞完成

打洞后

shmem_falloc_wait() 返回&nbsp;VM_FAULT_RETRY
&nbsp; &nbsp; ↓
向上传播到 do_user_addr_fault()
&nbsp; &nbsp; ↓
检测到 fault &&nbsp;VM_FAULT_RETRY
&nbsp; &nbsp; ↓
设置 flags |= FAULT_FLAG_TRIED
&nbsp; &nbsp; ↓
goto&nbsp;retry&nbsp;- 跳回&nbsp;retry&nbsp;标签
&nbsp; &nbsp; ↓
重新获取 VMA 锁 (lock_mm_and_find_vma)
&nbsp; &nbsp; ↓
重新调用 handle_mm_fault()
&nbsp; &nbsp; ↓
再次进入 shmem_fault()
&nbsp; &nbsp; ↓
这次 inode->i_private 已经为空(打洞完成)
&nbsp; &nbsp; ↓
正常调用 shmem_get_folio_gfp() 分配页面
&nbsp; &nbsp; ↓
完成页面映射,返回成功

copy_from_user -> raw_copy_from_user

  • 检查复制大小,调用底层复制函数
  • 在 x86_64 上通常走 _inline_copy_from_user(),最终调用 raw_copy_from_user()
static&nbsp;__always_inline&nbsp;unsignedlong&nbsp;__must_check
copy_from_user(void&nbsp;*to,&nbsp;constvoid&nbsp;__user *from,&nbsp;unsignedlong&nbsp;n){
if&nbsp;(!check_copy_size(to, n,&nbsp;false))
return&nbsp;n;
#ifdef&nbsp;INLINE_COPY_FROM_USER
return&nbsp;_inline_copy_from_user(to, from, n);
#else
return&nbsp;_copy_from_user(to, from, n);
#endif
}

raw_copy_from_user()

x86_64 实现

  • 调用 copy_user_generic()(汇编实现,使用 rep movsb)
  • 访问用户空间地址时,若页面未映射或不可访问,CPU 触发 #PF 异常
static&nbsp;__always_inline __must_check&nbsp;unsignedlong
raw_copy_from_user(void&nbsp;*dst,&nbsp;constvoid&nbsp;__user *src,&nbsp;unsignedlong&nbsp;size){
return&nbsp;copy_user_generic(dst, (__force&nbsp;void&nbsp;*)src, size);
}

exc_page_fault()

  • 从 CR2 读取触发异常的地址
  • 调用 handle_page_fault() 处理
DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault)
{
irqentry_state_t&nbsp;state;
unsignedlong&nbsp;address;

&nbsp; &nbsp; &nbsp; &nbsp; address =&nbsp;cpu_feature_enabled(X86_FEATURE_FRED) ?&nbsp;fred_event_data(regs) :&nbsp;read_cr2();

prefetchw(¤t->mm->mmap_lock);

/*
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* KVM uses #PF vector to deliver 'page not present' events to guests
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* (asynchronous page fault mechanism). The event happens when a
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* userspace task is trying to access some valid (from guest's point of
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* view) memory which is not currently mapped by the host (e.g. the
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* memory is swapped out). Note, the corresponding "page ready" event
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* which is injected when the memory becomes available, is delivered via
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* an interrupt mechanism and not a #PF exception
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* (see arch/x86/kernel/kvm.c: sysvec_kvm_asyncpf_interrupt()).
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;*
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* We are relying on the interrupted context being sane (valid RSP,
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* relevant locks not held, etc.), which is fine as long as the
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* interrupted context had IF=1. &nbsp;We are also relying on the KVM
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* async pf type field and CR2 being read consistently instead of
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* getting values from real and async page faults mixed up.
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;*
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* Fingers crossed.
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;*
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* The async #PF handling code takes care of idtentry handling
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* itself.
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;*/
if&nbsp;(kvm_handle_async_pf(regs, (u32)address))
return;

/*
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* Entry handling for valid #PF from kernel mode is slightly
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* different: RCU is already watching and ct_irq_enter() must not
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* be invoked because a kernel fault on a user space address might
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* sleep.
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;*
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* In case the fault hit a RCU idle region the conditional entry
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* code reenabled RCU to avoid subsequent wreckage which helps
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* debuggability.
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;*/
&nbsp; &nbsp; &nbsp; &nbsp; state =&nbsp;irqentry_enter(regs);

instrumentation_begin();
handle_page_fault(regs, error_code, address);
instrumentation_end();

irqentry_exit(regs, state);
}

do_user_addr_fault()

如果 handle_mm_fault() 的返回值带有 VM_FAULT_RETRY 标志,就会设置 FAULT_FLAG_TRIED 并跳回 retry 标签,重新调用 handle_mm_fault() 再次处理这次缺页。

  • 设置 FAULT_FLAG_USER 和 FAULT_FLAG_WRITE 等标志
  • 查找并锁定 VMA
  • 调用 handle_mm_fault() 处理
static&nbsp;inline
void do_user_addr_fault(struct pt_regs *regs,
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; unsigned long error_code,
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; unsigned long address)
{
&nbsp; &nbsp; &nbsp; &nbsp; struct vm_area_struct *vma;
&nbsp; &nbsp; &nbsp; &nbsp; struct task_struct *tsk;
&nbsp; &nbsp; &nbsp; &nbsp; struct mm_struct *mm;
&nbsp; &nbsp; &nbsp; &nbsp; vm_fault_t fault;
&nbsp; &nbsp; &nbsp; &nbsp; unsigned int flags = FAULT_FLAG_DEFAULT;

&nbsp; &nbsp; &nbsp; &nbsp; tsk = current;
&nbsp; &nbsp; &nbsp; &nbsp; mm = tsk->mm;

if&nbsp;(unlikely((error_code & (X86_PF_USER | X86_PF_INSTR)) == X86_PF_INSTR)) {
/*
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* Whoops, this is kernel mode code trying to execute from
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* user memory. &nbsp;Unless this is AMD erratum #93, which
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* corrupts RIP such that it looks like a user address,
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* this is unrecoverable. &nbsp;Don't even try to look up the
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* VMA or look for extable entries.
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;*/
if&nbsp;(is_errata93(regs, address))
return;

&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; page_fault_oops(regs, error_code, address);
return;
&nbsp; &nbsp; &nbsp; &nbsp; }

/* kprobes don't want to hook the spurious faults: */
if&nbsp;(WARN_ON_ONCE(kprobe_page_fault(regs, X86_TRAP_PF)))
return;

/*
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* Reserved bits are never expected to be set on
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* entries in the user portion of the page tables.
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;*/
if&nbsp;(unlikely(error_code & X86_PF_RSVD))
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; pgtable_bad(regs, error_code, address);

/*
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* If SMAP is on, check for invalid kernel (supervisor) access to user
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* pages in the user address space. &nbsp;The odd case here is WRUSS,
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* which, according to the preliminary documentation, does not respect
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* SMAP and will have the USER bit set so, in all cases, SMAP
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* enforcement appears to be consistent with the USER bit.
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;*/
if&nbsp;(unlikely(cpu_feature_enabled(X86_FEATURE_SMAP) &&
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;!(error_code & X86_PF_USER) &&
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;!(regs->flags & X86_EFLAGS_AC))) {
/*
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* No extable entry here. &nbsp;This was a kernel access to an
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* invalid pointer. &nbsp;get_kernel_nofault() will not get here.
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;*/
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; page_fault_oops(regs, error_code, address);
return;
&nbsp; &nbsp; &nbsp; &nbsp; }

/*
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* If we're in an interrupt, have no user context or are running
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* in a region with pagefaults disabled then we must not take the fault
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;*/
if&nbsp;(unlikely(faulthandler_disabled() || !mm)) {
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; bad_area_nosemaphore(regs, error_code, address);
return;
&nbsp; &nbsp; &nbsp; &nbsp; }

/* Legacy check - remove this after verifying that it doesn't trigger */
if&nbsp;(WARN_ON_ONCE(!(regs->flags & X86_EFLAGS_IF))) {
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; bad_area_nosemaphore(regs, error_code, address);
return;
&nbsp; &nbsp; &nbsp; &nbsp; }

&nbsp; &nbsp; &nbsp; &nbsp; local_irq_enable();

&nbsp; &nbsp; &nbsp; &nbsp; perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS,&nbsp;1, regs, address);

/*
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* Read-only permissions can not be expressed in shadow stack PTEs.
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* Treat all shadow stack accesses as WRITE faults. This ensures
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* that the MM will prepare everything (e.g., break COW) such that
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* maybe_mkwrite() can create a proper shadow stack PTE.
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;*/
if&nbsp;(error_code & X86_PF_SHSTK)
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; flags |= FAULT_FLAG_WRITE;
if&nbsp;(error_code & X86_PF_WRITE)
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; flags |= FAULT_FLAG_WRITE;
if&nbsp;(error_code & X86_PF_INSTR)
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; flags |= FAULT_FLAG_INSTRUCTION;

/*
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* We set FAULT_FLAG_USER based on the register state, not
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* based on X86_PF_USER. User space accesses that cause
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* system page faults are still user accesses.
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;*/
if&nbsp;(user_mode(regs))
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; flags |= FAULT_FLAG_USER;

#ifdef CONFIG_X86_64
/*
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* Faults in the vsyscall page might need emulation. &nbsp;The
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* vsyscall page is at a high address (>PAGE_OFFSET), but is
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* considered to be part of the user address space.
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;*
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* The vsyscall page does not have a "real" VMA, so do this
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* emulation before we go searching for VMAs.
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;*
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* PKRU never rejects instruction fetches, so we don't need
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* to consider the PF_PK bit.
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;*/
if&nbsp;(is_vsyscall_vaddr(address)) {
if&nbsp;(emulate_vsyscall(error_code, regs, address))
return;
&nbsp; &nbsp; &nbsp; &nbsp; }
#endif

if&nbsp;(!(flags & FAULT_FLAG_USER))
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; goto lock_mmap;

&nbsp; &nbsp; &nbsp; &nbsp; vma = lock_vma_under_rcu(mm, address);
if&nbsp;(!vma)
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; goto lock_mmap;

if&nbsp;(unlikely(access_error(error_code, vma))) {
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; bad_area_access_error(regs, error_code, address, NULL, vma);
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
return;
&nbsp; &nbsp; &nbsp; &nbsp; }
&nbsp; &nbsp; &nbsp; &nbsp; fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
if&nbsp;(!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; vma_end_read(vma);

if&nbsp;(!(fault & VM_FAULT_RETRY)) {
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; goto done;
&nbsp; &nbsp; &nbsp; &nbsp; }
&nbsp; &nbsp; &nbsp; &nbsp; count_vm_vma_lock_event(VMA_LOCK_RETRY);
if&nbsp;(fault & VM_FAULT_MAJOR)
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; flags |= FAULT_FLAG_TRIED;

/* Quick path to respond to signals */
if&nbsp;(fault_signal_pending(fault, regs)) {
if&nbsp;(!user_mode(regs))
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; kernelmode_fixup_or_oops(regs, error_code, address,
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;SIGBUS, BUS_ADRERR,
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;ARCH_DEFAULT_PKEY);
return;
&nbsp; &nbsp; &nbsp; &nbsp; }
lock_mmap:

retry:
&nbsp; &nbsp; &nbsp; &nbsp; vma = lock_mm_and_find_vma(mm, address, regs);
if&nbsp;(unlikely(!vma)) {
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; bad_area_nosemaphore(regs, error_code, address);
return;
&nbsp; &nbsp; &nbsp; &nbsp; }

/*
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* Ok, we have a good vm_area for this memory access, so
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* we can handle it..
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;*/
if&nbsp;(unlikely(access_error(error_code, vma))) {
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; bad_area_access_error(regs, error_code, address, mm, vma);
return;
&nbsp; &nbsp; &nbsp; &nbsp; }

/*
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* If for any reason at all we couldn't handle the fault,
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* make sure we exit gracefully rather than endlessly redo
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* the fault. &nbsp;Since we never set FAULT_FLAG_RETRY_NOWAIT, if
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* we get VM_FAULT_RETRY back, the mmap_lock has been unlocked.
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;*
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* Note that handle_userfault() may also release and reacquire mmap_lock
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* (and not return with VM_FAULT_RETRY), when returning to userland to
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* repeat the page fault later with a VM_FAULT_NOPAGE retval
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* (potentially after handling any pending signal during the return to
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* userland). The return to userland is identified whenever
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* FAULT_FLAG_USER|FAULT_FLAG_KILLABLE are both set in flags.
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;*/
&nbsp; &nbsp; &nbsp; &nbsp; fault = handle_mm_fault(vma, address, flags, regs);

if&nbsp;(fault_signal_pending(fault, regs)) {
/*
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* Quick path to respond to signals. &nbsp;The core mm code
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* has unlocked the mm for us if we get here.
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;*/
if&nbsp;(!user_mode(regs))
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; kernelmode_fixup_or_oops(regs, error_code, address,
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;SIGBUS, BUS_ADRERR,
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;ARCH_DEFAULT_PKEY);
return;
&nbsp; &nbsp; &nbsp; &nbsp; }

/* The fault is fully completed (including releasing mmap lock) */
if&nbsp;(fault & VM_FAULT_COMPLETED)
return;

/*
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* If we need to retry the mmap_lock has already been released,
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* and if there is a fatal signal pending there is no guarantee
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* that we made any progress. Handle this case first.
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;*/
if&nbsp;(unlikely(fault & VM_FAULT_RETRY)) {
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; flags |= FAULT_FLAG_TRIED;
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; goto retry;
&nbsp; &nbsp; &nbsp; &nbsp; }

&nbsp; &nbsp; &nbsp; &nbsp; mmap_read_unlock(mm);
done:
if&nbsp;(likely(!(fault & VM_FAULT_ERROR)))
return;

if&nbsp;(fatal_signal_pending(current) && !user_mode(regs)) {
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; kernelmode_fixup_or_oops(regs, error_code, address,
0,&nbsp;0, ARCH_DEFAULT_PKEY);
return;
&nbsp; &nbsp; &nbsp; &nbsp; }

if&nbsp;(fault & VM_FAULT_OOM) {
/* Kernel mode? Handle exceptions or die: */
if&nbsp;(!user_mode(regs)) {
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; kernelmode_fixup_or_oops(regs, error_code, address,
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;SIGSEGV, SEGV_MAPERR,
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;ARCH_DEFAULT_PKEY);
return;
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; }

/*
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* We ran out of memory, call the OOM killer, and return the
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* userspace (which will retry the fault, or kill us if we got
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* oom-killed):
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;*/
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; pagefault_out_of_memory();
&nbsp; &nbsp; &nbsp; &nbsp; }&nbsp;else&nbsp;{
if&nbsp;(fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;VM_FAULT_HWPOISON_LARGE))
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; do_sigbus(regs, error_code, address, fault);
else&nbsp;if&nbsp;(fault & VM_FAULT_SIGSEGV)
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; bad_area_nosemaphore(regs, error_code, address);
else
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; BUG();
&nbsp; &nbsp; &nbsp; &nbsp; }
}
NOKPROBE_SYMBOL(do_user_addr_fault);

handle_mm_fault(页错误处理入口)

CPU 硬件检测
&nbsp; &nbsp; ↓ (触发 #PF 异常)
软件层页表遍历 (__handle_mm_fault)
&nbsp; &nbsp; ├─ PGD → P4D → PUD → PMD → PTE
&nbsp; &nbsp; ├─ 检查大页 (PUD/PMD 级别)
&nbsp; &nbsp; └─ 都不是大页 →&nbsp;handle_pte_fault()
&nbsp; &nbsp; &nbsp; &nbsp; ↓
handle_pte_fault() 检查 PTE
&nbsp; &nbsp; ├─&nbsp;pte_none() →&nbsp;do_pte_missing() →&nbsp;do_fault() →&nbsp;shmem_fault()
&nbsp; &nbsp; ├─ !pte_present() →&nbsp;do_swap_page() (页面在 swap)
&nbsp; &nbsp; └─&nbsp;pte_present() → 处理权限问题或更新访问标志
  • 处理内存控制组(memcg)和 LRU
  • 调用 __handle_mm_fault() 进行实际处理
/**
&nbsp;* handle_mm_fault - 页错误处理的主入口函数
&nbsp;* @vma: 发生页错误的虚拟内存区域
&nbsp;* @address: 触发页错误的地址
&nbsp;* @flags: 页错误标志(FAULT_FLAG_USER, FAULT_FLAG_WRITE 等)
&nbsp;* @regs: 寄存器状态(可选,用于统计)
&nbsp;*
&nbsp;* 功能说明:
&nbsp;* &nbsp; 这是内存管理子系统处理页错误的主要入口点。
&nbsp;* &nbsp; 在 do_user_addr_fault() 中找到 VMA 后,会调用此函数。
&nbsp;*
&nbsp;* 处理流程:
&nbsp;* &nbsp; 1. 验证 VMA 访问权限
&nbsp;* &nbsp; 2. 处理内存控制组(memcg)和 LRU 相关逻辑
&nbsp;* &nbsp; 3. 区分大页(hugetlb)和普通页,分别处理
&nbsp;* &nbsp; 4. 对于普通页,调用 __handle_mm_fault() 进行页表遍历和 PTE 处理
&nbsp;*
&nbsp;* 返回值:
&nbsp;* &nbsp; VM_FAULT_* 系列返回值,表示页错误处理结果
&nbsp;*
&nbsp;* 注意:
&nbsp;* &nbsp; 函数返回后,vma 可能已经被释放(如果 mmap_lock 被释放),
&nbsp;* &nbsp; 因此不能再访问 vma 指针。
&nbsp;*/
vm_fault_thandle_mm_fault(struct&nbsp;vm_area_struct *vma,&nbsp;unsignedlong&nbsp;address,
unsignedint&nbsp;flags,&nbsp;struct&nbsp;pt_regs *regs){
/* If the fault handler drops the mmap_lock, vma may be freed */
struct&nbsp;mm_struct&nbsp;*mm = vma->vm_mm;
vm_fault_t&nbsp;ret;
bool&nbsp;is_droppable;

&nbsp; &nbsp; &nbsp; &nbsp; __set_current_state(TASK_RUNNING);

&nbsp; &nbsp; &nbsp; &nbsp; ret =&nbsp;sanitize_fault_flags(vma, &flags);
if&nbsp;(ret)
goto&nbsp;out;

if&nbsp;(!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;flags & FAULT_FLAG_INSTRUCTION,
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;flags & FAULT_FLAG_REMOTE)) {
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; ret = VM_FAULT_SIGSEGV;
goto&nbsp;out;
&nbsp; &nbsp; &nbsp; &nbsp; }

&nbsp; &nbsp; &nbsp; &nbsp; is_droppable = !!(vma->vm_flags & VM_DROPPABLE);

/*
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* Enable the memcg OOM handling for faults triggered in user
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* space. &nbsp;Kernel faults are handled more gracefully.
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;*/
if&nbsp;(flags & FAULT_FLAG_USER)
mem_cgroup_enter_user_fault();

lru_gen_enter_fault(vma);

if&nbsp;(unlikely(is_vm_hugetlb_page(vma)))
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; ret =&nbsp;hugetlb_fault(vma->vm_mm, vma, address, flags);
else
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; ret = __handle_mm_fault(vma, address, flags);

/*
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* Warning: It is no longer safe to dereference vma-> after this point,
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* because mmap_lock might have been dropped by __handle_mm_fault(), so
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* vma might be destroyed from underneath us.
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;*/

lru_gen_exit_fault();

/* If the mapping is droppable, then errors due to OOM aren't fatal. */
if&nbsp;(is_droppable)
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; ret &= ~VM_FAULT_OOM;

if&nbsp;(flags & FAULT_FLAG_USER) {
mem_cgroup_exit_user_fault();
/*
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* The task may have entered a memcg OOM situation but
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* if the allocation error was handled gracefully (no
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* VM_FAULT_OOM), there is no need to kill anything.
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* Just clean up the OOM state peacefully.
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;*/
if&nbsp;(task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
mem_cgroup_oom_synchronize(false);
&nbsp; &nbsp; &nbsp; &nbsp; }
out:
mm_account_fault(mm, regs, address, flags, ret);

return&nbsp;ret;
}
EXPORT_SYMBOL_GPL(handle_mm_fault);

__handle_mm_fault

/**
&nbsp;* __handle_mm_fault - 页表遍历和页错误处理的核心函数
&nbsp;*&nbsp;@vma: 发生页错误的虚拟内存区域
&nbsp;*&nbsp;@address: 触发页错误的地址
&nbsp;*&nbsp;@flags: 页错误标志
&nbsp;*
&nbsp;* 功能说明:
&nbsp;* &nbsp; 这是页错误处理的核心函数,负责遍历页表结构并处理不同级别的页错误。
&nbsp;* &nbsp; 对于 punch hole 场景,最终会遍历到 PTE 级别,发现 PTE 为空后调用
&nbsp;* &nbsp; handle_pte_fault() 处理。
&nbsp;*
&nbsp;* 页表遍历流程(x86_64 五级页表):
&nbsp;* &nbsp; PGD (Page Global Directory)
&nbsp;* &nbsp; &nbsp; ↓
&nbsp;* &nbsp; P4D (Page 4th Directory)
&nbsp;* &nbsp; &nbsp; ↓
&nbsp;* &nbsp; PUD (Page Upper Directory) - 检查是否为大页(1GB)
&nbsp;* &nbsp; &nbsp; ↓
&nbsp;* &nbsp; PMD (Page Middle Directory) - 检查是否为大页(2MB)
&nbsp;* &nbsp; &nbsp; ↓
&nbsp;* &nbsp; PTE (Page Table Entry) - 普通页(4KB)
&nbsp;*
&nbsp;* 处理逻辑:
&nbsp;* &nbsp; 1. 逐级分配页表项(如果不存在)
&nbsp;* &nbsp; 2. 检查每一级是否为大页(THP),如果是则特殊处理
&nbsp;* &nbsp; 3. 如果都不是大页,继续到下一级
&nbsp;* &nbsp; 4. 最终到达 PTE 级别,调用 handle_pte_fault() 处理
&nbsp;*
&nbsp;* 对于 punch hole 场景:
&nbsp;* &nbsp; - PTE 为空(pte_none),表示页面未映射
&nbsp;* &nbsp; - handle_pte_fault() 会检测到并调用 do_pte_missing()
&nbsp;* &nbsp; - 最终调用文件系统的 fault 处理函数(如 shmem_fault)
&nbsp;*/
static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; unsigned long address, unsigned int flags)
{
&nbsp; &nbsp; &nbsp; &nbsp; struct vm_fault vmf = {
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; .vma = vma,
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; .address = address & PAGE_MASK,
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; .real_address = address,
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; .flags = flags,
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; .pgoff = linear_page_index(vma, address),
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; .gfp_mask = __get_fault_gfp_mask(vma),
&nbsp; &nbsp; &nbsp; &nbsp; };
&nbsp; &nbsp; &nbsp; &nbsp; struct mm_struct *mm = vma->vm_mm;
&nbsp; &nbsp; &nbsp; &nbsp; unsigned long vm_flags = vma->vm_flags;
&nbsp; &nbsp; &nbsp; &nbsp; pgd_t *pgd;
&nbsp; &nbsp; &nbsp; &nbsp; p4d_t *p4d;
&nbsp; &nbsp; &nbsp; &nbsp; vm_fault_t ret;

/* 步骤1: 遍历 PGD 和 P4D */
&nbsp; &nbsp; &nbsp; &nbsp; pgd = pgd_offset(mm, address);
&nbsp; &nbsp; &nbsp; &nbsp; p4d = p4d_alloc(mm, pgd, address);
if&nbsp;(!p4d)
return&nbsp;VM_FAULT_OOM;

/* 步骤2: 遍历 PUD,检查是否为大页(1GB) */
&nbsp; &nbsp; &nbsp; &nbsp; vmf.pud = pud_alloc(mm, p4d, address);
if&nbsp;(!vmf.pud)
return&nbsp;VM_FAULT_OOM;
retry_pud:
if&nbsp;(pud_none(*vmf.pud) &&
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; thp_vma_allowable_order(vma, vm_flags,
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; TVA_IN_PF | TVA_ENFORCE_SYSFS, PUD_ORDER)) {
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; ret = create_huge_pud(&vmf);
if&nbsp;(!(ret & VM_FAULT_FALLBACK))
return&nbsp;ret;
&nbsp; &nbsp; &nbsp; &nbsp; }&nbsp;else&nbsp;{
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; pud_t orig_pud = *vmf.pud;

&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; barrier();
if&nbsp;(pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {
/*
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* TODO once we support anonymous PUDs: NUMA case and
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* FAULT_FLAG_UNSHARE handling.
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;*/
if&nbsp;((flags & FAULT_FLAG_WRITE) &&
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; !pud_write(orig_pud)) {
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; ret = wp_huge_pud(&vmf, orig_pud);
if&nbsp;(!(ret & VM_FAULT_FALLBACK))
return&nbsp;ret;
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; }&nbsp;else&nbsp;{
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; huge_pud_set_accessed(&vmf, orig_pud);
return&nbsp;0;
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; }
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; }
&nbsp; &nbsp; &nbsp; &nbsp; }

/* 步骤3: 遍历 PMD,检查是否为大页(2MB) */
&nbsp; &nbsp; &nbsp; &nbsp; vmf.pmd = pmd_alloc(mm, vmf.pud, address);
if&nbsp;(!vmf.pmd)
return&nbsp;VM_FAULT_OOM;

/* Huge pud page fault raced with pmd_alloc? */
if&nbsp;(pud_trans_unstable(vmf.pud))
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; goto retry_pud;

if&nbsp;(pmd_none(*vmf.pmd) &&
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; thp_vma_allowable_order(vma, vm_flags,
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; TVA_IN_PF | TVA_ENFORCE_SYSFS, PMD_ORDER)) {
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; ret = create_huge_pmd(&vmf);
if&nbsp;(!(ret & VM_FAULT_FALLBACK))
return&nbsp;ret;
&nbsp; &nbsp; &nbsp; &nbsp; }&nbsp;else&nbsp;{
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; vmf.orig_pmd = pmdp_get_lockless(vmf.pmd);

if&nbsp;(unlikely(is_swap_pmd(vmf.orig_pmd))) {
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; VM_BUG_ON(thp_migration_supported() &&
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; !is_pmd_migration_entry(vmf.orig_pmd));
if&nbsp;(is_pmd_migration_entry(vmf.orig_pmd))
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; pmd_migration_entry_wait(mm, vmf.pmd);
return&nbsp;0;
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; }
if&nbsp;(pmd_trans_huge(vmf.orig_pmd) || pmd_devmap(vmf.orig_pmd)) {
if&nbsp;(pmd_protnone(vmf.orig_pmd) &&
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; vma_is_accessible(vma))
return&nbsp;do_huge_pmd_numa_page(&vmf);

if&nbsp;((flags & (FAULT_FLAG_WRITE | FAULT_FLAG_UNSHARE)) &&
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; !pmd_write(vmf.orig_pmd)) {
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; ret = wp_huge_pmd(&vmf);
if&nbsp;(!(ret & VM_FAULT_FALLBACK))
return&nbsp;ret;
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; }&nbsp;else&nbsp;{
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; huge_pmd_set_accessed(&vmf);
return&nbsp;0;
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; }
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; }
&nbsp; &nbsp; &nbsp; &nbsp; }

/* 步骤4: 都不是大页,处理普通页(4KB),检查 PTE */
return&nbsp;handle_pte_fault(&vmf);
}

handle_pte_fault

  • 获取 pte 检查是否为空(pte_none),如果为空则调用 do_pte_missing()
/**
&nbsp;* handle_pte_fault - 处理 PTE(页表项)级别的页错误
&nbsp;* @vmf: 页错误描述结构体
&nbsp;*
&nbsp;* 功能说明:
&nbsp;* &nbsp; 这是页表遍历的最后一级,负责检查和处理 PTE 的状态。
&nbsp;* &nbsp; 对于 punch hole 场景,PTE 为空(pte_none),表示页面未映射。
&nbsp;*
&nbsp;* PTE 状态检查和处理:
&nbsp;* &nbsp; 1. pte_none() - PTE 为空(页面未映射)
&nbsp;* &nbsp; &nbsp; &nbsp;→ 调用 do_pte_missing() → do_fault() → shmem_fault()
&nbsp;* &nbsp; &nbsp; &nbsp;→ 这是 punch hole 场景的路径!
&nbsp;*
&nbsp;* &nbsp; 2. !pte_present() - PTE 存在但不是 present(页面在 swap)
&nbsp;* &nbsp; &nbsp; &nbsp;→ 调用 do_swap_page() 从 swap 换入页面
&nbsp;*
&nbsp;* &nbsp; 3. pte_protnone() - PTE 存在但权限不足(NUMA 页面)
&nbsp;* &nbsp; &nbsp; &nbsp;→ 调用 do_numa_page() 处理 NUMA 迁移
&nbsp;*
&nbsp;* &nbsp; 4. pte_present() - PTE 存在且有效
&nbsp;* &nbsp; &nbsp; &nbsp;→ 检查写权限,处理写时复制(COW)或更新访问标志
&nbsp;*
&nbsp;* 对于 punch hole 攻击:
&nbsp;* &nbsp; 当访问被 fallocate PUNCH_HOLE 打洞的地址时:
&nbsp;* &nbsp; - PTE 为空(pte_none),vmf->pte 被设置为 NULL
&nbsp;* &nbsp; - 调用 do_pte_missing() 处理缺失的页面
&nbsp;* &nbsp; - 最终会调用文件系统的 fault 处理函数
&nbsp;*/
static&nbsp;vm_fault_t&nbsp;handle_pte_fault(struct&nbsp;vm_fault&nbsp;*vmf)
{
&nbsp; &nbsp; &nbsp; &nbsp; pte_t entry;

if&nbsp;(unlikely(pmd_none(*vmf->pmd))) {
/*
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* Leave __pte_alloc() until later: because vm_ops->fault may
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* want to allocate huge page, and if we expose page table
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* for an instant, it will be difficult to retract from
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* concurrent faults and from rmap lookups.
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;*/
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; vmf->pte = NULL;
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; vmf->flags &= ~FAULT_FLAG_ORIG_PTE_VALID;
&nbsp; &nbsp; &nbsp; &nbsp; }&nbsp;else&nbsp;{
/*
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* A regular pmd is established and it can't morph into a huge
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* pmd by anon khugepaged, since that takes mmap_lock in write
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* mode; but shmem or file collapse to THP could still morph
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* it into a huge pmd: just retry later if so.
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;*/
/* 获取 PTE 指针 */
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; vmf->pte =&nbsp;pte_offset_map_nolock(vmf->vma->vm_mm, vmf->pmd,
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;vmf->address, &vmf->ptl);
if&nbsp;(unlikely(!vmf->pte))
return&nbsp;0;
/* 读取 PTE 的值(无锁读取) */
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; vmf->orig_pte =&nbsp;ptep_get_lockless(vmf->pte);
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; vmf->flags |= FAULT_FLAG_ORIG_PTE_VALID;

/* 关键检查:如果 PTE 为空(页面未映射,例如被 punch hole) */
if&nbsp;(pte_none(vmf->orig_pte)) {
pte_unmap(vmf->pte);
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; vmf->pte = NULL;&nbsp;/* 标记 PTE 不存在 */
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; }
&nbsp; &nbsp; &nbsp; &nbsp; }

/* 情况1: PTE 为空(页面未映射)- punch hole 场景走这里 */
if&nbsp;(!vmf->pte)
return&nbsp;do_pte_missing(vmf);

/* 情况2: PTE 存在但不是 present(页面在 swap 中) */
if&nbsp;(!pte_present(vmf->orig_pte))
return&nbsp;do_swap_page(vmf);

/* 情况3: PTE 存在但权限不足(NUMA 页面) */
if&nbsp;(pte_protnone(vmf->orig_pte) &&&nbsp;vma_is_accessible(vmf->vma))
return&nbsp;do_numa_page(vmf);

spin_lock(vmf->ptl);
&nbsp; &nbsp; &nbsp; &nbsp; entry = vmf->orig_pte;
if&nbsp;(unlikely(!pte_same(ptep_get(vmf->pte), entry))) {
update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; goto unlock;
&nbsp; &nbsp; &nbsp; &nbsp; }
if&nbsp;(vmf->flags & (FAULT_FLAG_WRITE | FAULT_FLAG_UNSHARE)) {
if&nbsp;(!pte_write(entry))
return&nbsp;do_wp_page(vmf);
else&nbsp;if&nbsp;(likely(vmf->flags & FAULT_FLAG_WRITE))
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; entry =&nbsp;pte_mkdirty(entry);
&nbsp; &nbsp; &nbsp; &nbsp; }
&nbsp; &nbsp; &nbsp; &nbsp; entry =&nbsp;pte_mkyoung(entry);
if&nbsp;(ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; vmf->flags & FAULT_FLAG_WRITE)) {
update_mmu_cache_range(vmf, vmf->vma, vmf->address, vmf->pte,
1);
&nbsp; &nbsp; &nbsp; &nbsp; }&nbsp;else&nbsp;{
/* Skip spurious TLB flush for retried page fault */
if&nbsp;(vmf->flags & FAULT_FLAG_TRIED)
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; goto unlock;
/*
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* This is needed only for protection faults but the arch code
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* is not yet telling us if this is a protection fault or not.
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* This still avoids useless tlb flushes for .text page faults
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* with threads.
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;*/
if&nbsp;(vmf->flags & FAULT_FLAG_WRITE)
flush_tlb_fix_spurious_fault(vmf->vma, vmf->address,
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;vmf->pte);
&nbsp; &nbsp; &nbsp; &nbsp; }
unlock:
pte_unmap_unlock(vmf->pte, vmf->ptl);
return&nbsp;0;
}

do_pte_missing

  • 若 PTE 非空但页面不在内存中(例如已被换出到 swap),handle_pte_fault() 会调用 do_swap_page()
  • 对于 shmem 文件映射,PTE 为空时走 do_pte_missing() → do_fault()
/**
&nbsp;*&nbsp;do_pte_missing - 处理 PTE 为空(页面未映射)的情况
&nbsp;*&nbsp;@vmf: 页错误描述结构体
&nbsp;*
&nbsp;* 功能说明:
&nbsp;*&nbsp; &nbsp;当 handle_pte_fault() 检测到 PTE 为空时调用此函数。
&nbsp;*&nbsp; &nbsp;这是 punch hole 攻击场景的关键路径。
&nbsp;*
&nbsp;* 处理逻辑:
&nbsp;*&nbsp; &nbsp;- 匿名 VMA(匿名内存映射):调用 do_anonymous_page() 分配新页面
&nbsp;*&nbsp; &nbsp;- 文件映射 VMA(如 shmem 文件):调用 do_fault() 处理文件页错误
&nbsp;*
&nbsp;* 对于 punch hole 场景:
&nbsp;* &nbsp; - shmem 文件映射走 do_fault() 路径
&nbsp;*&nbsp; &nbsp;- 最终会调用 shmem_fault(),检测到正在打洞时会阻塞等待
&nbsp;*/
static vm_fault_t do_pte_missing(struct vm_fault&nbsp;*vmf)
{
&nbsp; &nbsp; &nbsp; &nbsp; if (vma_is_anonymous(vmf->vma))
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; return do_anonymous_page(vmf);
&nbsp; &nbsp; &nbsp; &nbsp; else
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; return do_fault(vmf);
}

do_fault

  • 读操作走 do_read_fault,写操作根据是否共享选择 do_cow_fault 或 do_shared_fault
  • 我们这里是由 copy_from_user 触发的缺页,属于读操作,因此走 do_read_fault(do_read_fault → __do_fault)
/**
&nbsp;* do_fault - 处理文件映射的页错误
&nbsp;* @vmf: 页错误描述结构体
&nbsp;*
&nbsp;* 功能说明:
&nbsp;* &nbsp; 当 do_pte_missing() 检测到是文件映射(非匿名)时调用此函数。
&nbsp;* &nbsp; 根据访问类型(读/写)和 VMA 属性(共享/私有)选择不同的处理路径。
&nbsp;*
&nbsp;* 处理路径:
&nbsp;* &nbsp; 1. 没有 fault 处理函数:返回错误
&nbsp;* &nbsp; 2. 读操作(copy_from_user 是读):
&nbsp;* &nbsp; &nbsp; &nbsp;→ do_read_fault() → __do_fault() → vma->vm_ops->fault()
&nbsp;* &nbsp; &nbsp; &nbsp;→ 对于 shmem: shmem_fault()
&nbsp;* &nbsp; 3. 写操作 + 私有映射(COW):
&nbsp;* &nbsp; &nbsp; &nbsp;→ do_cow_fault() 写时复制
&nbsp;* &nbsp; 4. 写操作 + 共享映射:
&nbsp;* &nbsp; &nbsp; &nbsp;→ do_shared_fault() 共享写入
&nbsp;*
&nbsp;* 对于 punch hole 场景:
&nbsp;* &nbsp; - copy_from_user 是读操作,走 do_read_fault() 路径
&nbsp;* &nbsp; - 最终调用 shmem_fault(),检测到正在打洞时会阻塞
&nbsp;*/
static&nbsp;vm_fault_t&nbsp;do_fault(struct&nbsp;vm_fault&nbsp;*vmf)
{
struct&nbsp;vm_area_struct&nbsp;*vma = vmf->vma;
struct&nbsp;mm_struct&nbsp;*vm_mm = vma->vm_mm;
&nbsp; &nbsp; &nbsp; &nbsp; vm_fault_t ret;

/*
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;*/
if&nbsp;(!vma->vm_ops->fault) {
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; vmf->pte =&nbsp;pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;vmf->address, &vmf->ptl);
if&nbsp;(unlikely(!vmf->pte))
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; ret = VM_FAULT_SIGBUS;
else&nbsp;{
/*
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* Make sure this is not a temporary clearing of pte
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* by holding ptl and checking again. A R/M/W update
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* of pte involves: take ptl, clearing the pte so that
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* we don't have concurrent modification by hardware
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* followed by an update.
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;*/
if&nbsp;(unlikely(pte_none(ptep_get(vmf->pte))))
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; ret = VM_FAULT_SIGBUS;
else
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; ret = VM_FAULT_NOPAGE;

pte_unmap_unlock(vmf->pte, vmf->ptl);
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; }
&nbsp; &nbsp; &nbsp; &nbsp; }&nbsp;else&nbsp;if&nbsp;(!(vmf->flags & FAULT_FLAG_WRITE))
/* 读操作:copy_from_user 走这里 */
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; ret =&nbsp;do_read_fault(vmf);
else&nbsp;if&nbsp;(!(vma->vm_flags & VM_SHARED))
/* 写操作 + 私有映射:写时复制 */
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; ret =&nbsp;do_cow_fault(vmf);
else
/* 写操作 + 共享映射:共享写入 */
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; ret =&nbsp;do_shared_fault(vmf);

/* preallocated pagetable is unused: free it */
if&nbsp;(vmf->prealloc_pte) {
pte_free(vm_mm, vmf->prealloc_pte);
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; vmf->prealloc_pte = NULL;
&nbsp; &nbsp; &nbsp; &nbsp; }
return&nbsp;ret;
}

__do_fault -> vma->vm_ops->fault

如果映射的是 shmem 文件,那么 vma->vm_ops->fault 指向的就是 shmem_fault

static&nbsp;vm_fault_t __do_fault(struct&nbsp;vm_fault&nbsp;*vmf)
{
struct&nbsp;vm_area_struct&nbsp;*vma = vmf->vma;
struct&nbsp;folio&nbsp;*folio;
&nbsp; &nbsp; vm_fault_t ret;

/*
&nbsp; &nbsp; &nbsp;* Preallocate pte before we take page_lock because this might lead to
&nbsp; &nbsp; &nbsp;* deadlocks for memcg reclaim which waits for pages under writeback:
&nbsp; &nbsp; &nbsp;* &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;lock_page(A)
&nbsp; &nbsp; &nbsp;* &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;SetPageWriteback(A)
&nbsp; &nbsp; &nbsp;* &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;unlock_page(A)
&nbsp; &nbsp; &nbsp;* lock_page(B)
&nbsp; &nbsp; &nbsp;* &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;lock_page(B)
&nbsp; &nbsp; &nbsp;* pte_alloc_one
&nbsp; &nbsp; &nbsp;* &nbsp; shrink_folio_list
&nbsp; &nbsp; &nbsp;* &nbsp; &nbsp; wait_on_page_writeback(A)
&nbsp; &nbsp; &nbsp;* &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;SetPageWriteback(B)
&nbsp; &nbsp; &nbsp;* &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;unlock_page(B)
&nbsp; &nbsp; &nbsp;* &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;# flush A, B to clear the writeback
&nbsp; &nbsp; &nbsp;*/
if&nbsp;(pmd_none(*vmf->pmd) && !vmf->prealloc_pte) {
&nbsp; &nbsp; &nbsp; &nbsp; vmf->prealloc_pte =&nbsp;pte_alloc_one(vma->vm_mm);
if&nbsp;(!vmf->prealloc_pte)
return&nbsp;VM_FAULT_OOM;
&nbsp; &nbsp; }

&nbsp; &nbsp; ret = vma->vm_ops->fault(vmf);
if&nbsp;(unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; VM_FAULT_DONE_COW)))
return&nbsp;ret;

&nbsp; &nbsp; folio =&nbsp;page_folio(vmf->page);
if&nbsp;(unlikely(PageHWPoison(vmf->page))) {
&nbsp; &nbsp; &nbsp; &nbsp; vm_fault_t poisonret = VM_FAULT_HWPOISON;
if&nbsp;(ret & VM_FAULT_LOCKED) {
if&nbsp;(page_mapped(vmf->page))
unmap_mapping_folio(folio);
/* Retry if a clean folio was removed from the cache. */
if&nbsp;(mapping_evict_folio(folio->mapping, folio))
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; poisonret = VM_FAULT_NOPAGE;
folio_unlock(folio);
&nbsp; &nbsp; &nbsp; &nbsp; }
folio_put(folio);
&nbsp; &nbsp; &nbsp; &nbsp; vmf->page = NULL;
return&nbsp;poisonret;
&nbsp; &nbsp; }

if&nbsp;(unlikely(!(ret & VM_FAULT_LOCKED)))
folio_lock(folio);
else
VM_BUG_ON_PAGE(!folio_test_locked(folio), vmf->page);

return&nbsp;ret;
}

shmem_fault (检查是否正在打洞)

  • inode->i_private 非空表示正在打洞
  • 调用 shmem_falloc_wait() 等待打洞完成
/**
&nbsp;* shmem_falloc_wait - 等待 fallocate PUNCH_HOLE 操作完成
&nbsp;* @vmf: 页错误描述结构体
&nbsp;* @inode: shmem 文件的 inode
&nbsp;*
&nbsp;* 功能说明:
&nbsp;* &nbsp; 这是 punch hole 攻击的核心函数!
&nbsp;* &nbsp; 当 shmem_fault() 检测到正在打洞时调用此函数。
&nbsp;*
&nbsp;* 处理流程:
&nbsp;* &nbsp; 1. 检查当前页号是否在打洞范围内
&nbsp;* &nbsp; 2. 准备等待队列,设置线程状态为 TASK_UNINTERRUPTIBLE
&nbsp;* &nbsp; 3. 调用 schedule() 让出 CPU,线程被阻塞
&nbsp;* &nbsp; 4. 等待打洞线程完成并调用 wake_up_all() 唤醒
&nbsp;* &nbsp; 5. 清理等待状态,返回 VM_FAULT_RETRY 重试
&nbsp;*
&nbsp;* 攻击利用:
&nbsp;* &nbsp; 在 schedule() 阻塞期间:
&nbsp;* &nbsp; - copy_from_user 所在的线程被阻塞
&nbsp;* &nbsp; - 攻击者可以利用这个时间窗口进行堆操作
&nbsp;* &nbsp; - 例如:在 free 后、置空前进行 UAF 或 Double Free
&nbsp;*
&nbsp;* 时间窗口:
&nbsp;* &nbsp; - 从 schedule() 调用开始,到打洞完成并 wake_up_all() 为止
&nbsp;* &nbsp; - 这个时间窗口足够长,可以进行多次堆操作
&nbsp;*/
static&nbsp;vm_fault_t&nbsp;shmem_falloc_wait(struct&nbsp;vm_fault&nbsp;*vmf,&nbsp;struct&nbsp;inode&nbsp;*inode)
{
struct&nbsp;shmem_falloc&nbsp;*shmem_falloc;
struct&nbsp;file&nbsp;*fpin = NULL;
&nbsp; &nbsp; &nbsp; &nbsp; vm_fault_t ret =&nbsp;0;

spin_lock(&inode->i_lock);
&nbsp; &nbsp; &nbsp; &nbsp; shmem_falloc = inode->i_private;
/* 检查:当前页号是否在打洞范围内 */
if&nbsp;(shmem_falloc && shmem_falloc->waitq &&
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; vmf->pgoff >= shmem_falloc->start &&
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; vmf->pgoff < shmem_falloc->next) {
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; wait_queue_head_t *shmem_falloc_waitq;
DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function);

&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; ret = VM_FAULT_NOPAGE;
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; fpin =&nbsp;maybe_unlock_mmap_for_io(vmf, NULL);
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; shmem_falloc_waitq = shmem_falloc->waitq;
/* 步骤1: 准备等待,设置线程状态为不可中断睡眠 */
prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; TASK_UNINTERRUPTIBLE);
spin_unlock(&inode->i_lock);
/* 步骤2: 关键阻塞点 - 让出 CPU,线程被阻塞 */
/* 攻击者可以利用这个时间窗口进行堆操作! */
schedule();

/*
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* shmem_falloc_waitq points into the shmem_fallocate()
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* stack of the hole-punching task: shmem_falloc_waitq
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* is usually invalid by the time we reach here, but
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* finish_wait() does not dereference it in that case;
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* though i_lock needed lest racing with wake_up_all().
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;*/
/* 步骤3: 打洞完成,被唤醒,清理等待状态 */
spin_lock(&inode->i_lock);
finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
&nbsp; &nbsp; &nbsp; &nbsp; }
spin_unlock(&inode->i_lock);
if&nbsp;(fpin) {
fput(fpin);
/* 返回 RETRY,让调用者重试页错误处理 */
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; ret = VM_FAULT_RETRY;
&nbsp; &nbsp; &nbsp; &nbsp; }
return&nbsp;ret;
}

shmem_falloc_wait (核心)

这里最终会返回 VM_FAULT_RETRY(前提是 maybe_unlock_mmap_for_io() 返回了非空的 fpin,否则返回的是 VM_FAULT_NOPAGE)

  • 检查当前页号是否在打洞范围内
  • prepare_to_wait() + schedule() 使线程进入不可中断睡眠
  • 等待期间,触发缺页的线程(此时正在内核态执行 copy_from_user)被阻塞,为攻击者提供时间窗口
  • 打洞完成后,wake_up_all() 唤醒等待线程
static&nbsp;vm_fault_t&nbsp;shmem_falloc_wait(struct&nbsp;vm_fault&nbsp;*vmf,&nbsp;struct&nbsp;inode&nbsp;*inode)
{
struct&nbsp;shmem_falloc&nbsp;*shmem_falloc;
struct&nbsp;file&nbsp;*fpin = NULL;
&nbsp; &nbsp; vm_fault_t ret =&nbsp;0;

spin_lock(&inode->i_lock);
&nbsp; &nbsp; shmem_falloc = inode->i_private;
// 关键检查: 确认正在打洞,且当前页在打洞范围内
if&nbsp;(shmem_falloc &&
&nbsp; &nbsp; &nbsp; &nbsp; shmem_falloc->waitq &&
&nbsp; &nbsp; &nbsp; &nbsp; vmf->pgoff >= shmem_falloc->start &&
&nbsp; &nbsp; &nbsp; &nbsp; vmf->pgoff < shmem_falloc->next) {
&nbsp; &nbsp; &nbsp; &nbsp; wait_queue_head_t *shmem_falloc_waitq;
DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function);

&nbsp; &nbsp; &nbsp; &nbsp; ret = VM_FAULT_NOPAGE;
&nbsp; &nbsp; &nbsp; &nbsp; fpin =&nbsp;maybe_unlock_mmap_for_io(vmf, NULL);
&nbsp; &nbsp; &nbsp; &nbsp; shmem_falloc_waitq = shmem_falloc->waitq;

// 关键步骤1: 准备等待
prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; TASK_UNINTERRUPTIBLE);
spin_unlock(&inode->i_lock);

// 关键步骤2: 调用 schedule() 让出 CPU,线程被阻塞
// 这里就是"慢页面错误"的关键!
// 线程会一直等待,直到打洞完成并调用 wake_up_all()
schedule();

/*
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* shmem_falloc_waitq points into the shmem_fallocate()
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* stack of the hole-punching task: shmem_falloc_waitq
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* is usually invalid by the time we reach here, but
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* finish_wait() does not dereference it in that case;
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;* though i_lock needed lest racing with wake_up_all().
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;*/
// 防止在打洞期间将页面错误处理到洞中,避免打洞无法完成
// 使用等待队列而非互斥锁,因为不能在 fault 处理中持有 i_rwsem
spin_lock(&inode->i_lock);
finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
&nbsp; &nbsp; }
spin_unlock(&inode->i_lock);
if&nbsp;(fpin) {
fput(fpin);
&nbsp; &nbsp; &nbsp; &nbsp; ret = VM_FAULT_RETRY; &nbsp;// 返回 RETRY,让调用者重试
&nbsp; &nbsp; }
return&nbsp;ret;
}

重试

可能有点啰嗦,但需要注意的是:shmem_falloc_wait 返回的 VM_FAULT_RETRY 会一路传回 do_user_addr_fault(),后者会重新调用一次 handle_mm_fault。我们可以利用这一点反复打洞,让缺页处理一次次被阻塞、重试,从而把 copy_from_user 拖延很长时间,提高利用的成功率。利用这个条件竞争,就可以比较轻松地打出 Double Free 或者 UAF。

07

小trick 任意offset读写

这里还从 Tplus 大佬那里学到了一个利用 punch hole 实现任意 offset 读写的小 trick,感觉特别强,在这里做个分享。

前面讲到了 punch hole,我们拥有了延迟 copy_from_user 的能力,那么进一步扩展这个能力我们可以得到什么?

先回到题目中

case 322376505:
    /* Fetch the object pointer stored in the slot chosen by the request. */
    v8 = allocated_objects[v9.index];
    if ( !v8 )
        return -1;                      /* the slot holds no object */
    if ( v9.length > 0x7FFFFFFFuLL )
        BUG();
    /*
     * Copy v9.length bytes from the user pointer v9.buf to the start of the
     * object.  Yields -1 when copy_from_user() reports a non-zero result,
     * otherwise 0.
     */
    return -(copy_from_user(v8, v9.buf, v9.length) != 0);
  • 我们可以看见这里有个写入操作,正常情况下我们的理解是只能从 obj 的头部开始写入。

原理

当然这个题目本身可以构造出 Double Free 这种更好用的原语,但如果我们非要只用 UAF 来挑战自己,有没有什么比较好的办法实现提权呢?关键就在这里:我们可以让 buf 刚好横跨两个 Page 的边界,让第一个 Page 是正常的 page,第二个 Page 处于 hole 中。

本质是利用 copy_from_user 的非原子性,通过 Fallocate/Hole 制造时间窗口,在 Stall 期间释放旧对象并分配新对象,最终实现“保护头部、修改尾部”的攻击效果。

那么这有什么作用?首先 v8 指向我们正常的 obj,copy_from_user 先正常拷贝位于第一个 Page 的数据;当访问到第二个 Page 时,由于它处于 hole 中,会触发缺页中断并等待打洞完成。(如图)

那么这个时候我们快速释放掉这个Obj,并且堆喷上我们的目标结构体,那么此时obj已经变成我们的目标obj了但是copy_from_user的offset不会改变。所以就会直接改写对应offset的内存。

时序图

参考文章

  • https://starlabs.sg/blog/2023/07-a-new-method-for-container-escape-using-file-based-dirtycred/
  • https://blog.csdn.net/wwyyxx26/article/details/154435644

#

看雪ID:Elenia

https://bbs.kanxue.com/user-home-994584.htm

*本文为看雪论坛精华文章,由 Elenia 原创,转载请注明来自看雪社区

往期推荐

逆向分析某手游基于异常的内存保护

解决Il2cppapi混淆,通杀DumpUnityCs文件

记录一次Unity加固的探索与实现

DLINK路由器命令注入漏洞从1DAY到0DAY

量子安全 quantum ctf Global Hyperlink Zone Hack the box

球分享

球点赞

球在看

点击阅读原文查看更多


免责声明:

本文所载程序、技术方法仅面向合法合规的安全研究与教学场景,旨在提升网络安全防护能力,具有明确的技术研究属性。

任何单位或个人未经授权,将本文内容用于攻击、破坏等非法用途的,由此引发的全部法律责任、民事赔偿及连带责任,均由行为人独立承担,本站不承担任何连带责任。

本站内容均为技术交流与知识分享目的发布,若存在版权侵权或其他异议,请通过邮件联系处理,具体联系方式可点击页面上方的联系我

本文转载自:看雪学苑 Elenia Elenia《Linux 内核攻击:Punch hole (2025 Backdoor skernel 复现)》

评论:0   参与:  0