"criu dump重要步骤的深入研究六"

  "criu checkpoint九"

Posted by Xu on June 20, 2017

CRIU,DUMP备份步骤解析

根据前一节对备份步骤的解析,我们分析到单节点备份的parasite_infect_seized部分,该部分实现了对指定pid进程的感染并返回了感染控制单元结构体,随后,也就是这一节将就感染后的进程的相关操作进行解析:从代码的角度看,我们将深入研究如下部分内容:

  • 1.修复寄生进程的vdso模块
  • 2.收集异步io环信息
  • 3.备份杂项设备misc信息
  • 4.收集任务镜像集信息,再根据items中收集的id入口信息把task对象id信息备份到镜像
  • 5.对/proc/pid/fd下的文件描述符进行备份

延续dump_one_task未分析的源码

if (fault_injected(FI_DUMP_EARLY)) {
//对于XEN客户机工作环境,断点会线性降低性能,所以我们目前屏蔽掉断点的功能
		pr_info("fault: CRIU sudden detach\n");
		BUG();
	}

	if (root_ns_mask & CLONE_NEWPID && root_item == item) {
	//这里的CLONE_NEWPID是在clone过程中用于创建一个独立namespace的常见做法的参数信号
		int pfd;

		pfd = parasite_get_proc_fd_seized(parasite_ctl);
		if (pfd < 0) {
			pr_err("Can't get proc fd (pid: %d)\n", pid);
			goto err_cure_imgset;
		}

		if (install_service_fd(CR_PROC_FD_OFF, pfd) < 0)
		//安装服务文件描述符
			goto err_cure_imgset;

		close(pfd);
	}

	ret = parasite_fixup_vdso(parasite_ctl, pid, &vmas);
	//修复寄生进程的vdso模块
	if (ret) {
		pr_err("Can't fixup vdso VMAs (pid: %d)\n", pid);
		goto err_cure_imgset;
	}

	ret = parasite_collect_aios(parasite_ctl, &vmas); /* FIXME -- merge with above */
	//收集异步io环信息
	if (ret) {
		pr_err("Failed to check aio rings (pid: %d)\n", pid);
		goto err_cure_imgset;
	}
ret = parasite_dump_misc_seized(parasite_ctl, &misc);
   //通过向tsock套接字发送消息,PARASITE_CMD_DUMP_MISC会指定调用dump_misc来备份杂项设备信息
	if (ret) {
		pr_err("Can't dump misc (pid: %d)\n", pid);
		goto err_cure_imgset;
	}

	item->pid->ns[0].virt = misc.pid;
	pstree_insert_pid(item->pid);
	item->sid = misc.sid;
	item->pgid = misc.pgid;

	pr_info("sid=%d pgid=%d pid=%d\n",
		item->sid, item->pgid, vpid(item));

	if (item->sid == 0) {
		pr_err("A session leader of %d(%d) is outside of its pid namespace\n",
			item->pid->real, vpid(item));
		goto err_cure;
	}

	cr_imgset = cr_task_imgset_open(vpid(item), O_DUMP);
	//收集任务镜像集信息,所有镜像的信息包含在cr_imgset结构体
	if (!cr_imgset)
		goto err_cure;

	ret = dump_task_ids(item, cr_imgset);
	//根据items中收集的id入口信息把task对象id信息备份到镜像中去
	if (ret) {
		pr_err("Dump ids (pid: %d) failed with %d\n", pid, ret);
		goto err_cure;
	}

	if (dfds) {
		ret = dump_task_files_seized(parasite_ctl, item, dfds);//对dfds中的文件进行备份操作
		if (ret) {
			pr_err("Dump files (pid: %d) failed with %d\n", pid, ret);
			goto err_cure;
		}
	}

	mdc.pre_dump = false;

parasite_get_proc_fd_seized()就是通过向daemonsocket发送PARASITE_CMD_GET_PROC_FD命令消息,来让寄生进程获取配置文件的文件描述符

int parasite_get_proc_fd_seized(struct parasite_ctl *ctl)
{
	int ret = -1, fd, sk;

	ret = compel_rpc_call(PARASITE_CMD_GET_PROC_FD, ctl);//向寄生控制单元中呢tsock套接字(daemonized pid process)发送命令消息ctl_msg,输出ctl_msg中的响应内容。通过调用infect.c-plugins/std 中parasite_service-》parasite_init_daemon来创建套件字并对该套接字进行监听,进一步根据发送的PARASITE_CMD_GET_PROC_FD参数调用parasite_get_proc_fd来获取配置文件描述符
	if (ret) {
		pr_err("Parasite failed to get proc fd\n");
		return ret;
	}

	sk = compel_rpc_sock(ctl);//获取该寄生控制单元的套接字对象tsock
	fd = recv_fd(sk);//接收该套接字发回的配置文件描述符对象
	if (fd < 0)
		pr_err("Can't retrieve FD from socket\n");
	if (compel_rpc_sync(PARASITE_CMD_GET_PROC_FD, ctl)) {
	//调用parasite_wait_ack等待daemon socket的响应
		close_safe(&fd);
		return -1;
	}

	return fd;
}

备份/proc/pid/fd下的所有文件描述符内容

//用于备份从/proc/pid/fd下的文件dfds内容,首先配置镜像结构体单元,然后通过调用parasite_drain_fds_seized向tsock迭代发送文件描述符dfds->fds[],然后接收到lfds[],dfds与lfds具有二级嵌套关系,目前还不清楚这层关系的含义。最后进行dump_one_file的单文件备份
int dump_task_files_seized(struct parasite_ctl *ctl, struct pstree_item *item,
		struct parasite_drain_fd *dfds)
{
//dfds就是collect_fds通过读取/proc/pid/fd文件下文件描述符,记录个数->nr_fds和文件描述符内容dfds->fds[]
	int *lfds = NULL;
	struct cr_img *img = NULL;
	struct fd_opts *opts = NULL;//fd的一些选项值
	int i, ret = -1;
	int off, nr_fds = min((int) PARASITE_MAX_FDS, dfds->nr_fds);//nr_fds为文件描述符个数,但不超过最大值

	pr_info("\n");
	pr_info("Dumping opened files (pid: %d)\n", item->pid->real);
	pr_info("----------------------------------------\n");

	lfds = xmalloc(nr_fds * sizeof(int));//分配存储这些文件描述符的内存空间
	if (!lfds)
		goto err;

	opts = xmalloc(nr_fds * sizeof(struct fd_opts));//同时分配存储这些文件描述符中的选项信息的结构体数据空间
	if (!opts)
		goto err;

	img = open_image(CR_FD_FDINFO, O_DUMP, item->ids->files_id);//打开该节点任务文件并将文件描述符添加到img对象中去返回
	if (!img)
		goto err;

	ret = 0; /* Don't fail if nr_fds == 0 */
	for (off = 0; off < dfds->nr_fds; off += nr_fds) {
		if (nr_fds + off > dfds->nr_fds)
			nr_fds = dfds->nr_fds - off;

		ret = parasite_drain_fds_seized(ctl, dfds, nr_fds,
							off, lfds, opts);//lfds的填充是通过向tsock发送dfds中的文件描述符,在通过sock接收的到的文件描述符填充到lfds中去,目前不清楚含义是啥
		if (ret)
			goto err;

		for (i = 0; i < nr_fds; i++) {
			ret = dump_one_file(item->pid, dfds->fds[i + off],
						lfds[i], opts + i, img, ctl);//先获取文件状态,在根据文件状态信息判断文件类型,不同的文件类型用不同的备份方法,do_dump_gen_file最后使用py语言进行备份操作,貌似dfds和lfd具有嵌套关系,目前海不能完全分析清其楚含义。
			close(lfds[i]);
			if (ret)
				break;
		}
	}

	pr_info("----------------------------------------\n");
err:
	if (img)
		close_image(img);
	xfree(opts);
	xfree(lfds);
	return ret;
}

   #define open_image(typ, flags, ...) open_image_at(-1, typ, flags, ##__VA_ARGS__)

//这个函数就是为了配置相关img结构体信息,并打开可变变量中item->ids->file_ids指定的文件,返回文件描述符添加到img信息结构体中去,最后返回img结构体。
struct cr_img *open_image_at(int dfd, int type, unsigned long flags, ...)
{
	struct cr_img *img;//定义cr镜像结构体
	unsigned long oflags;
	char path[PATH_MAX];//定义路径数组
	va_list args;//用于处理可变参数
	bool lazy = false;

	if (dfd == -1) {
		dfd = get_service_fd(IMG_FD_OFF);//获取service文件描述符
		lazy = (flags & O_CREAT);
	}

	img = xmalloc(sizeof(*img));//为该镜像结构体分配内存
	if (!img)
		return NULL;

	oflags = flags | imgset_template[type].oflags;//flags为O_DUMP,type为CR_FD_FDINFO,imgset_template[CR_FD_FDINFO].oflags=0,所以这里的oflags即为flags(O_DUMP)

	va_start(args, flags);//使得args指向flags后一个参数地址
	vsnprintf(path, PATH_MAX, imgset_template[type].fmt, args);//将可变参数args格式化输出到path数组中去
	va_end(args);

	if (lazy) {
		img->fd = LAZY_IMG_FD;
		img->type = type;
		img->oflags = oflags;
		img->path = xstrdup(path);
		return img;
	} else
		img->fd = EMPTY_IMG_FD;

	if (do_open_image(img, dfd, type, oflags, path)) {
	//dfd就是service_fd,打开path指定路径文件,得到文件描述符并添加到img结构体中去
		close_image(img);
		return NULL;
	}

	return img;
}
//该函数就是为了将根据path路径打开的文件描述符添加到img信息结构体中去
static int do_open_image(struct cr_img *img, int dfd, int type, unsigned long oflags, char *path)
{
	int ret, flags;

	flags = oflags & ~(O_NOBUF | O_SERVICE);//这里的oflags为O_DUMP

	ret = openat(dfd, path, flags, CR_FD_PERM);//打开指定路径文件,dfd就是service_fd,返回文件描述符,path为item->ids->file_ids
	if (ret < 0) {
		if (!(flags & O_CREAT) && (errno == ENOENT)) {
			pr_info("No %s image\n", path);
			img->_x.fd = EMPTY_IMG_FD;
			goto skip_magic;
		}

		pr_perror("Unable to open %s", path);
		goto err;
	}

	img->_x.fd = ret;
	if (oflags & O_NOBUF)
		bfd_setraw(&img->_x);
	else {
		if (flags == O_RDONLY)
			ret = bfdopenr(&img->_x);
		else
			ret = bfdopenw(&img->_x);

		if (ret)
			goto err;
	}

	if (imgset_template[type].magic == RAW_IMAGE_MAGIC)
		goto skip_magic;

	if (flags == O_RDONLY)
		ret = img_check_magic(img, oflags, type, path);
	else
		ret = img_write_magic(img, oflags, type);
	if (ret)
		goto err;

skip_magic:
	return 0;

err:
	return -1;
}

//这个函数的含义还不是很清楚
int parasite_drain_fds_seized(struct parasite_ctl *ctl,
		struct parasite_drain_fd *dfds, int nr_fds, int off,
		int *lfds, struct fd_opts *opts)
{
	int ret = -1, size, sk;
	struct parasite_drain_fd *args;//存储文件描述符及其个数

	size = drain_fds_size(dfds);//分配存储文件描述符的相关数据的存储空间:sizeof(dfds)+nr_fds * (sizeof(dfds->fds[0])+sizeof(struct fd_opts)
	args = compel_parasite_args_s(ctl, size);//返回ctl->addr_args
	args->nr_fds = nr_fds;
	memcpy(&args->fds, dfds->fds + off, sizeof(int) * nr_fds);//拷贝文件描述符到ctl->addr_args->fds

	ret = compel_rpc_call(PARASITE_CMD_DRAIN_FDS, ctl);//调用drain_fds(args);排除文件描述符
	if (ret) {
		pr_err("Parasite failed to drain descriptors\n");
		goto err;
	}

	sk = compel_rpc_sock(ctl);
	ret = recv_fds(sk, lfds, nr_fds, opts, sizeof(struct fd_opts));
	if (ret)
		pr_err("Can't retrieve FDs from socket\n");

	ret |= compel_rpc_sync(PARASITE_CMD_DRAIN_FDS, ctl);
err:
	return ret;
}

对单个文件进行判别后并备份的具体实现

static int dump_one_file(struct pid *pid, int fd, int lfd, struct fd_opts *opts,
		       struct cr_img *img, struct parasite_ctl *ctl)
{
	struct fd_parms p = FD_PARMS_INIT;//定义文件描述符的参数
	const struct fdtype_ops *ops;
	struct fd_link link;

	if (fill_fd_params(pid, fd, lfd, opts, &p) < 0) {
	//获取lfd指定文件的相关状态并填写到结构体p中去,及文件系统状态,fcntl(lfd, F_GETSIG, 0)获取标识输入输出可进行的信号
		pr_err("Can't get stat on %d\n", fd);
		return -1;
	}

	if (note_file_lock(pid, fd, lfd, &p))//标记文件锁
		return -1;

	p.fd_ctl = ctl; /* Some dump_opts require this to talk to parasite */

//至此开始判断文件类型,做不同文件类型的不同存储备份
	if (S_ISSOCK(p.stat.st_mode))
		return dump_socket(&p, lfd, img);//判断为socket文件也是最后通过调用do_dump_gen_file进行文件备份

	if (S_ISCHR(p.stat.st_mode))
		return dump_chrdev(&p, lfd, img);

	if (p.fs_type == ANON_INODE_FS_MAGIC) {
		char link[32];

		if (read_fd_link(lfd, link, sizeof(link)) < 0)
			return -1;

		if (is_eventfd_link(link))
			ops = &eventfd_dump_ops;
		else if (is_eventpoll_link(link))
			ops = &eventpoll_dump_ops;
		else if (is_inotify_link(link))
			ops = &inotify_dump_ops;
		else if (is_fanotify_link(link))
			ops = &fanotify_dump_ops;
		else if (is_signalfd_link(link))
			ops = &signalfd_dump_ops;
		else if (is_timerfd_link(link))
			ops = &timerfd_dump_ops;
		else
			return dump_unsupp_fd(&p, lfd, img, "anon", link);

		return do_dump_gen_file(&p, lfd, ops, img);
	}

	if (S_ISREG(p.stat.st_mode) || S_ISDIR(p.stat.st_mode)) {
		if (fill_fdlink(lfd, &p, &link))
			return -1;

		p.link = &link;
		if (link.name[1] == '/')
			return do_dump_gen_file(&p, lfd, &regfile_dump_ops, img);

		if (check_ns_proc(&link))
			return do_dump_gen_file(&p, lfd, &nsfile_dump_ops, img);

		return dump_unsupp_fd(&p, lfd, img, "reg", link.name + 1);
	}

	if (S_ISFIFO(p.stat.st_mode)) {
		if (p.fs_type == PIPEFS_MAGIC)
			ops = &pipe_dump_ops;
		else
			ops = &fifo_dump_ops;

		return do_dump_gen_file(&p, lfd, ops, img);//尝试文件id并对该文件进行备份
	}

	/*
	 * For debug purpose -- at least show the link
	 * file pointing to when reporting unsupported file.
	 * On error simply empty string here.
	 */
	if (fill_fdlink(lfd, &p, &link))
		memzero(&link, sizeof(link));

	return dump_unsupp_fd(&p, lfd, img, "unknown", link.name + 1);
}