"criu实现容器进程的恢复"

  "实现容器进程的恢复"

Posted by Xu on August 31, 2017

CRIU实现容器进程的恢复

static int restore_root_task(struct pstree_item *init)
{
	enum trace_flags flag = TRACE_ALL;
	int ret, fd, mnt_ns_fd = -1;
	int root_seized = 0;
	struct pstree_item *item;

	ret = run_scripts(ACT_PRE_RESTORE);
	if (ret != 0) {
		pr_err("Aborting restore due to pre-restore script ret code %d\n", ret);
		return -1;
	}

	fd = open("/proc", O_DIRECTORY | O_RDONLY);
	if (fd < 0) {
		pr_perror("Unable to open /proc");
		return -1;
	}

	ret = install_service_fd(CR_PROC_FD_OFF, fd);//安装配置目录文件描述符
	close(fd);
	if (ret < 0)
		return -1;

	/*
	 * FIXME -- currently we assume that all the tasks live
	 * in the same set of namespaces. This is done to debug
	 * the ns contents dumping/restoring. Need to revisit
	 * this later.
	 */

	if (vpid(init) == INIT_PID) {//是否该root_item的pid为容器初始化进程
		if (!(root_ns_mask & CLONE_NEWPID)) {
			pr_err("This process tree can only be restored "
				"in a new pid namespace.\n"
				"criu should be re-executed with the "
				"\"--namespace pid\" option.\n");
			return -1;
		}
	} else	if (root_ns_mask & CLONE_NEWPID) {
		pr_err("Can't restore pid namespace without the process init\n");
		return -1;
	}

	if (prepare_userns_hook())//将/proc/pid下的loginuid读取到saved_loginuid保存,然后对该criu自身进程的loginuid文件写入无效的uid?
		return -1;

	if (prepare_namespace_before_tasks())//准备好命名空间,见下详解
		return -1;

	__restore_switch_stage_nw(CR_STATE_ROOT_TASK);

	ret = fork_with_pid(init);
	if (ret < 0)
		goto out;

	restore_origin_ns_hook();

	if (rsti(init)->clone_flags & CLONE_PARENT) {
		struct sigaction act;

		root_seized = 1;
		/*
		 * Root task will be our sibling. This means, that
		 * we will not notice when (if) it dies in SIGCHLD
		 * handler, but we should. To do this -- attach to
		 * the guy with ptrace (below) and (!) make the kernel
		 * deliver us the signal when it will get stopped.
		 * It will in case of e.g. segfault before handling
		 * the signal.
		 */
		sigaction(SIGCHLD, NULL, &act);
		act.sa_flags &= ~SA_NOCLDSTOP;
		sigaction(SIGCHLD, &act, NULL);

		if (ptrace(PTRACE_SEIZE, init->pid->real, 0, 0)) {
			pr_perror("Can't attach to init");
			goto out_kill;
		}
	}

	if (!root_ns_mask)
		goto skip_ns_bouncing;

	/*
	 * uid_map and gid_map must be filled from a parent user namespace.
	 * prepare_userns_creds() must be called after filling mappings.
	 */
	if ((root_ns_mask & CLONE_NEWUSER) && prepare_userns(init))
		goto out_kill;

	pr_info("Wait until namespaces are created\n");
	ret = restore_wait_inprogress_tasks();
	if (ret)
		goto out_kill;

	ret = run_scripts(ACT_SETUP_NS);
	if (ret)
		goto out_kill;

	ret = restore_switch_stage(CR_STATE_PREPARE_NAMESPACES);
	if (ret)
		goto out_kill;

	if (root_ns_mask & CLONE_NEWNS) {
		mnt_ns_fd = open_proc(init->pid->real, "ns/mnt");
		if (mnt_ns_fd < 0)
			goto out_kill;
	}

	if (opts.empty_ns & CLONE_NEWNET) {
		/*
		 * Local TCP connections were locked by network_lock_internal()
		 * on dump and normally should have been C/R-ed by respectively
		 * dump_iptables() and restore_iptables() in net.c. However in
		 * the '--empty-ns net' mode no iptables C/R is done and we
		 * need to return these rules by hands.
		 */
		ret = network_lock_internal();
		if (ret)
			goto out_kill;
	}

	ret = run_scripts(ACT_POST_SETUP_NS);
	if (ret)
		goto out_kill;

	__restore_switch_stage(CR_STATE_FORKING);

skip_ns_bouncing:

	ret = restore_wait_inprogress_tasks();
	if (ret < 0)
		goto out_kill;

	/*
	 * Zombies die after CR_STATE_RESTORE which is switched
	 * by root task, not by us. See comment before CR_STATE_FORKING
	 * in the header for details.
	 */
	for_each_pstree_item(item) {
		if (item->pid->state == TASK_DEAD)
			task_entries->nr_threads--;
	}

	ret = restore_switch_stage(CR_STATE_RESTORE_SIGCHLD);
	if (ret < 0)
		goto out_kill;

	ret = stop_usernsd();
	if (ret < 0)
		goto out_kill;

	ret = move_veth_to_bridge();
	if (ret < 0)
		goto out_kill;

	ret = prepare_cgroup_properties();
	if (ret < 0)
		goto out_kill;

	if (fault_injected(FI_POST_RESTORE))
		goto out_kill;

	ret = run_scripts(ACT_POST_RESTORE);
	if (ret != 0) {
		pr_err("Aborting restore due to post-restore script ret code %d\n", ret);
		timing_stop(TIME_RESTORE);
		write_stats(RESTORE_STATS);
		goto out_kill;
	}

	/*
	 * There is no need to call try_clean_remaps() after this point,
	 * as restore went OK and all ghosts were removed by the openers.
	 */
	if (depopulate_roots_yard(mnt_ns_fd, false))
		goto out_kill;

	close_safe(&mnt_ns_fd);

	if (write_restored_pid())
		goto out_kill;

	/* Unlock network before disabling repair mode on sockets */
	network_unlock();

	/*
	 * Stop getting sigchld, after we resume the tasks they
	 * may start to exit poking criu in vain.
	 */
	ignore_kids();

	/*
	 * -------------------------------------------------------------
	 * Below this line nothing should fail, because network is unlocked
	 */
	attach_to_tasks(root_seized);

	ret = restore_switch_stage(CR_STATE_RESTORE_CREDS);
	BUG_ON(ret);

	timing_stop(TIME_RESTORE);

	ret = catch_tasks(root_seized, &flag);

	pr_info("Restore finished successfully. Resuming tasks.\n");
	__restore_switch_stage(CR_STATE_COMPLETE);

	if (ret == 0)
		ret = compel_stop_on_syscall(task_entries->nr_threads,
			__NR(rt_sigreturn, 0), __NR(rt_sigreturn, 1), flag);

	if (clear_breakpoints())
		pr_err("Unable to flush breakpoints\n");

	if (ret == 0)
		finalize_restore();

	ret = run_scripts(ACT_PRE_RESUME);
	if (ret)
		pr_err("Pre-resume script ret code %d\n", ret);

	if (restore_freezer_state())
		pr_err("Unable to restore freezer state\n");

	fini_cgroup();

	/* Detaches from processes and they continue run through sigreturn. */
	finalize_restore_detach(ret);

	write_stats(RESTORE_STATS);

	ret = run_scripts(ACT_POST_RESUME);
	if (ret != 0)
		pr_err("Post-resume script ret code %d\n", ret);

	if (!opts.restore_detach && !opts.exec_cmd)
		wait(NULL);

	return 0;

out_kill:
	/*
	 * The processes can be killed only when all of them have been created,
	 * otherwise an external proccesses can be killed.
	 */
	if (root_ns_mask & CLONE_NEWPID) {
		int status;

		/* Kill init */
		if (root_item->pid->real > 0)
			kill(root_item->pid->real, SIGKILL);

		if (waitpid(root_item->pid->real, &status, 0) < 0)
			pr_warn("Unable to wait %d: %s",
				root_item->pid->real, strerror(errno));
	} else {
		struct pstree_item *pi;

		for_each_pstree_item(pi)
			if (vpid(pi) > 0)
				kill(vpid(pi), SIGKILL);
	}

out:
	fini_cgroup();
	depopulate_roots_yard(mnt_ns_fd, true);
	stop_usernsd();
	__restore_switch_stage(CR_STATE_FAIL);
	pr_err("Restoring FAILED.\n");
	return -1;
}

prepareNS

准备命名空间

int prepare_namespace_before_tasks(void)
{
	if (start_usernsd())//新建一个域套接字对sk,设置两个套接字入口都允许SCM_CREDENTIALS控制消息的接收,允许接收进程凭证,新建一个子进程返回pid到usernsd_pid,子进程关闭sk[0],然后循环从sk[1]端接收消息到缓冲区msghdr,父进程则安装sk[0]到USERNSD_SK中去供此后的消息传输。见详解!
		goto err_unds;

	if (netns_keep_nsfd())//打开/proc/ns/net文件后将文件描述符安装到NS_FD_OFF
		goto err_netns;

	if (mntns_maybe_create_roots())//创建挂载根目录/tmp/.criu.mntns.XXXXX
		goto err_mnt;

	if (read_mnt_ns_img())//根据镜像文件ns_ids遍历mnt_ns_desc,调用collect_mnt_from_image收集挂载信息到全局变量mntinfo中去
		goto err_img;

	return 0;

err_img:
	cleanup_mnt_ns();
err_mnt:
	/*
	 * Nothing, netns' descriptor will be closed
	 * on criu exit
	 */
err_netns:
	stop_usernsd();
err_unds:
	return -1;
}

开启userns的守护进程来监听数据请求,这里体现socketpair父子进程通信的方法

static int start_usernsd(void)
{
	int sk[2];//公用数据段
	int one = 1;

	if (!(root_ns_mask & CLONE_NEWUSER))
		return 0;

	/*
	 * Seqpacket to
	 *
	 * a) Help daemon distinguish individual requests from
	 *    each other easily. Stream socket require manual
	 *    messages boundaries.
	 *
	 * b) Make callers note the damon death by seeing the
	 *    disconnected socket. In case of dgram socket
	 *    callers would just get stuck in receiving the
	 *    response.
	 */

	if (socketpair(PF_UNIX, SOCK_SEQPACKET, 0, sk)) {
	//创建一个socketpair对用于Unix域内通信
		pr_perror("Can't make usernsd socket");
		return -1;
	}

	if (setsockopt(sk[0], SOL_SOCKET, SO_PASSCRED, &one, sizeof(one)) < 0) {//将该socket对中的sk[0]设置为可由SCM_CREDENTIALS控制消息的接收和发送,允许可以发送进程ucred凭证
		pr_perror("failed to setsockopt");
		return -1;
	}

	if (setsockopt(sk[1], SOL_SOCKET, SO_PASSCRED, &one, sizeof(1)) < 0) {//将该socket对中的sk[1]设置为可由SCM_CREDENTIALS控制消息的接收和发送进程ucred凭证

		pr_perror("failed to setsockopt");
		return -1;
	}

	usernsd_pid = fork();//创建子进程,子进程会复制上方代码中的数据端部分到自己的进程的内存空间,然后执行下方部分代码
	if (usernsd_pid < 0) {
		pr_perror("Can't fork usernsd");
		close(sk[0]);
		close(sk[1]);
		return -1;
	}

	if (usernsd_pid == 0) {//子进程进入该代码段
		int ret;

		close(sk[0]);//关闭socket对中的sk[0]这一端在子进程中的引用
		ret = usernsd(sk[1]);//开启userns daemon,见详解
		exit(ret);
	}

	close(sk[1]);//父进程会执行这部分代码(子进程跳出循环后也会执行这部分代码),关闭父进程中对sk[1]这一端的引用。
	if (install_service_fd(USERNSD_SK, sk[0]) < 0) {//将sk[0]这一端安装到USERNSD_SK供以后给子进程发送或接收消息
		kill(usernsd_pid, SIGKILL);
		waitpid(usernsd_pid, NULL, 0);
		close(sk[0]);
		return -1;
	}

	close(sk[0]);//关闭父进程中的sk[0]的引用
	return 0;
}


//开启用户空间守护进程,接收sk传过来的消息,到unsc_msg->h(msghdr)存储,然后执行消息中所要求的命令call,最后同样unsc_msg->h(msghdr)方式发出响应。
static int usernsd(int sk)
{
	pr_info("uns: Daemon started\n");

	while (1) {
	     /*
		*struct unsc_msg {
	*struct msghdr h;
	/*
	 * 0th is the call address,调用的命令地址
	 * 1st is the flags 命令的参数
	 * 2nd is the optional (NULL in response) arguments,命令的可选参数
	 */
	*struct iovec iov[3];
	*char c[CMSG_SPACE(sizeof(struct ucred))+CMSG_SPACE(sizeof(int))];
*};
		
		*/
		struct unsc_msg um;//userns控制信息结构体
		
		
		static char msg[MAX_UNSFD_MSG_SIZE];//消息数组
		uns_call_t call;//定义一个返回int类型的函数call
		int flags, fd, ret;
		pid_t pid;

		unsc_msg_init(&um, &call, &flags, msg, sizeof(msg), 0);//userns控制消息结构体初始化,将um.h缓冲区对象定位到um.iov[],接收到的消息会存放到um.iov[0](命令),um.iov[1](参数)中,如果msg有参数,则赋值到um.iov[2]中,同时um.h.msg_control附属数据cmsghdr也定位到um.c中保存。存放该进程的凭证信息到um->h的第一个cmsghdr中,fd(最后一个参数0)也会存放到um->msghdr中的第二个cmsghdr中保存。
		//所以sk[1]接收的消息,命令放在um.iov[0],参数放在um.iov[1],可选参数放在um.iov[3],附属数据存放在um.c中
		if (recvmsg(sk, &um.h, 0) <= 0) {//接受数据存放在um.h缓冲区
			pr_perror("uns: recv req error");
			return -1;
		}

		unsc_msg_pid_fd(&um, &pid, &fd);//根据接收到的消息um中的第一个cmsghdr中凭证配置pid,以及第二个cmsghdr中的数据fd配置fd
		pr_debug("uns: daemon calls %p (%d, %d, %x)\n", call, pid, fd, flags);

		BUG_ON(fd < 0 && flags & UNS_FDOUT);

		/*
		 * Caller has sent us bare address of the routine it
		 * wants to call. Since the caller is fork()-ed from the
		 * same process as the daemon is, the latter has exactly
		 * the same code at exactly the same address as the
		 * former guy has. So go ahead and just call one!
		 */

		ret = call(msg, fd, pid);//执行传递过来的消息命令

		if (fd >= 0)
			close(fd);

		if (flags & UNS_ASYNC) {
			/*
			 * Async call failed and the called doesn't know
			 * about it. Exit now and let the stop_usernsd()
			 * check the exit code and abort the restoration.
			 *
			 * We'd get there either by the end of restore or
			 * from the next userns_call() due to failed
			 * sendmsg() in there.
			 */
			if (ret < 0) {
				pr_err("uns: Async call failed. Exiting\n");
				return -1;
			}

			continue;
		}

		if (flags & UNS_FDOUT)
			fd = ret;
		else
			fd = -1;

		unsc_msg_init(&um, &call, &ret, NULL, 0, fd);//同上,
		if (sendmsg(sk, &um.h, 0) <= 0) {//发送响应数据
			pr_perror("uns: send resp error");
			return -1;
		}

		if (fd >= 0)
			close(fd);
	}
}