"criu dump重要步骤的深入研究"

  "criu checkpoint四"

Posted by Xu on June 7, 2017

CRIU,DUMP重要步骤解析

根据前一章节,对dump过程进行整体性的了解,我们可以看到dump过程十分复杂繁琐,涉及到对随进程分布的各方面信息收集和备份,如果我们对每一个步骤进行详细的了解,会严重拖慢研究进度,并且大部分研究并不是解决Docker容器热迁移的问题所在,所以我们先研究部分在备份过程中十分重要的步骤,包括:

(1) 信息收集过程中:

  1. 进程树的收集
  2. 进程树id的收集
  3. 对网络的锁操作
  4. 命名空间信息的收集

(2) 信息备份的过程中有

  1. 对每个树节点的备份
  2. mnt_namespace命名空间的备份
  3. 文件锁信息备份
  4. 根据根节点备份进程树
  5. 备份CGroup信息
  6. 备份共享内存

(3) 写入镜像工厂

这一章节将分析(1)-1.进程树信息的收集:

1.进程树信息的收集collect_pstree()

  1. 如果opts.freeze_cgroup需要冻结Cgroup,则对Cgroup该pid下的所有进程进行freeze_processes中断冻结操作
  2. 不需要则compel_interrupt_task只中断该pid所指定的进程,然后compel_wait_task等待该进程返回状态信息
  3. collect_task收集进程树下所有线程和子进程的信息并冻结
  4. 最后等待freeze_processes冻结的进程返回信号,进程树信息收集完毕
int collect_pstree(void)
{
	pid_t pid = root_item->pid->real;//获取根节点pid的值
	int ret = -1;
	struct proc_status_creds *creds;//配置文件状态信誉证书结构体,我也不知道这是干嘛的

	timing_start(TIME_FREEZING);//获得存放时间信息的结构体tm并设置好进程备份开始时间&tm->start

	/*
	 * wait4() may hang for some reason. Enable timer and fire SIGALRM
	 * if timeout reached. SIGALRM handler will do  the necessary
	 * cleanups and terminate current process.
	 */
	alarm(opts.timeout);//定义一个计时器,超过规定的时间间隔,向进程发送信号终止该进程

	if (opts.freeze_cgroup && freeze_processes())
		goto err;//如果需要冻结CGroup,则调用freeze_processes()来执行,实现对进程状态文件的改变及Cgroup下进程的中断,freeze_processes()分析在第二个模块。

	if (!opts.freeze_cgroup && compel_interrupt_task(pid)) {
	    //如果不需要冻结Cgroup,则强制冻结pid指定的进程即可
		set_cr_errno(ESRCH);
		goto err;
	}

	creds = xzalloc(sizeof(*creds));//分配信誉证书内存空间
	if (!creds)
		goto err;

	ret = compel_wait_task(pid, -1, parse_pid_status, NULL, &creds->s, NULL);//强制等待子进程返回信号,对子进程的状态进行解析后返回对应信号值:COMPEL_TASK_ZOMBIE,COMPEL_TASK_DEAD,COMPEL_TASK_STOPPED,COMPEL_TASK_ALIVE四种
	if (ret < 0)
		goto err;

	if (ret == TASK_ZOMBIE)
		ret = TASK_DEAD;
	else
		processes_to_wait--;

	pr_info("Seized task %d, state %d\n", pid, ret);
	root_item->pid->state = ret;
	dmpi(root_item)->pi_creds = creds;

	ret = collect_task(root_item);//收集任务进程树(包括子进程和线程)信息,对该进程下所有子进程及线程进行冻结操作
	if (ret < 0)
		goto err;

	if (opts.freeze_cgroup && freezer_wait_processes()) {
        //等待freeze_processes操作已冻结进程的恢复
		ret = -1;
		goto err;
	}

	ret = 0;
	timing_stop(TIME_FREEZING);//计算freezing时间
	timing_start(TIME_FROZEN);//纪录进入frozen状态的开始时间

err:
	/* Freezing stage finished in time - disable timer. */
	alarm(0);
	return ret;
}

2.freeze_processes()源码分析:先向状态文件内成功写入状态,再调用seize_cgroup_tree对Cgroup下所有pid进程树进行中断操作

static int freeze_processes(void)
{
	int fd, exit_code = -1;
	char path[PATH_MAX];//定义路径数组
	const char *state = thawed;//static const char thawed[]	= "THAWED";

	static const unsigned long step_ms = 100;
	unsigned long nr_attempts = (opts.timeout * 1000000) / step_ms;
	unsigned long i = 0;

	const struct timespec req = {
		.tv_nsec	= step_ms * 1000000,
		.tv_sec		= 0,
	};

	if (unlikely(!nr_attempts)) {
	//这个牛逼,ulikely是gcc的内建函数,通过对分支预测来合理装载下一条命令从而提高程序运行速度,此处程序会预先加载下一条pr_debug命令因为程序员预测(!nr_attempts)为真的概率较小,从而减少来指令的跳转。
		/*
		 * If timeout is turned off, lets
		 * wait for at least 10 seconds.
		 */
		nr_attempts = (10 * 1000000) / step_ms;
	}

	pr_debug("freezing processes: %lu attempst with %lu ms steps\n",
		 nr_attempts, step_ms);

	snprintf(path, sizeof(path), "%s/freezer.state", opts.freeze_cgroup);//把路径写到path变量中去
	fd = open(path, O_RDWR);//读写模式打开文件,获取文件描述符
	if (fd < 0) {
		pr_perror("Unable to open %s", path);
		return -1;
	}
	state = get_freezer_state(fd);//根据文件描述符获取冻结状态,就是读取read该文件中的内容,比对状态获取冻结状态:frozen(已冻结),freezing(冻结中),thawed(解冻状态)
	if (!state) {
		close(fd);
		return -1;
	}
	if (state == thawed) {//解冻状态
		freezer_thawed = true;

		lseek(fd, 0, SEEK_SET);//写文件,将冻结状态写到状态文件中去
		if (write(fd, frozen, sizeof(frozen)) != sizeof(frozen)) {
		//static const char frozen[]	= "FROZEN";,写状态到状态文件
			pr_perror("Unable to freeze tasks");
			close(fd);
			return -1;
		}

		/*
		 * Wait the freezer to complete before
		 * processing tasks. They might be exiting
		 * before freezing complete so we should
		 * not read @tasks pids while freezer in
		 * transition stage.
		 */
		for (; i <= nr_attempts; i++) {
			state = get_freezer_state(fd);
			if (!state) {
				close(fd);
				return -1;
			}

			if (state == frozen)
				break;
			if (alarm_timeouted())
				goto err;
			nanosleep(&req, NULL);
		}//这个循环就是为了等待,成功把状态写入到状态文件

		if (i > nr_attempts) {
			pr_err("Unable to freeze cgroup %s\n", opts.freeze_cgroup);//循环结束后,依然无法得到状态文件中的状态为frozen,说明写状态文件超时,无法冻结
			if (!pr_quelled(LOG_DEBUG))
				log_unfrozen_stacks(opts.freeze_cgroup);
			goto err;
		}

		pr_debug("freezing processes: %lu attempts done\n", i);
	}

	/*
	 * Pay attention on @i variable -- it's continuation.
	 */
	for (; i <= nr_attempts; i++) {
		exit_code = seize_cgroup_tree(opts.freeze_cgroup, state);//对Cgroup下的进程树进行迭代中断操作
		if (exit_code == -EAGAIN) {
			if (alarm_timeouted())
				goto err;
			nanosleep(&req, NULL);
		} else
			break;
	}

err:
	if (exit_code == 0 || freezer_thawed) {
		lseek(fd, 0, SEEK_SET);
		if (write(fd, thawed, sizeof(thawed)) != sizeof(thawed)) {
			pr_perror("Unable to thaw tasks");
			exit_code = -1;
		}
	}
	if (close(fd)) {
		pr_perror("Unable to thaw tasks");
		return -1;
	}

	return exit_code;
}

3.seize_cgroup_tree()源码分析:该函数就是用于迭代中断Cgroup下的进程树利用Ptrace(PTRACE_SEIZE, pid, NULL, 0)和ptrace(PTRACE_INTERRUPT, pid, NULL, NULL);从外部进程获取指定进程的控制权,并进行Interrupt操作

static int seize_cgroup_tree(char *root_path, const char *state)
{
	DIR *dir;// 声明一个目录对象
	struct dirent *de;//声明一个目录结构体,包含该目录下所有文件的相关信息指针
	char path[PATH_MAX];
	FILE *f;

	/*
	 * New tasks can appear while a freezer state isn't
	 * frozen, so we need to catch all new tasks.
	 */
	snprintf(path, sizeof(path), "%s/tasks", root_path);//路径填写入变量path
	f = fopen(path, "r");//打开path目录下的文件
	if (f == NULL) {
		pr_perror("Unable to open %s", path);
		return -1;
	}
	while (fgets(path, sizeof(path), f)) {//fget读取指定f文件的字符串到path路径下存储
		pid_t pid;
		int ret;

		pid = atoi(path);//将path转化为整数返回

		/* Here we are going to skip tasks which are already traced. */
		ret = ptrace(PTRACE_INTERRUPT, pid, NULL, NULL);//ptrace函数介绍:http://blog.csdn.net/txl199106/article/details/48001869  2. http://blog.csdn.net/qq1084283172/article/details/45967633   停止追踪并冻结中断
		if (ret == 0)
			continue;
		if (errno != ESRCH) {
			pr_perror("Unexpected error");
			fclose(f);
			return -1;
		}

		if (!compel_interrupt_task(pid)) {
            //compel_interrupt_task调用ptrace (PTRACE_SEIZE, pid, NULL, 0);开启对进程跟踪然后调用PTRACE_INTERRUPT来强制中断该pid进程
			pr_debug("SEIZE %d: success\n", pid);
			processes_to_wait++;
		} else if (state == frozen) {
		     //这里就是当中断进程失败时发现进程为frozen进程,即发现了僵尸进程
			char buf[] = "/proc/XXXXXXXXXX/exe";
			struct stat st;

			/* skip kernel threads */
			snprintf(buf, sizeof(buf), "/proc/%d/exe", pid);
			if (stat(buf, &st) == -1 && errno == ENOENT)
				continue;
			/*
			 * fails when meets a zombie, or eixting process:
			 * there is a small race in a kernel -- the process
			 * may start exiting and we are trying to freeze it
			 * before it compete exit procedure. The caller simply
			 * should wait a bit and try freezing again.
			 */
			pr_err("zombie found while seizing\n");//发现僵尸进程
			fclose(f);
			return -EAGAIN;
		}
	}
	fclose(f);

	dir = opendir(root_path);//打开根目录
	if (!dir) {
		pr_perror("Unable to open %s", root_path);
		return -1;
	}

	while ((de = readdir(dir))) {
	//循环读取目录下的文件内容,对每个cgroup下的进程迭代进行中断操作
		struct stat st;
		int ret;

		if (dir_dots(de))
			continue;

		sprintf(path, "%s/%s", root_path, de->d_name);

		if (fstatat(dirfd(dir), de->d_name, &st, 0) < 0) {
			pr_perror("stat of %s failed", path);
			closedir(dir);
			return -1;
		}

		if (!S_ISDIR(st.st_mode))
			continue;
		ret = seize_cgroup_tree(path, state);//对Cgroup下进程迭代中断,直到把Cgroup下的进程树全部中断
		if (ret < 0) {
			closedir(dir);
			return ret;
		}
	}
	closedir(dir);

	return 0;
}

4.compel_wait_task()源码分析:强制等待task从中断中恢复,根据子进程返回的状态信息返回信号值:COMPEL_TASK_ZOMBIE,COMPEL_TASK_DEAD,COMPEL_TASK_STOPPED,COMPEL_TASK_ALIVE

int compel_wait_task(int pid, int ppid,
		int (*get_status)(int pid, struct seize_task_status *, void *),
		void (*free_status)(int pid, struct seize_task_status *, void *),
		struct seize_task_status *ss, void *data)
{
	siginfo_t si;
	int status, nr_sigstop;
	int ret = 0, ret2, wait_errno = 0;

	/*
	 * It's ugly, but the ptrace API doesn't allow to distinguish
	 * attaching to zombie from other errors. Thus we have to parse
	 * the target's /proc/pid/stat. Sad, but parse whatever else
	 * we might need at that early point.
	 */

try_again:

	ret = wait4(pid, &status, __WALL, NULL);//wait4挂起当前进程,等待pid指定进程状态改变,并会返回指定使用资源的信息,status为返回的状态,_WALL为返回选项,即告知该如何返回,NULL本应该为rusage纪录死亡进程使用信息
	if (ret < 0) {
		/*
		 * wait4() can expectedly fail only in a first time
		 * if a task is zombie. If we are here from try_again,
		 * this means that we are tracing this task.
		 *
		 * So here we can be only once in this function.
		 */
		wait_errno = errno;
	}

	ret2 = get_status(pid, ss, data);//该函数是调用compel_wait_task传进来的参数:parse_pid_status从而解析得到pid相关的状态信息,成功返回0,失败返回-1,成功则把状态信息写入结构体ss。
	if (ret2)//ret2非零,即返回-1,状态解析失败
		goto err;

	if (ret < 0 || WIFEXITED(status) || WIFSIGNALED(status)) {
	     //通过解析返回的状态,WIFEXITED子进程正常退出返回非零,WIFSIGNALED子进程因为一个没有被捕捉的信号推出返回true
		if (ss->state != 'Z') {
			if (pid == getpid())
				pr_err("The criu itself is within dumped tree.\n");
			else
				pr_err("Unseizable non-zombie %d found, state %c, err %d/%d\n",
						pid, ss->state, ret, wait_errno);
			return -1;
		}

		if (ret < 0)
			return COMPEL_TASK_ZOMBIE;
		else
			return COMPEL_TASK_DEAD;
	}

	if ((ppid != -1) && (ss->ppid != ppid)) {
		pr_err("Task pid reused while suspending (%d: %d -> %d)\n",
				pid, ppid, ss->ppid);
		goto err;
	}

	if (!WIFSTOPPED(status)) {//WIFSTOPPED,子进程停止时返回true
		pr_err("SEIZE %d: task not stopped after seize\n", pid);
		goto err;
	}

	ret = ptrace(PTRACE_GETSIGINFO, pid, NULL, &si);//获取导致进程中断的信号信息,并存放到si结构体
	if (ret < 0) {
		pr_perror("SEIZE %d: can't read signfo", pid);
		goto err;
	}

	if (PTRACE_SI_EVENT(si.si_code) != PTRACE_EVENT_STOP) {
	//PTRACE_SI_EVENT和0xFFFF做与操作得到信号值
	
		/*
		 * Kernel notifies us about the task being seized received some
		 * event other than the STOP, i.e. -- a signal. Let the task
		 * handle one and repeat.
		 */

		if (ptrace(PTRACE_CONT, pid, NULL,
					(void *)(unsigned long)si.si_signo)) {
			//重启被跟踪的进程,如果si.si_signo非空,即为发送给该进程的信号数量
			pr_perror("Can't continue signal handling, aborting");
			goto err;
		}

		ret = 0;
		if (free_status)//本例中free_statsu为空
			free_status(pid, ss, data);
		goto try_again;
	}

	if (ss->seccomp_mode != SECCOMP_MODE_DISABLED && ptrace_suspend_seccomp(pid) < 0)
		goto err;

	nr_sigstop = 0;
	if (ss->sigpnd & (1 << (SIGSTOP - 1)))
		nr_sigstop++;
	if (ss->shdpnd & (1 << (SIGSTOP - 1)))
		nr_sigstop++;
	if (si.si_signo == SIGSTOP)
		nr_sigstop++;

	if (nr_sigstop) {
		if (skip_sigstop(pid, nr_sigstop))
			goto err_stop;

		return COMPEL_TASK_STOPPED;
	}

	if (si.si_signo == SIGTRAP)
		return COMPEL_TASK_ALIVE;
	else {
		pr_err("SEIZE %d: unsupported stop signal %d\n", pid, si.si_signo);
		goto err;
	}

err_stop:
	kill(pid, SIGSTOP);
err:
	if (ptrace(PTRACE_DETACH, pid, NULL, NULL))
		pr_perror("Unable to detach from %d", pid);
	return -1;
}

5.collect_task(root_item):根据一个根节点,收集该父进程下所有子进程和线程的信息,并对它们实现中断冻结操作

static int collect_task(struct pstree_item *item)
{
	int ret;

	ret = collect_loop(item, collect_threads);//对进程中的线程逐个冻结中断
	if (ret < 0)
		goto err_close;

	/* Depth-first search (DFS) is used for traversing a process tree. */
	ret = collect_loop(item, collect_children);//同上,对进程的子进程进行逐个冻结操作,collect_children与collect_threads只是收集的目录不同,collect_children收集的是子进程信息
	if (ret < 0)
		goto err_close;

	if ((item->pid->state == TASK_DEAD) && !list_empty(&item->children)) {
		pr_err("Zombie with children?! O_o Run, run, run!\n");
		goto err_close;
	}

	if (pstree_alloc_cores(item))//分配所有线程存储内存单元
		goto err_close;

	pr_info("Collected %d in %d state\n", item->pid->real, item->pid->state);
	return 0;

err_close:
	close_pid_proc();
	return -1;
}

6.collect_threads()收集线程信息:先根据/pid/task目录收集线程信息threads,根据线程信息逐个冻结

static int collect_threads(struct pstree_item *item)
{
	struct pid *threads = NULL;
	int nr_threads = 0, i = 0, ret, nr_inprogress, nr_stopped = 0;

	ret = parse_threads(item->pid->real, &threads, &nr_threads);//解析线程,循环读取pid/task下所有目录信息,并存放到 &threads结构体中
	if (ret < 0)
		goto err;

	if ((item->pid->state == TASK_DEAD) && (nr_threads > 1)) {
	     //nr_threads>0说明该进程拥有线程,当该进程状态为TASK_DEAD状态时是不支持的情况
		pr_err("Zombies with threads are not supported\n");
		goto err;
	}

	/* The number of threads can't be less than already frozen */
	item->threads = xrealloc(item->threads, nr_threads * sizeof(struct pid));//根据线程的数量nr_threads分配内存空间
	if (item->threads == NULL)
		return -1;

	if (item->nr_threads == 0) {//这里nr_threads为0,说明没有线程
		item->threads[0].real = item->pid->real;
		item->nr_threads = 1;
		item->threads[0].item = NULL;
	}

	nr_inprogress = 0;
	for (i = 0; i < nr_threads; i++) {
		pid_t pid = threads[i].real;
		struct proc_status_creds t_creds = {};

		if (thread_collected(item, pid))//根据解析线程的过程得到的线程信息结构体threads来确认每一个线程确实在该进程之内
			continue;

		nr_inprogress++;//表示已确认的在进程中线程数量

		pr_info("\tSeizing %d's %d thread\n",
				item->pid->real, pid);

		if (!opts.freeze_cgroup && compel_interrupt_task(pid))
			continue;//如果选项中需要冻结Cgroup则强制中断该线程

		ret = compel_wait_task(pid, item_ppid(item), parse_pid_status, NULL, &t_creds.s, NULL);//不需要冻结Cgroup则强制等待子进程返回信号
		if (ret < 0) {
			/*
			 * Here is a race window between parse_threads() and seize(),
			 * so the task could die for these time.
			 * Don't worry, will try again on the next attempt. The number
			 * of attempts is restricted, so it will exit if something
			 * really wrong.
			 */
			continue;
		}

		if (ret == TASK_ZOMBIE)
			ret = TASK_DEAD;
		else
			processes_to_wait--;

		BUG_ON(item->nr_threads + 1 > nr_threads);
		item->threads[item->nr_threads].real = pid;
		item->threads[item->nr_threads].item = NULL;
		item->nr_threads++;

		if (ret == TASK_DEAD) {
			pr_err("Zombie thread not supported\n");
			goto err;
		}

		if (!creds_dumpable(dmpi(item)->pi_creds, &t_creds))
			goto err;

		if (ret == TASK_STOPPED) {
			nr_stopped++;
		}
	}

	if (nr_stopped && nr_stopped != nr_inprogress) {
		pr_err("Individually stopped threads not supported\n");
		goto err;
	}

	xfree(threads);
	return nr_inprogress;

err:
	xfree(threads);
	return -1;
}

读了这么久的源码,本来以为只是一个简简单单的进程树的收集,但函数之间调来调去也是非常复杂的,延伸开来终于把整个进程树的收集过程整理清楚,着实很有成就感,下一章节将就(1)-2 进程树ids信息的收集展开研究