"criu dump重要步骤的深入研究八"

  "criu checkpoint十一"

Posted by Xu on June 28, 2017

CRIU,DUMP备份步骤解析

通过上一小节的努力,我们了解进程备份过程中最为复杂也是最核心的部分:内存的备份,在dump_one_task这个函数中,内存的备份应该是最耗时的部分,所以接下来的步骤分析会相对简单一点,今天我们研究的步骤一共有4个步骤如下:

ret = parasite_dump_sigacts_seized(parasite_ctl, item);
	if (ret) {
		pr_err("Can't dump sigactions (pid: %d) with parasite\n", pid);
		goto err_cure;
	}

	ret = parasite_dump_itimers_seized(parasite_ctl, item);
	if (ret) {
		pr_err("Can't dump itimers (pid: %d)\n", pid);
		goto err_cure;
	}

	ret = parasite_dump_posix_timers_seized(&proc_args, parasite_ctl, item);
	if (ret) {
		pr_err("Can't dump posix timers (pid: %d)\n", pid);
		goto err_cure;
	}

	ret = dump_task_core_all(parasite_ctl, item, &pps_buf, cr_imgset);
	if (ret) {
		pr_err("Dump core (pid: %d) failed with %d\n", pid, ret);
		goto err_cure;
	}
	}

timer&sigact

1.parasite_dump_sigacts_seized:备份信号动作

int parasite_dump_sigacts_seized(struct parasite_ctl *ctl, struct pstree_item *item)
{
	TaskCoreEntry *tc = item->core[0]->tc;
	struct parasite_dump_sa_args *args;//定义信号动作备份参数
	int ret, sig;
	SaEntry *sa, **psa;

	args = compel_parasite_args(ctl, struct parasite_dump_sa_args);//返回ctl->addr_args

	ret = compel_rpc_call_sync(PARASITE_CMD_DUMP_SIGACTS, ctl);//发送命令信息,调用dump_sigact,备份除了SIGSTOP和SIGKILL的信号动作,dump_sigact见详解
	if (ret < 0)
		return ret;

	psa = xmalloc((SIGMAX - 2) * (sizeof(SaEntry *) + sizeof(SaEntry)));//为所有除了SIGSTOP和SIGKILL的信号分配内存空间,psa为这段内存空间的起始地址
	if (!psa)
		return -1;

	sa = (SaEntry *)(psa + SIGMAX - 2);//sa为这段内存空间的结束地址

	tc->n_sigactions = SIGMAX - 2;//备份的信号动作数量
	tc->sigactions = psa;//备份的信号动作存储起始地址
//以下就是从sa开始存放每一个信号的相关信息到对应结构体SaEntry中去,然后复制到psa中去
	for (sig = 1; sig <= SIGMAX; sig++) {
		int i = sig - 1;

		if (sig == SIGSTOP || sig == SIGKILL)
			continue;

		sa_entry__init(sa);
		ASSIGN_TYPED(sa->sigaction, encode_pointer(args->sas[i].rt_sa_handler));
		ASSIGN_TYPED(sa->flags, args->sas[i].rt_sa_flags);
		ASSIGN_TYPED(sa->restorer, encode_pointer(args->sas[i].rt_sa_restorer));//ASSIGN_TYPED就是赋值操作
		BUILD_BUG_ON(sizeof(sa->mask) != sizeof(args->sas[0].rt_sa_mask.sig));
		memcpy(&sa->mask, args->sas[i].rt_sa_mask.sig, sizeof(sa->mask));
		sa->has_compat_sigaction = true;
		sa->compat_sigaction = !compel_mode_native(ctl);
         
		*(psa++) = sa++;//将sa中的内容复制到psa中去,why?直接复制不就行了吗
	}

	return 0;
}
//备份信号动作
static int dump_sigact(struct parasite_dump_sa_args *da)
{
	int sig, ret = 0;

	for (sig = 1; sig <= SIGMAX; sig++) {
		int i = sig - 1;

		if (sig == SIGKILL || sig == SIGSTOP)
			continue;

		ret = sys_sigaction(sig, NULL, &da->sas[i], sizeof(k_rtsigset_t));//sys_sigaction是检测和更改信号处理方式的函数,sig为待处理信号的编号,NULL为act,当act非空时则该信号的处理方式由act指定,&da->sas[i]为oldact,oldact非空时,则该信号的前一个处理方式保存到oldact中即&da->sas[i]中
		if (ret < 0) {
			pr_err("sys_sigaction failed (%d)\n", ret);
			break;
		}
	}

	return ret;
}

2.parasite_dump_itimers_seized:备份三个计时器的值到item->core->tc->timers

int parasite_dump_itimers_seized(struct parasite_ctl *ctl, struct pstree_item *item)
{
	CoreEntry *core = item->core[0];
	struct parasite_dump_itimers_args *args;
	int ret;

	args = compel_parasite_args(ctl, struct parasite_dump_itimers_args);

	ret = compel_rpc_call_sync(PARASITE_CMD_DUMP_ITIMERS, ctl);//发送rpc命令信息,调用dump_itimers,见下方详解
	if (ret < 0)
		return ret;

	encode_itimer((&args->real), (core->tc->timers->real));			\
	encode_itimer((&args->virt), (core->tc->timers->virt));			\
	encode_itimer((&args->prof), (core->tc->timers->prof));			\
//encode_itimer复制操作,将args->prof中的内容复制到core->tc->timers->prof中去
	return 0;
}
//存放三个计数器的值到args中
static int dump_itimers(struct parasite_dump_itimers_args *args)
{
	int ret;
    
    /*每一个进程有三个计时器
     *ITIMER_REAL,ITIMER_VIRTUAL,ITIMER_PROF
     *ITIMER_REAL   在real   time中计数器减1,然后等计数往比后发送SIGALRM信号。  
     *ITIMER_VIRTUAL   当进程在执行的过程中计数,然后当计数完毕后发送SIGVTALRM信号给该进程
     *ITIMER_PROF   在该进程被执行和系统在代表该进程执行的时间都进行计数,然后当计数完毕后发送SIGPROF信号给该进程
     */
	ret = sys_getitimer(ITIMER_REAL, &args->real);
	if (!ret)
		ret = sys_getitimer(ITIMER_VIRTUAL, &args->virt);//getitimer就是将该计数器的值存放到&args->virt指定对象中
	if (!ret)
		ret = sys_getitimer(ITIMER_PROF, &args->prof);

	if (ret)
		pr_err("getitimer failed (%d)\n", ret);

	return ret;
}

3.parasite_dump_posix_timers_seized():获取计时器配置文件中信息及实时计时器的值和超限次数等信息到item->core->tc->timers中去

int parasite_dump_posix_timers_seized(struct proc_posix_timers_stat *proc_args,
		struct parasite_ctl *ctl, struct pstree_item *item)
{
//传进来的proc_args为前面解析posix_timer的步骤中通过读取posix_timer文件得到的配置参数
	CoreEntry *core = item->core[0];
	TaskTimersEntry *tte = core->tc->timers;
	PosixTimerEntry *pte;
	struct proc_posix_timer *temp;
	struct parasite_dump_posix_timers_args *args;
	int args_size;
	int ret = 0;
	int i;

	if (core_alloc_posix_timers(tte, proc_args->timer_n, &pte))
	//在tte结构体中,分配内存存放(proc_args->timer_n+1)个PosixTimerEntry对象
		return -1;

	args_size = posix_timers_dump_size(proc_args->timer_n);//返回备份proc_args->timer_n个posix_timers所需要的内存大小
	args = compel_parasite_args_s(ctl, args_size);
	args->timer_n = proc_args->timer_n;//将proc_args中计时器的个数复制到args中去

	i = 0;
	list_for_each_entry(temp, &proc_args->timers, list) {
		args->timer[i].it_id = temp->spt.it_id;
		i++;
	}//将proc_args中的所有计时器对象id复制到args中去

	ret = compel_rpc_call_sync(PARASITE_CMD_DUMP_POSIX_TIMERS, ctl);//调用dump_posix_timers,对posix计时器进行备份
	if (ret < 0)
		goto end_posix;

	i = 0;
	list_for_each_entry(temp, &proc_args->timers, list) {
		posix_timer_entry__init(&pte[i]);
		encode_posix_timer(&args->timer[i], temp, &pte[i]);//将proc_args->timers和args->timer[i]的值全复制到pte对象中去
		tte->posix[i] = &pte[i];//将填充好的pte复制到tte中的去
		i++;
	}

end_posix:
	free_posix_timers(proc_args);
	return ret;
}

//将posix定时器的值和超限次数存放到args中
static int dump_posix_timers(struct parasite_dump_posix_timers_args *args)
{
	int i;
	int ret = 0;

	for(i = 0; i < args->timer_n; i++) {
		ret = sys_timer_gettime(args->timer[i].it_id, &args->timer[i].val);//获取定时器的值存放到&args->timer[i].val
		if (ret < 0) {
			pr_err("sys_timer_gettime failed (%d)\n", ret);
			return ret;
		}
		args->timer[i].overrun = sys_timer_getoverrun(args->timer[i].it_id);//返回了指定id的定时器超限的次数,比如一个1ms的定时器运行了10ms,则返回9次,并存放到args->timer[i].overrun中去
		ret = args->timer[i].overrun;
		if (ret < 0) {
			pr_err("sys_timer_getoverrun failed (%d)\n", ret);
			return ret;
		}
	}

	return ret;
}

core

4.dump_task_core_all 备份线程所有相关的内核信息:可执行域信息,内核信息(安全传输层协议,用户互斥锁,调度策略,调度参数,优先级信号栈等等),资源限制(虚拟内存,cpu,文件大小,锁,内存锁等等),线程cgroup信息

static int dump_task_core_all(struct parasite_ctl *ctl,
			      struct pstree_item *item,
			      const struct proc_pid_stat *stat,
			      const struct cr_imgset *cr_imgset)
{
	struct cr_img *img;
	CoreEntry *core = item->core[0];
	pid_t pid = item->pid->real;
	int ret = -1;
	struct proc_status_creds *creds;
	struct parasite_dump_cgroup_args cgroup_args, *info = NULL;//定义Cgroup备份参数

	BUILD_BUG_ON(sizeof(cgroup_args) < PARASITE_ARG_SIZE_MIN);

	pr_info("\n");
	pr_info("Dumping core (pid: %d)\n", pid);
	pr_info("----------------------------------------\n");

	ret = get_task_personality(pid, &core->tc->personality);//读取/proc/pid/personality内可执行域信息,到&core->tc->personality,
	if (ret < 0)
		goto err;

	creds = dmpi(item)->pi_creds;获取该树节点的信誉证书
	if (creds->s.seccomp_mode != SECCOMP_MODE_DISABLED) {
		pr_info("got seccomp mode %d for %d\n", creds->s.seccomp_mode, vpid(item));
		core->tc->has_seccomp_mode = true;
		core->tc->seccomp_mode = creds->s.seccomp_mode;

		if (creds->s.seccomp_mode == SECCOMP_MODE_FILTER) {
			core->tc->has_seccomp_filter = true;
			core->tc->seccomp_filter = creds->last_filter;
		}
	}

	strlcpy((char *)core->tc->comm, stat->comm, TASK_COMM_LEN);
	core->tc->flags = stat->flags;
	core->tc->task_state = item->pid->state;
	core->tc->exit_code = 0;
    
	ret = parasite_dump_thread_leader_seized(ctl, pid, core);//根据ctl->args等获取pid相关的内核信息(安全传输层协议,用户互斥锁,调度策略,调度参数,优先级信号栈等等)存放到core对象中去
	if (ret)
		goto err;

	ret = dump_pid_misc(pid, core->tc);//存放/proc/pid/loginuid文件及/proc/pid/oom_score_adj文件信息到core->tc中去
	if (ret)
		goto err;

	ret = dump_task_rlimits(pid, core->tc->rlimits);//获取所有资源限制(虚拟内存,cpu,文件大小,锁,内存锁等等)到core->tc->rlimits对象中去(包括soft limit cur和hard limit max)
	if (ret)
		goto err;

	/* For now, we only need to dump the root task's cgroup ns, because we
	 * know all the tasks are in the same cgroup namespace because we don't
	 * allow nesting.我们只需要备份根任务的cgroup namespace,所有的任务的cgroup namespace信息都一样,因为我们不允许嵌套的namespaces
	 */
	if (item->ids->has_cgroup_ns_id && !item->parent) {
		info = &cgroup_args;
		ret = parasite_dump_cgroup(ctl, &cgroup_args);//首先通过发送rpc命令信息,然后通过打开/proc/self/cgroup的到文件描述符,再根据该描述符备份该文件的内容到cgroup_args->contents中去即可
		if (ret)
			goto err;
	}

	core->tc->has_cg_set = true;
	ret = dump_task_cgroup(item, &core->tc->cg_set, info);//备份线程cgroup信息,见详解,根据item_pid信息备份外部cgroup信息到全局变量cg_sets,&core->tc->cg_set设置为备份过程中得到的cgroup_set_id
	if (ret)
		goto err;

	img = img_from_set(cr_imgset, CR_FD_CORE);//获取镜像格式
	ret = pb_write_one(img, core, PB_CORE);//将core写入镜像
	if (ret < 0)
		goto err;

err:
	pr_info("----------------------------------------\n");

	return ret;
}
//将信号栈信息,父进程死亡信号值及信誉证书相关信息存放到ctl中的parasite_dump_thread args中去,进一步存放到core->thread_core->creds
//将parasite_dump_thread args中的内的信息存放到core->thread_core对象
//将备份得到的信息都保存在CoreEntry core对象中
int parasite_dump_thread_leader_seized(struct parasite_ctl *ctl, int pid, CoreEntry *core)
{
	ThreadCoreEntry *tc = core->thread_core;
	struct parasite_dump_thread *args;
	struct parasite_dump_creds *pc;
	int ret;

	args = compel_parasite_args(ctl, struct parasite_dump_thread);

	pc = args->creds;
	pc->cap_last_cap = kdat.last_cap;

	ret = compel_rpc_call_sync(PARASITE_CMD_DUMP_THREAD, ctl);//调用dump_thread,备份了信号栈信息,父进程死亡信号值,信誉证书相关信息到pc
	if (ret < 0)
		return ret;

	ret = alloc_groups_copy_creds(tc->creds, pc);pc中备份的信息复制到tc->creds中去
	if (ret) {
		pr_err("Can't copy creds for thread leader %d\n", pid);
		return -1;
	}

	return dump_thread_core(pid, core, args);//收集lsm_profile,(robust_list)健壮用户互斥锁链表,调度策略,调度参数,调度优先级(nice值),及args中的安全传输层协议和信号动作信息全部存放到core->thread_core对象中去
}


static int dump_thread(struct parasite_dump_thread *args)
{
	args->tid = sys_gettid();
	return dump_thread_common(args);
}

//备份了信号栈信息,父进程死亡信号值,信誉证书相关信息到ti
static int dump_thread_common(struct parasite_dump_thread *ti)
{
	int ret;

	arch_get_tls(&ti->tls);
	ret = sys_prctl(PR_GET_TID_ADDRESS, (unsigned long) &ti->tid_addr, 0, 0, 0);//获取tid地址到&ti->tid_addr
	if (ret)
		goto out;

	ret = sys_sigaltstack(NULL, &ti->sas);//sys_sigaltstack用于设置或获取信号栈的信息,第一个参数为空,说明为获取信号栈的信息到&ti->sas中去
	if (ret)
		goto out;

	ret = sys_prctl(PR_GET_PDEATHSIG, (unsigned long)&ti->pdeath_sig, 0, 0, 0);//返回父进程死亡信号的值到&ti->pdeath_sig中去
	if (ret)
		goto out;

	ret = dump_creds(ti->creds);//备份信誉证书信息:包括权限信息,安全bit,uid及gid信息,并设置文件系统的uid gid为-1L
out:
	return ret;
}

//备份信誉证书信息:包括权限信息,安全bit,uid及gid信息,并设置文件系统的uid gid为-1L
static int dump_creds(struct parasite_dump_creds *args)
{
	int ret, i, j;
	struct cap_data data[_LINUX_CAPABILITY_U32S_3];//定义权限信息结构体,包括effective(权限集合),permitted(权限掩码),inheritable(是否可继承)
	struct cap_header hdr = {_LINUX_CAPABILITY_VERSION_3, 0};///定义权限版本,pid等信息

	ret = sys_capget(&hdr, data);//获取pid为0的权限信息到data
	if (ret < 0) {
		pr_err("Unable to get capabilities: %d\n", ret);
		return -1;
	}

	/*
	 * Loop through the capability constants until we reach cap_last_cap.
	 * The cap_bnd set is stored as a bitmask comprised of CR_CAP_SIZE number of
	 * 32-bit uints, hence the inner loop from 0 to 32.
	 */
	for (i = 0; i < CR_CAP_SIZE; i++) {//CR_CAP_SIZE==2
		args->cap_eff[i] = data[i].eff;
		args->cap_prm[i] = data[i].prm;
		args->cap_inh[i] = data[i].inh;
		args->cap_bnd[i] = 0;
    //将权限信息复制到args中去
		for (j = 0; j < 32; j++) {
			if (j + i * 32 > args->cap_last_cap)//获取到最后一个bit的值
				break;
			ret = sys_prctl(PR_CAPBSET_READ, j + i * 32, 0, 0, 0);//返回j+i*32所指定的权限bit是否存在于被调用的线程权限32bit中,存在返回1,不存在返回0
			if (ret < 0) {
				pr_err("Unable to read capability %d: %d\n",
					j + i * 32, ret);
				return -1;
			}
			if (ret)
				args->cap_bnd[i] |= (1 << j);//存在则将该位的bit存放到args->cap_bnd去
		}
	}

	args->secbits = sys_prctl(PR_GET_SECUREBITS, 0, 0, 0, 0);//返回调用线程的安全bits参数到args

	ret = sys_getgroups(0, NULL);//返回调用进程所有所属组的数目(因为第一个参数size==0
	if (ret < 0)
		goto grps_err;

	args->ngroups = ret;//将所有所属的组数量添加到args中去
	if (args->ngroups >= PARASITE_MAX_GROUPS) {
		pr_err("Too many groups in task %d\n", (int)args->ngroups);
		return -1;
	}

	ret = sys_getgroups(args->ngroups, args->groups);将所有所属组信息gid_t添加到args->groups中去
	if (ret < 0)
		goto grps_err;

	if (ret != args->ngroups) {
		pr_err("Groups changed on the fly %d -> %d\n",
				args->ngroups, ret);
		return -1;
	}

	ret = sys_getresuid(&args->uids[0], &args->uids[1], &args->uids[2]);//获取调用sys_getresuid的进程的real uid,effective uid,setUID(一般来说real uid和effective uid一样,而当使用了setUid时,如passwd,real uid为调用passwd的用户id,effective uid为对passwd具有权限的用户,root,然后将这三个uid存放到args中去)
	if (ret) {
		pr_err("Unable to get uids: %d\n", ret);
		return -1;
	}

	args->uids[3] = sys_setfsuid(-1L);//设置调用者文件系统用户id为-1L

	/*
	 * FIXME In https://github.com/xemul/criu/issues/95 it is
	 * been reported that only low 16 bits are set upon syscall
	 * on ARMv7.
	 *
	 * We may rather need implement builtin-memset and clear the
	 * whole memory needed here.
	 */
	args->gids[0] = args->gids[1] = args->gids[2] = args->gids[3] = 0;

	ret = sys_getresgid(&args->gids[0], &args->gids[1], &args->gids[2]);//同上
	if (ret) {
		pr_err("Unable to get uids: %d\n", ret);
		return -1;
	}

	args->gids[3] = sys_setfsgid(-1L);//设置调用者文件系统的gid为-1L

	return 0;

grps_err:
	pr_err("Error calling getgroups (%d)\n", ret);
	return -1;
}
//收集lsm_profile(Linux安全模块),(robust_list)健壮用户互斥锁链表,调度策略,调度参数,调度优先级(nice值),及安全传输层协议和信号动作信息全部存放到core->thread_core对象中去
int dump_thread_core(int pid, CoreEntry *core, const struct parasite_dump_thread *ti)

{
	int ret;
	ThreadCoreEntry *tc = core->thread_core;

	ret = collect_lsm_profile(pid, tc->creds);//apparmor安全机制就是读取/proc/pid/attr/current下的文件到tc->creds->lsm_profile中去,selinux安全机制则是将该pid进程上下文unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023中的前三个:替换成0
	if (!ret) {
		/*
		 * XXX: It's possible to set two: 32-bit and 64-bit
		 * futex list's heads. That makes about no sense, but
		 * it's possible. Until we meet such application, dump
		 * only one: native or compat futex's list pointer.
		 */
		if (!core_is_compat(core))//根据core->thread_info->gpregs->mode判断是否兼容
			ret = get_task_futex_robust_list(pid, tc);//获取健壮的用户互斥锁链表头信息及长度存放到tc中去
		else
			ret = get_task_futex_robust_list_compat(pid, tc);
	}
	if (!ret)
		ret = dump_sched_info(pid, tc);//获取指定pid进程的调度策略和调度参数,及进程nice值优先级,然后存放到tc中去
	if (!ret) {
		core_put_tls(core, ti->tls);//将ti->tls安全传输层协议相关信息存放到core中去
		CORE_THREAD_ARCH_INFO(core)->clear_tid_addr =
			encode_pointer(ti->tid_addr);
		BUG_ON(!tc->sas);
		copy_sas(tc->sas, &ti->sas);//将ti中的信号动作拷贝到tc->sac中去
		if (ti->pdeath_sig) {
			tc->has_pdeath_sig = true;
			tc->pdeath_sig = ti->pdeath_sig;
		}
	}

	return ret;
}

dump_task_cgroup将从外部解析得到的cgroup信息链表添加到全局变量cg_sets中去备份

int dump_task_cgroup(struct pstree_item *item, u32 *cg_id, struct parasite_dump_cgroup_args *args)
{
	int pid;
	LIST_HEAD(ctls);
	unsigned int n_ctls = 0;
	struct cg_set *cs;

	if (item)
	
		pid = item->pid->real;
	else
		pid = getpid();

	pr_info("Dumping cgroups for %d\n", pid);
	if (parse_task_cgroup	(pid, args, &ctls, &n_ctls))//解析cgroup文件信息,包括寄生进程外部cgroup和寄生进程内部cgroup信息,主要存储的是外部cgroup每条数据中的name和path信息,及外部路径path的前缀长度(通过与内部路径path的比较得到的)
		return -1;

	cs = get_cg_set(&ctls, n_ctls, item);//根据cgroup链表ctls获取相关cgroup信息集cgroup_set(先从宏变量cg_sets中寻找,未发现则新建cgroup_set收集信息后添加到cg_sets中去),	返回该cgroup信息集
	if (!cs)
		return -1;

	if (!item) {
		BUG_ON(criu_cgset);
		criu_cgset = cs;
		pr_info("Set %d is criu one\n", cs->id);
	} else {
		if (item == root_item) {
			BUG_ON(root_cgset);
			root_cgset = cs;
			pr_info("Set %d is root one\n", cs->id);
		} else {
			struct cg_ctl *root, *stray;

			BUG_ON(!root_cgset);
			pr_info("Set %d is a stray\n", cs->id);

			/* Copy the cgns prefix from the root cgset for each
			 * controller. This is ok because we know that there is
			 * only one cgroup namespace.
			 */
			list_for_each_entry(root, &root_cgset->ctls, l) {
				list_for_each_entry(stray, &cs->ctls, l) {
					if (strcmp(root->name, stray->name))
						continue;

					if (strlen(stray->path) < root->cgns_prefix) {
						pr_err("cg %s shorter than path prefix %d?\n", stray->path, root->cgns_prefix);
						return -1;
					}

					stray->cgns_prefix = root->cgns_prefix;
				}
			}
		}
	}

	*cg_id = cs->id;
	return 0;
}
//解析任务的cgroup信息,从寄生进程外部得到的cgroup相关信息包括name和path等存放到retl链表中去,并通过比对寄生进程内部的path获取外部path与内部path的前缀长度,就可以通过外部path减去该长度得到内部path
int parse_task_cgroup(int pid, struct parasite_dump_cgroup_args *args, struct list_head *retl, unsigned int *n)
{
//传入的retl为空,且n=0
	FILE *f;
	int ret;
	LIST_HEAD(internal);
	unsigned int n_internal = 0;
	struct cg_ctl *intern, *ext;

	f = fopen_proc(pid, "cgroup");//打开指定pid的cgroup文件
	if (!f)
		return -1;

	ret = parse_cgroup_file(f, retl, n);//解析Cgroup文件,将解析得到信息存入以retl为链表头的链表,且排列顺序按照名称从小到大排列
	fclose(f);
	if (ret < 0)
		return -1;

	/* No parasite args, we're dumping criu's cg set, so we don't need to
	 * try and parse the "internal" cgroup set to find namespace
	 * boundaries.
	 */
	if (!args)
		return 0;

	f = fmemopen(args->contents, strlen(args->contents), "r");//建立一个内存流,把内存当成文件使用,返回文件描述符,但磁盘并没有对应的文件存储,因为是在内存中的数据,以文件的形式操作读取或写入,此处的args->contents是在前面步骤dump_task_core_all中将proc/self/cgroup的数据保存到args->contents。所以这里打开的内存流文件相当于是proc/self/cgroup中的文件
	if (!f) {
		pr_perror("couldn't fmemopen cgroup buffer %s", args->contents);
		return -1;
	}

	ret = parse_cgroup_file(f, &internal, &n_internal);//读取并解析proc/self/cgroup数据到链表internal中去,寄生进程内部的cgroup得到的内部路径,前面解析得到的retl是从寄生进程外部看得到的外部路径
	fclose(f);
	if (ret < 0) {
		pr_err("couldn't parse internal cgroup file\n");
		return -1;
	}

	/* Here's where we actually compute the cgns prefix. Consider a task
	 * in /foo/bar which has unshared its namespace at /foo. The internal
	 * path is /bar, but the external path is /foo/bar, and the cgns
	 * prefix is /foo. The algorithm is:
	 *
	 * // no cg ns unshare in this case
	 * if (internal == external)
	 *   continue;
	 * idx = find_suffix_pos(external, internal)
	 * cgns_prefix = external[:idx]
	 */
	list_for_each_entry(intern, &internal, l) {
		list_for_each_entry(ext, retl, l) {
			char *pos;

			if (strcmp(ext->name, intern->name))//得到外部和内部视角名称相同的两个节点
				continue;

			/* If the cgroup namespace was unshared at / (or there
			 * is no cgroup namespace relative to criu), the paths
			 * are equal and we don't need to set a prefix.
			 */
			if (!strcmp(ext->path, intern->path))//如果名称相同,路径不同,例如在外部看是/foo/bar,寄生进程内部则是/bar需要继续往下执行操作
				continue;

			/* +1 here to chop off the leading / */
			pos = ext->path + strlen(ext->path) - strlen(intern->path+1);//这里的pos相当于指向外部路径中与内部路径相同部分的起始地址(去掉头部‘/’)
			if (strcmp(pos, intern->path+1)) {
			//如果pos指向的该部分与内部路径不同,说明不合法
				pr_err("invalid cgroup configuration, %s is not a suffix of %s\n", intern->path, ext->path);
				ret = -1;
				goto out;
			}

			ext->cgns_prefix = pos - ext->path;//相减得到外部地址与内部地址之间的前缀长度,外部地址减去该前缀长度即可得到内部地址
			if (ext->path[ext->cgns_prefix-1] == '/')
				ext->cgns_prefix--;
		}
	}

out:
	put_ctls(&internal);
	return ret;
}
//解析Cgroup文件,将解析得到信息存入以retl为链表头的链表,且排列顺序按照名称从小到大排列
int parse_cgroup_file(FILE *f, struct list_head *retl, unsigned int *n)
{
	while (fgets(buf, BUF_SIZE, f)) {
	//循环读取cgroup文件每次读BUF_SIZE大小数据到buf中去
		struct cg_ctl *ncc, *cc;
		char *name, *path = NULL, *e;

		ncc = xmalloc(sizeof(*cc));
		if (!ncc)
			goto err;

		/*
		 * Typical output (':' is a separator here)
		 *:是分隔符
		 * 4:cpu,cpuacct:/
		 * 3:cpuset:/
		 * 2:name=systemd:/user.slice/user-1000.slice/session-1.scope
		 */
		name = strchr(buf, ':');//根据分隔符解析出名称
		if (name)
			path = strchr(++name, ':');//根据分隔符解析出路径
		if (!name || !path) {
			pr_err("Failed parsing cgroup %s\n", buf);
			xfree(ncc);
			goto err;
		}
		e = strchr(name, '\n');//根据转行符获取下一条cgroup数据
		*path++ = '\0';//将名称与路径之间的:替换成'\0'方便字符串读取
		if (e)
			*e = '\0';//将转行符替换成'\0'

		/*
		 * Controllers and their props might be
		 * configured the way some of them are
		 * not taken into the image for migration
		 * sake or container specifics.
		 */
		if (cgp_should_skip_controller(name)) {
			pr_debug("cg-prop: Skipping controller %s\n", name);
			xfree(ncc);
			continue;
		}

		ncc->name = xstrdup(name);/namepath存放到cg_ctl ncc中去
		ncc->path = xstrdup(path);
		ncc->cgns_prefix = 0;
		if (!ncc->name || !ncc->path) {
			xfree(ncc->name);
			xfree(ncc->path);
			xfree(ncc);
			goto err;
		}

		list_for_each_entry(cc, retl, l)
			if (strcmp(cc->name, name) >= 0)//将这一次遍历得到的name与之前得到的name进行比较,若大于或等于则中断比较,然后将该name插入到合适的地方
				break;

		list_add_tail(&ncc->l, &cc->l);//这里就是插入操作将得到的name和path放到链表结构体并插入刚刚经过比较的合适链表节点后面,最后得到的cgroup信息链表是根据名称从小到大排列
		(*n)++;
	}

	return 0;

err:
	put_ctls(retl);
	return -1;
}
//查看全局变量cg_sets中是否已经存在该cgroup信息ctls,存在即返回,不存在,新建一个cg_set对象,并收集cgroup相关信息,存放到宏变量cg_sets中
static struct cg_set *get_cg_set(struct list_head *ctls, unsigned int n_ctls, bool collect)
{
//传进来的ctls是外部cgroup文件内的信息链表结构,n_ctls为链表节点个数
	struct cg_set *cs;

	list_for_each_entry(cs, &cg_sets, l)
	//查询宏链表cg_sets是否已经存在该cgroup信息链表,存在则将该链表清空free并返回
		if (cg_set_compare(cs, ctls, CGCMP_MATCH)) {
			pr_debug(" `- Existing css %d found\n", cs->id);
			put_ctls(ctls);
			return cs;
		}

	pr_debug(" `- New css ID %d\n", cg_set_ids);//cg_set_ids为宏变量,从1开始
	cs = xmalloc(sizeof(*cs));//新建一个cgroup_Set
	if (cs) {
		cs->id = cg_set_ids++; //该cgroup-set的值为当前cg_set_ids的值
		INIT_LIST_HEAD(&cs->ctls);//初始化链表头
		list_splice_init(ctls, &cs->ctls);//将ctls链表和&cs->ctls头节点合并后将整个链表设置为空链,该空链以cs->ctls为表头。
		cs->n_ctls = n_ctls;//设置链表节点数量
		list_add_tail(&cs->l, &cg_sets);//将该cgroup_set添加到宏变量cg_sets集合中存储
		n_sets++;//添加cg_sets的cgroup信息链表个数

		if (!pr_quelled(LOG_DEBUG)) {
			struct cg_ctl *ctl;

			list_for_each_entry(ctl, &cs->ctls, l)
				pr_debug("    `- [%s] -> [%s] [%u]\n", ctl->name, ctl->path, ctl->cgns_prefix);
		}

		if (collect && collect_cgroups(&cs->ctls)) {
		//将之前初始化的空链传进去收集cgroup信息
			list_del(&cs->l);
			n_sets--;
			put_ctls(&cs->ctls);
			xfree(cs);
			return NULL;
		}
	}

	return cs;
}
//通过宏变量Cgroup中的信息匹配名称name确定current_controller,再根据ctls中每个节点的name确定文件系统参数mopts,通过挂载及取消挂载的操作获得指定fd,即可得到"/proc/self/fd/%d(fd)"目录路径,遍历该路径执行添加cgroup函数
//收集相关cgroup信息到ctls中
static int collect_cgroups(struct list_head *ctls)
{
//ctls为传进来的空链
	struct cg_ctl *cc;
	int ret = 0;
	int fd = -1;

	list_for_each_entry(cc, ctls, l) {
		char path[PATH_MAX], mopts[1024], *root;
		char prefix[] = ".criu.cgmounts.XXXXXX";
		struct cg_controller *cg;
		struct cg_root_opt *o;

		current_controller = NULL;

		/* We should get all the "real" (i.e. not name=systemd type)
		 * controller from parse_cgroups(), so find that controller if
		 * it exists. */
		list_for_each_entry(cg, &cgroups, l) {
		//&cgroups由cr_dump_tasks中前面步骤中的parse_cg_info步骤解析得到的链表,我们在下一章节会对该步骤进行补充解释
			if (cgroup_contains(cg->controllers, cg->n_controllers, cc->name)) {
			//匹配目前得到的cgroup信息和之前parse_cg_info步骤中得到的cg->controllers中的名称name是否匹配
				current_controller = cg;//匹配就得到current_controller对象信息
				break;
			}
		}

		if (!current_controller) {
			/* only allow "fake" controllers to be created this way */
			if (!strstartswith(cc->name, "name=")) {
				pr_err("controller %s not found\n", cc->name);
				return -1;
			} else {
				struct cg_controller *nc;

				nc = new_controller(cc->name);
				if (!nc)
					return -1;
				list_add_tail(&nc->l, &cg->l);
				n_cgroups++;
				current_controller = nc;
			}
		}

		if (!opts.manage_cgroups)
			continue;

		if (strstartswith(cc->name, "name="))//设置mopts对象信息
			snprintf(mopts, sizeof(mopts), "none,%s", cc->name);
		else
			snprintf(mopts, sizeof(mopts), "%s", cc->name);

		if (mkdtemp(prefix) == NULL) {//创建唯一临时目录名,prefix必须以XXXXXX结尾
			pr_perror("can't make dir for cg mounts");
			return -1;
		}

		if (mount("none", prefix, "cgroup", 0, mopts) < 0) {
		//将none文件系统挂载到prefix唯一临时目录上去,且文件系统特有参数为none,cc->name或cc->name
			pr_perror("couldn't mount %s", mopts);
			rmdir(prefix);
			return -1;
		}

		fd = open_detach_mount(prefix);//取消prefix临时目录的挂载,fd为打开该目录返回的文件描述符
		if (fd < 0)
			return -1;

		path_pref_len = snprintf(path, PATH_MAX, "/proc/self/fd/%d", fd);//给变量path赋值,返回path长度到宏变量path_pref_len

		root = cc->path;//
		if (opts.new_global_cg_root)
			root = opts.new_global_cg_root;

		list_for_each_entry(o, &opts.new_cgroup_roots, node) {
			if (!strcmp(cc->name, o->controller))
				root = o->newroot;
		}

		snprintf(path + path_pref_len, PATH_MAX - path_pref_len, "%s", root);

		ret = ftw(path, add_cgroup, 4);//从当前path指定目录递归遍历子目录
		if (ret < 0)
			pr_perror("failed walking %s for empty cgroups", path);

		close_safe(&fd);

		if (ret < 0)
			return ret;

		if (opts.freeze_cgroup && !strcmp(cc->name, "freezer") &&
				add_freezer_state(current_controller))
			return -1;
	}

	return 0;
}
//首先将该目录路径"/proc/self/fd/%d(fd)"与current_controller匹配path成功匹配则直接返回,匹配失败添加属性到ncd->properties及宏变量cgp_global
static int add_cgroup(const char *fpath, const struct stat *sb, int typeflag)
{
//传进来的三个参数,fpath为当时指向的目录指针即"/proc/self/fd/%d"
	struct cgroup_dir *ncd = NULL, *match;
	int exit_code = -1;

	if (typeflag == FTW_D) {
	//FTW_D表示为目录类型
		int mtype;

		pr_info("adding cgroup %s\n", fpath);

		ncd = xmalloc(sizeof(*ncd));
		if (!ncd)
			goto out;

		ncd->mode = sb->st_mode;
		ncd->uid = sb->st_uid;
		ncd->gid = sb->st_gid;

		/* chop off the first "/proc/self/fd/N" str */
		if (fpath[path_pref_len] == '\0')
			ncd->path = xstrdup("/");//ncd->path赋值为fpath复制后的首地址
		else
			ncd->path = xstrdup(fpath + path_pref_len);

		if (!ncd->path)
			goto out;

		mtype = find_dir(ncd->path, &current_controller->heads, &match);//匹配cgroup中路径和ncd->path相同的,mtype为返回的匹配情况

		switch (mtype) {
		/* ignore co-mounted cgroups and already dumped cgroups */
		case EXACT_MATCH:// ncd->path正好匹配
			exit_code = 0;
			goto out;
		case PARENT_MATCH://ncd->path与子目录匹配
			list_add_tail(&ncd->siblings, &match->children);
			match->n_children++;
			break;
		case NO_MATCH://匹配失败
			list_add_tail(&ncd->siblings, &current_controller->heads);
			current_controller->n_heads++;
			break;
		default:
			BUG();
		}

		INIT_LIST_HEAD(&ncd->children);
		ncd->n_children = 0;

		INIT_LIST_HEAD(&ncd->properties);
		ncd->n_properties = 0;
		if (add_cgroup_properties(fpath, ncd, current_controller) < 0) {//添加对应名称cgroup属性值到ncd->properties链表及宏变量属性cgp_global对象中去
			list_del(&ncd->siblings);
			if (mtype == PARENT_MATCH)
				match->n_children--;
			else if (mtype == NO_MATCH)
				current_controller->n_heads--;
			goto out;
		}
	}

	return 0;

out:
	if (ncd)
		xfree(ncd->path);
	xfree(ncd);
	return exit_code;
}

//根据在cgp_init中的得到的宏变量cgp_list对象中存储的cgroup属性对象进行匹配得到对应的属性节点cgp,然后再根据fpath/cgp->props[]指定的路径访问相应的属性值并保存到ncd->properties链表结构中去及全局宏变量cgp_global
static int add_cgroup_properties(const char *fpath, struct cgroup_dir *ncd,
				 struct cg_controller *controller)
{
	int i;

	for (i = 0; i < controller->n_controllers; ++i) {
	//遍历所有的控制器
		const cgp_t *cgp = cgp_get_props(controller->controllers[i]);
		//从宏变量cgp_list链表头开始遍历,匹配name,匹配成功返回cgp_list->cgp,cgp_list在cr_dump_tasks之前的步骤cgp_init初始化的过程中已被赋值。

		if (dump_cg_props_array(fpath, ncd, cgp) < 0) {
		//通过fpath/cgp->props[]访问cgroup属性,然后添加到ncd->properties链表结构中去
			pr_err("dumping known properties failed\n");
			return -1;
		}

		if (dump_cg_props_array(fpath, ncd, &cgp_global) < 0) {
		//还添加一份到全局的cgroup属性对象cgp_global中去
			pr_err("dumping global properties failed\n");
			return -1;
		}
	}

	return 0;
}