"criu dump重要步骤的深入研究九"

  "criu checkpoint十二"

Posted by Xu on July 2, 2017

CRIU,DUMP重要步骤的深入研究

今天进入dump_one_task的收尾阶段:

	ret = compel_stop_daemon(parasite_ctl);//强制暂停后台进程模式
	if (ret) {
		pr_err("Can't cure (pid: %d) from parasite\n", pid);
		goto err;
	}

	ret = dump_task_threads(parasite_ctl, item);//再次收集所有线程的内核信息并写入到镜像中去。
	if (ret) {
		pr_err("Can't dump threads\n");
		goto err;
	}

	ret = compel_cure(parasite_ctl);//首先暂停寄生进程的后台模式,然后解除内存映射:包括寄生进程的内存映射和本地的内存映射
	if (ret) {
		pr_err("Can't cure (pid: %d) from parasite\n", pid);
		goto err;
	}

	ret = dump_task_mm(pid, &pps_buf, &misc, &vmas, cr_imgset);//将所有虚拟内存的信息及auxv和exe文件下的信息保存到MmEntry结构体中并写入镜像
	if (ret) {
		pr_err("Dump mappings (pid: %d) failed with %d\n", pid, ret);
		goto err;
	}

	ret = dump_task_fs(pid, &misc, cr_imgset);//备份线程文件系统信息,包括cwd目录下及root目录下的文件信息写入镜像
	if (ret) {
		pr_err("Dump fs (pid: %d) failed with %d\n", pid, ret);
		goto err;
	}

备份线程信息,再次收集所有线程的内核信息并写入到镜像中去,这里不知道为什么要重复收集线程内核信息

static int dump_task_threads(struct parasite_ctl *parasite_ctl,
			     const struct pstree_item *item)
{
	int i;

	for (i = 0; i < item->nr_threads; i++) {
	//根据之前备份的所有线程相关的内核信息到该item节点对象中threads去,然后遍历该线程进行备份
		/* Leader is already dumped */
		if (item->pid->real == item->threads[i].real) {
			item->threads[i].ns[0].virt = vpid(item);
			continue;
		}
		if (dump_task_thread(parasite_ctl, item, i))//收集内核信息及信号和寄存器信息到core对象然后写入镜像保存
			return -1;
	}

	return 0;
}


static int dump_task_thread(struct parasite_ctl *parasite_ctl,
				const struct pstree_item *item, int id)
{
	struct pid *tid = &item->threads[id];//获取线程id信息
	CoreEntry *core = item->core[id];//获取线程内核信息结构体
	pid_t pid = tid->real;
	int ret = -1;
	struct cr_img *img;

	pr_info("\n");
	pr_info("Dumping core for thread (pid: %d)\n", pid);
	pr_info("----------------------------------------\n");
   //在tid指定线程中运行寄生线程收集信号信息及寄存器信息,最后收集指定线程的内核信息到core中
	ret = parasite_dump_thread_seized(parasite_ctl, id, tid, core);
	if (ret) {
		pr_err("Can't dump thread for pid %d\n", pid);
		goto err;
	}
	pstree_insert_pid(tid);//在pid_root_rb红黑树结构中找到合适的位置插入tid结构
    //将内核信息写入镜像
	img = open_image(CR_FD_CORE, O_DUMP, tid->ns[0].virt);
	if (!img)
		goto err;

	ret = pb_write_one(img, core, PB_CORE);

	close_image(img);
err:
	pr_info("----------------------------------------\n");
	return ret;
}
//首先启动在指定线程中运行寄生线程收集信号信息及寄存器信息,然后收集指定线程的内核信息到core中
int parasite_dump_thread_seized(struct parasite_ctl *ctl, int id,
				struct pid *tid, CoreEntry *core)
{
//传进来的id为线程在该进程节点中的下标,tid为该线程的id对象,core为该进程的内核信息
	struct parasite_dump_thread *args;
	pid_t pid = tid->real;
	ThreadCoreEntry *tc = core->thread_core;
	CredsEntry *creds = tc->creds;
	struct parasite_dump_creds *pc;
	int ret;
	struct parasite_thread_ctl *tctl;

	BUG_ON(id == 0); /* Leader is dumped in dump_task_core_all */

	args = compel_parasite_args(ctl, struct parasite_dump_thread);

	pc = args->creds;
	pc->cap_last_cap = kdat.last_cap;

	tctl = compel_prepare_thread(ctl, pid);//准备寄生线程的信号掩码及寄存器信息到tctl对象
	if (!tctl)
		return -1;

	tc->has_blk_sigset = true;
	memcpy(&tc->blk_sigset, compel_thread_sigmask(tctl), sizeof(k_rtsigset_t));//将信号掩码集拷贝到&tc->blk_sigset

	ret = compel_run_in_thread(tctl, PARASITE_CMD_DUMP_THREAD);//强制在该线程中运行寄生线程然后收集信号信息及寄存器信息
	if (ret) {
		pr_err("Can't init thread in parasite %d\n", pid);
		goto err_rth;
	}

	ret = alloc_groups_copy_creds(creds, pc);//拷贝pc中的信誉证书等信息到creds中
	if (ret) {
		pr_err("Can't copy creds for thread %d\n", pid);
		goto err_rth;
	}

	ret = compel_get_thread_regs(tctl, save_task_regs, core);//获取线程寄存器信息到core对象中
	if (ret) {
		pr_err("Can't obtain regs for thread %d\n", pid);
		goto err_rth;
	}

	compel_release_thread(tctl);//释放tctl内存空间

	tid->ns[0].virt = args->tid;
	return dump_thread_core(pid, core, args);//收集pid指定的线程内核信息,收集lsm_profile(Linux安全模块),(robust_list)健壮用户互斥锁链表,调度策略,调度参数,调度优先级(nice值),及安全传输层协议和信号动作信息全部存放到core->thread_core对象中去

err_rth:
	compel_release_thread(tctl);
	return -1;
}

int compel_run_in_thread(struct parasite_thread_ctl *tctl, unsigned int cmd)
{
	int pid = tctl->tid;
	struct parasite_ctl *ctl = tctl->ctl;
	struct thread_ctx *octx = &tctl->th;
	void *stack = ctl->r_thread_stack;
	user_regs_struct_t regs = octx->regs;
	int ret;

	*ctl->addr_cmd = cmd;

	ret = parasite_run(pid, PTRACE_CONT, ctl->parasite_ip, stack, &regs, octx);//屏蔽所有该线程的信号并设置寄存器对象为该regs指定寄存器然后重启运行
	if (ret == 0)
		ret = parasite_trap(ctl, pid, &regs, octx);//等待该线程退出后收集该线程的信号信息及寄存器信息
	if (ret == 0)
		ret = (int)REG_RES(regs);

	if (ret)
		pr_err("Parasite exited with %d\n", ret);

	return ret;
}
//设置pid指定的线程对象后,设置寄存器并屏蔽所有信号后重新启动
static int parasite_run(pid_t pid, int cmd, unsigned long ip, void *stack,
		user_regs_struct_t *regs, struct thread_ctx *octx)
{
	k_rtsigset_t block;

	ksigfillset(&block);
	if (ptrace(PTRACE_SETSIGMASK, pid, sizeof(k_rtsigset_t), &block)) {
	//设置信号掩码全为-1,相当于阻塞所有信号
		pr_perror("Can't block signals for %d", pid);
		goto err_sig;
	}

	parasite_setup_regs(ip, stack, regs);//安装寄存器
	if (ptrace_set_regs(pid, regs)) {//设置线程寄存器为该regs寄存器
		pr_perror("Can't set registers for %d", pid);
		goto err_regs;
	}

	if (ptrace(cmd, pid, NULL, NULL)) {//重启pid线程
		pr_perror("Can't run parasite at %d", pid);
		goto err_cont;
	}

	return 0;

err_cont:
	if (ptrace_set_regs(pid, &octx->regs))
		pr_perror("Can't restore regs for %d", pid);
err_regs:
	if (ptrace(PTRACE_SETSIGMASK, pid, sizeof(k_rtsigset_t), &octx->sigmask))
		pr_perror("Can't restore sigmask for %d", pid);
err_sig:
	return -1;
}

dump_task_mm将所有虚拟内存的信息及auxv和exe文件下的信息保存到MmEntry结构体中并写入镜像

//将所有虚拟内存的信息及auxv和exe文件下的信息保存到MmEntry结构体中并写入镜像
static int dump_task_mm(pid_t pid, const struct proc_pid_stat *stat,
		const struct parasite_dump_misc *misc,
		const struct vm_area_list *vma_area_list,
		const struct cr_imgset *imgset)
{
	MmEntry mme = MM_ENTRY__INIT;//定义mm入口
	struct vma_area *vma_area;
	int ret = -1, i = 0;

	pr_info("\n");
	pr_info("Dumping mm (pid: %d)\n", pid);
	pr_info("----------------------------------------\n");

	mme.n_vmas = vma_area_list->nr;//定义虚拟内存块个数
	mme.vmas = xmalloc(mme.n_vmas * sizeof(VmaEntry *));//分配虚拟内存的内存空间
	if (!mme.vmas)
		return -1;

	list_for_each_entry(vma_area, &vma_area_list->h, list) {
	//遍历所有的虚拟内存
		VmaEntry *vma = vma_area->e;

		pr_info_vma(vma_area);

		if (!vma_entry_is(vma, VMA_AREA_REGULAR))//该虚拟内存是否是常规虚拟内存
			ret = 0;
		else if (vma_entry_is(vma, VMA_AREA_SYSVIPC))//该虚拟内存是否是SYSVIPC通信的内存区域
			ret = check_sysvipc_map_dump(pid, vma);
		else if (vma_entry_is(vma, VMA_AREA_SOCKET))//该虚拟内存是否是socket区域
			ret = dump_socket_map(vma_area);
		else
			ret = 0;
		if (ret)
			goto err;

		mme.vmas[i++] = vma;

		if (vma_entry_is(vma, VMA_AREA_AIORING)) {//该虚拟内存块是否是异步IO环
			ret = dump_aio_ring(&mme, vma_area);
			if (ret)
				goto err;
		}
	}

	mme.mm_start_code = stat->start_code;
	mme.mm_end_code = stat->end_code;
	mme.mm_start_data = stat->start_data;
	mme.mm_end_data = stat->end_data;
	mme.mm_start_stack = stat->start_stack;
	mme.mm_start_brk = stat->start_brk;

	mme.mm_arg_start = stat->arg_start;
	mme.mm_arg_end = stat->arg_end;
	mme.mm_env_start = stat->env_start;
	mme.mm_env_end = stat->env_end;

	mme.mm_brk = misc->brk;

	mme.dumpable = misc->dumpable;
	mme.has_dumpable = true;

	mme.n_mm_saved_auxv = AT_VECTOR_SIZE;
	mme.mm_saved_auxv = xmalloc(pb_repeated_size(&mme, mm_saved_auxv));
	if (!mme.mm_saved_auxv)
		goto err;

	if (get_task_auxv(pid, &mme))//获取pid指定线程的auxv文件信息保存到mme对象中去
		goto err;

	if (dump_task_exe_link(pid, &mme))//打开指定pid下的exe文件存储相关信息到镜像中
		goto err;

	ret = pb_write_one(img_from_set(imgset, CR_FD_MM), &mme, PB_MM);//将mme中的内容写入镜像
	xfree(mme.mm_saved_auxv);
	free_aios(&mme);
err:
	xfree(mme.vmas);
	return ret;
}