"criu dump重要步骤的深入研究五"

  "criu checkpoint八"

Posted by Xu on June 16, 2017

CRIU,DUMP备份步骤解析

经过四个小节的信息收集过程分析,这一章终于要展开criu核心部分的研究,即第二个部分备份过程:这个过程分为七个小节,我们从第一小节开始:对每个树节点的备份,这也是我们今天所要介绍的内容

在cr-dump.c的备份总函数入口cr_dump_tasks中,这个步骤的源码如下:

  
  for_each_pstree_item(item) {
  //遍历每一个进程树节点,执行dump_one_task操作
		if (dump_one_task(item))
			goto err;
	}
	
  

单节点备份:dump_one_task(item)

  static int dump_one_task(struct pstree_item *item)
{
	pid_t pid = item->pid->real;//获取该进程树节点的pid值
	struct vm_area_list vmas;//虚拟内存区域?
	struct parasite_ctl *parasite_ctl;//寄生程序控制器?
	int ret, exit_code = -1;
	struct parasite_dump_misc misc;//寄生备份盘?
	struct cr_imgset *cr_imgset = NULL;//cr镜像集
	struct parasite_drain_fd *dfds = NULL;//寄生管道文件描述符?
	struct proc_posix_timers_stat proc_args;
	struct mem_dump_ctl mdc;//内存备份控制器

	INIT_LIST_HEAD(&vmas.h);//初始化虚拟内存区域链表头
	vmas.nr = 0;

	pr_info("========================================\n");
	pr_info("Dumping task (pid: %d)\n", pid);
	pr_info("========================================\n");

	if (item->pid->state == TASK_DEAD)
	//如果该进程节点状态为僵死状态,则移交到dump_zombies步骤处理,这里直接忽略
		/*
		 * zombies are dumped separately in dump_zombies()
		 */
		return 0;

	pr_info("Obtaining task stat ... \n");
	ret = parse_pid_stat(pid, &pps_buf);
	//根据pid打开/proc下指定的stat文件,读取该文件中的内容,将该读取的内容buf ,sscanf到对应的状态信息结构体pps_buf中去。解析成功
	if (ret < 0)
		goto err;

	ret = collect_mappings(pid, &vmas, dump_filemap);
	//收集虚拟内存映射关系,通过读取/proc下指定pid的smap文件的内容填充到vmas链表结构体中
	if (ret) {
		pr_err("Collect mappings (pid: %d) failed with %d\n", pid, ret);
		goto err;
	}

	if (!shared_fdtable(item)) {
		dfds = xmalloc(sizeof(*dfds));//为寄生管道文件描述符分配内存
		if (!dfds)
			goto err;

		ret = collect_fds(pid, &dfds);//通过读取/proc文件下指定pid的fd文件目录,对fd个数进行计数,并记录信息到dfds结构体
		if (ret) {
			pr_err("Collect fds (pid: %d) failed with %d\n", pid, ret);
			goto err;
		}

		parasite_ensure_args_size(drain_fds_size(dfds));
	}

	ret = parse_posix_timers(pid, &proc_args);//解析posix timers,读取/proc/pid/timer文件,将读取的内容填充到proc_posix_timer结构体并添加到proc_args->timer对象
	if (ret < 0) {
		pr_err("Can't read posix timers file (pid: %d)\n", pid);
		goto err;
	}

	parasite_ensure_args_size(posix_timers_dump_size(proc_args.timer_n));

	ret = dump_task_signals(pid, item);
	//备份进程的线程信号,然后备份共享信号对象,调用两次(一次线程,一次共享信号)dump_signal_queue()进一步调用ptrace(PTRACE_PEEKSIGINFO, tid, &arg, si);检索所有信号,并将信号全部拷贝到si结构体中去,并返回检索到的信号数目,然后根据si中的信号信息填充queue信号队列并返回到树节点结构体的相应信号部分属性。
	if (ret) {
		pr_err("Dump %d signals failed %d\n", pid, ret);
		goto err;
	}

	parasite_ctl = parasite_infect_seized(pid, item, &vmas);
	if (!parasite_ctl) {
		pr_err("Can't infect (pid: %d) with parasite\n", pid);
		goto err;
	}

	if (fault_injected(FI_DUMP_EARLY)) {
		pr_info("fault: CRIU sudden detach\n");
		BUG();
	}

	if (root_ns_mask & CLONE_NEWPID && root_item == item) {
		int pfd;

		pfd = parasite_get_proc_fd_seized(parasite_ctl);
		if (pfd < 0) {
			pr_err("Can't get proc fd (pid: %d)\n", pid);
			goto err_cure_imgset;
		}

		if (install_service_fd(CR_PROC_FD_OFF, pfd) < 0)
			goto err_cure_imgset;

		close(pfd);
	}

	ret = parasite_fixup_vdso(parasite_ctl, pid, &vmas);
	if (ret) {
		pr_err("Can't fixup vdso VMAs (pid: %d)\n", pid);
		goto err_cure_imgset;
	}

	ret = parasite_collect_aios(parasite_ctl, &vmas); /* FIXME -- merge with above */
	if (ret) {
		pr_err("Failed to check aio rings (pid: %d)\n", pid);
		goto err_cure_imgset;
	}

	ret = parasite_dump_misc_seized(parasite_ctl, &misc);
	if (ret) {
		pr_err("Can't dump misc (pid: %d)\n", pid);
		goto err_cure_imgset;
	}

	item->pid->ns[0].virt = misc.pid;
	pstree_insert_pid(item->pid);
	item->sid = misc.sid;
	item->pgid = misc.pgid;

	pr_info("sid=%d pgid=%d pid=%d\n",
		item->sid, item->pgid, vpid(item));

	if (item->sid == 0) {
		pr_err("A session leader of %d(%d) is outside of its pid namespace\n",
			item->pid->real, vpid(item));
		goto err_cure;
	}

	cr_imgset = cr_task_imgset_open(vpid(item), O_DUMP);
	if (!cr_imgset)
		goto err_cure;

	ret = dump_task_ids(item, cr_imgset);
	if (ret) {
		pr_err("Dump ids (pid: %d) failed with %d\n", pid, ret);
		goto err_cure;
	}

	if (dfds) {
		ret = dump_task_files_seized(parasite_ctl, item, dfds);
		if (ret) {
			pr_err("Dump files (pid: %d) failed with %d\n", pid, ret);
			goto err_cure;
		}
	}

	mdc.pre_dump = false;

	ret = parasite_dump_pages_seized(item, &vmas, &mdc, parasite_ctl);
	if (ret)
		goto err_cure;

	ret = parasite_dump_sigacts_seized(parasite_ctl, item);
	if (ret) {
		pr_err("Can't dump sigactions (pid: %d) with parasite\n", pid);
		goto err_cure;
	}

	ret = parasite_dump_itimers_seized(parasite_ctl, item);
	if (ret) {
		pr_err("Can't dump itimers (pid: %d)\n", pid);
		goto err_cure;
	}

	ret = parasite_dump_posix_timers_seized(&proc_args, parasite_ctl, item);
	if (ret) {
		pr_err("Can't dump posix timers (pid: %d)\n", pid);
		goto err_cure;
	}

	ret = dump_task_core_all(parasite_ctl, item, &pps_buf, cr_imgset);
	if (ret) {
		pr_err("Dump core (pid: %d) failed with %d\n", pid, ret);
		goto err_cure;
	}

	ret = compel_stop_daemon(parasite_ctl);
	if (ret) {
		pr_err("Can't cure (pid: %d) from parasite\n", pid);
		goto err;
	}

	ret = dump_task_threads(parasite_ctl, item);
	if (ret) {
		pr_err("Can't dump threads\n");
		goto err;
	}

	ret = compel_cure(parasite_ctl);
	if (ret) {
		pr_err("Can't cure (pid: %d) from parasite\n", pid);
		goto err;
	}

	ret = dump_task_mm(pid, &pps_buf, &misc, &vmas, cr_imgset);
	if (ret) {
		pr_err("Dump mappings (pid: %d) failed with %d\n", pid, ret);
		goto err;
	}

	ret = dump_task_fs(pid, &misc, cr_imgset);
	if (ret) {
		pr_err("Dump fs (pid: %d) failed with %d\n", pid, ret);
		goto err;
	}

	close_cr_imgset(&cr_imgset);
	exit_code = 0;
err:
	close_pid_proc();
	free_mappings(&vmas);
	xfree(dfds);
	return exit_code;

err_cure:
	close_cr_imgset(&cr_imgset);
err_cure_imgset:
	compel_cure(parasite_ctl);
	goto err;
}

  
  

parasite

第一个部分:寄生感染进程获取寄生控制单元

  
  struct parasite_ctl *parasite_infect_seized(pid_t pid, struct pstree_item *item,
		struct vm_area_list *vma_area_list)
{
//寄生进程感染指定进程
	struct parasite_ctl *ctl;//寄生进程控制器
	struct infect_ctx *ictx;//感染上下文对象
	unsigned long p;

	BUG_ON(item->threads[0].real != pid);

	p = get_exec_start(vma_area_list);//获取一个虚拟内存起始地址
	if (!p) {
		pr_err("No suitable VM found\n");
		return NULL;
	}

	ctl = compel_prepare_noctx(pid);//强制准备上下文,首先为一个寄生进程控制器分配内存,调用ptrace获取该线程的信号掩码,并将信号掩码填充到thread_ctx上下文对象结构体,然后调用ptrace获取寄存器内的值并存放到指定上下文对象结构体中寄存器部分
	if (!ctl)
		return NULL;

	ictx = compel_infect_ctx(ctl);//直接返回寄生进程控制器模块中的感染上下文对象结构体ictx

	ictx->open_proc = do_open_proc;
	ictx->child_handler = sigchld_handler;
	ictx->orig_handler.sa_handler = SIG_DFL;
	ictx->orig_handler.sa_flags = SA_SIGINFO | SA_RESTART;
	sigemptyset(&ictx->orig_handler.sa_mask);
	sigaddset(&ictx->orig_handler.sa_mask, SIGCHLD);
	ictx->sock = dmpi(item)->netns->net.seqsk;
	ictx->save_regs = save_task_regs;
	ictx->make_sigframe = make_sigframe;
	ictx->regs_arg = item->core[0];
	ictx->task_size = kdat.task_size;
	ictx->syscall_ip = p;
	pr_debug("Parasite syscall_ip at %#lx\n", p);
	

	if (fault_injected(FI_NO_MEMFD))
		ictx->flags |= INFECT_NO_MEMFD;
	if (fault_injected(FI_PARASITE_CONNECT))
		ictx->flags |= INFECT_FAIL_CONNECT;
	if (fault_injected(FI_NO_BREAKPOINTS))
		ictx->flags |= INFECT_NO_BREAKPOINTS;
	if (kdat.compat_cr)
		ictx->flags |= INFECT_COMPATIBLE;

	ictx->log_fd = log_get_fd();
//以上对感染上下文对象结构体ictx进行一系列设置
	parasite_setup_c_header(ctl);

	parasite_ensure_args_size(dump_pages_args_size(vma_area_list));
	parasite_ensure_args_size(aio_rings_args_size(vma_area_list));

	if (compel_infect(ctl, item->nr_threads, parasite_args_size) < 0) {
		compel_cure(ctl);
		return NULL;
	}

	parasite_args_size = PARASITE_ARG_SIZE_MIN; /* reset for next task */
	memcpy(&item->core[0]->tc->blk_sigset, compel_task_sigmask(ctl), sizeof(k_rtsigset_t));
	dmpi(item)->parasite_ctl = ctl;

	return ctl;
}

强制感染指定pid进程compel_infect(),并使得其启动daemon模式接受sock接受到的消息

int compel_infect(struct parasite_ctl *ctl, unsigned long nr_threads, unsigned long args_size)
{
	int ret;
	unsigned long p, map_exchange_size, parasite_size = 0;

	if (ctl->pblob.parasite_type != COMPEL_BLOB_CHEADER)
		goto err;

	if (ctl->ictx.log_fd < 0)
		goto err;

	if (!arch_can_dump_task(ctl))
		goto err;

	/*
	 * Inject a parasite engine. Ie allocate memory inside alien
	 * space and copy engine code there. Then re-map the engine
	 * locally, so we will get an easy way to access engine memory
	 * without using ptrace at all.
	 */
      //注入一个寄生引擎,在陌生的空间分配内存并把引擎代码复制到该空间,然后重新本地映射引擎,所以我们将获得一个非常轻松的方式在不使用ptrace的情况下访问引擎内存
	parasite_size = total_pie_size(ctl->pblob.hdr.bsize);//向上四舍五入

	ctl->args_size = round_up(args_size, PAGE_SIZE);//向上四舍五入
	parasite_size += ctl->args_size;

	map_exchange_size = parasite_size;
	map_exchange_size += RESTORE_STACK_SIGFRAME + PARASITE_STACK_SIZE;
	if (nr_threads > 1)
		map_exchange_size += PARASITE_STACK_SIZE;

	ret = compel_map_exchange(ctl, map_exchange_size);//见第二部分详解:实现了本地进程和被感染进程的内存文件共享映射
	if (ret)
		goto err;
     //成功构建内存映射,本地进程与被感染进程的内存映射
	pr_info("Putting parasite blob into %p->%p\n", ctl->local_map, ctl->remote_map);

	ctl->parasite_ip = (unsigned long)(ctl->remote_map + ctl->pblob.hdr.parasite_ip_off);
	ctl->addr_cmd = ctl->local_map + ctl->pblob.hdr.addr_cmd_off;
	ctl->addr_args = ctl->local_map + ctl->pblob.hdr.addr_arg_off;

	memcpy(ctl->local_map, ctl->pblob.hdr.mem, ctl->pblob.hdr.bsize);
	if (ctl->pblob.hdr.nr_relocs)
		compel_relocs_apply(ctl->local_map, ctl->remote_map, ctl->pblob.hdr.bsize,
				    ctl->pblob.hdr.relocs, ctl->pblob.hdr.nr_relocs);

	p = parasite_size;

	ctl->rsigframe	= ctl->remote_map + p;
	ctl->sigframe	= ctl->local_map  + p;

	p += RESTORE_STACK_SIGFRAME;
	p += PARASITE_STACK_SIZE;
	ctl->rstack = ctl->remote_map + p;

	if (nr_threads > 1) {
		p += PARASITE_STACK_SIZE;
		ctl->r_thread_stack = ctl->remote_map + p;
	}

	ret = arch_fetch_sas(ctl, ctl->rsigframe);///让被感染进程调用sigaltstack设置可替换信号栈,用于信号处理
	if (ret) {
		pr_err("Can't fetch sigaltstack for task %d (ret %d)\n",
		       ctl->rpid, ret);
		goto err;
	}

	if (parasite_start_daemon(ctl))//见第三部分源码分析:寄生进程启动后台服务模式,监听套接字消息
		goto err;

	return 0;

err:
	return -1;
}

第二部分:内存文件共享映射的实现

static int compel_map_exchange(struct parasite_ctl *ctl, unsigned long size)
{
	int ret;

	ret = parasite_memfd_exchange(ctl, size);//交换内存文件描述符
	if (ret == 1) {
		pr_info("MemFD parasite doesn't work, goto legacy mmap\n");
		ret = parasite_mmap_exchange(ctl, size);
	}
	return ret;
}

*在被感染的进程中创建内存文件描述符,并且命名为CRIUMFD,然后将该内存文件描述符与被感染进程的内存空间进行映射

*同时将该内存文件描述符与本地内存进行映射,所以被感染进程的内存与本地内存构成映射关系,故可以检测被感染进程的内存

*变化情况,相当于成功安装寄生源。

/*
*在被感染的进程中创建内存文件描述符,并且命名为CRIUMFD,然后将该内存文件描述符与被感染进程的内存空间进行映射
*同时将该内存文件描述符与本地内存进行映射,所以被感染进程的内存与本地内存构成映射关系,故可以检测被感染进程的内存
*变化情况,相当于成功安装寄生源。
*/
static int parasite_memfd_exchange(struct parasite_ctl *ctl, unsigned long size)
{
	void *where = (void *)ctl->ictx.syscall_ip + BUILTIN_SYSCALL_SIZE;//交换的目的地址,ictx.syscall_ip为感染的入口点
	uint8_t orig_code[MEMFD_FNAME_SZ] = MEMFD_FNAME;//交换的原始地址MEMFD_FNAME: "CRIUMFD"
	pid_t pid = ctl->rpid;
	long sret = -ENOSYS;
	int ret, fd, lfd;
	bool __maybe_unused compat_task = !compel_mode_native(ctl);

	if (ctl->ictx.flags & INFECT_NO_MEMFD)
		return 1;

	BUILD_BUG_ON(sizeof(orig_code) < sizeof(long));

	if (ptrace_swap_area(pid, where, (void *)orig_code, sizeof(orig_code))) {
	//将内存地址为where(ictx.syscall_ip+BUILTIN_SYSCALL_SIZE)和orig_code的内容数据互换
	
		pr_err("Can't inject memfd args (pid: %d)\n", pid);
		return -1;
	}

	ret = compel_syscall(ctl, __NR(memfd_create, compat_task), &sret,
			     (unsigned long)where, 0, 0, 0, 0, 0);//获取pid的寄存器信息,且寄存器信息ax累加器内容为sret,sret就是相当于创建的被感染进程的内存文件描述符,通过在被感染进程执行memfd_create创建,where为memfd_create(const char *name, unsigned int flags);中的name参数,故创建的内存文件描述符名称为"CRIUMFD"

	if (ptrace_poke_area(pid, orig_code, where, sizeof(orig_code))) {
	//将本地进程内存地址为orig_code写入到被感染进程内存地址为where中地址中去,写入大小为sizeof(orig_code),由于之前已经互换过一次,所以这一次相当于恢复来where的原始数据,恢复内存文件描述符参数
		fd = (int)(long)sret;
		if (fd >= 0)
			compel_syscall(ctl, __NR(close, compat_task), &sret,
					fd, 0, 0, 0, 0, 0);//关闭被感染进程的fd指定文件
		pr_err("Can't restore memfd args (pid: %d)\n", pid);
		return -1;
	}

	if (ret < 0)
		return ret;

	fd = (int)(long)sret;
	if (fd == -ENOSYS)
		return 1;
	if (fd < 0) {
		errno = -fd;
		pr_perror("Can't create memfd in victim");
		return fd;
	}

	ctl->map_length = round_up(size, page_size());
	lfd = ctl->ictx.open_proc(ctl->rpid, O_RDWR, "fd/%d", fd);//打开被感染进程下指定文件描述符
	if (lfd < 0)
		goto err_cure;

	if (ftruncate(lfd, ctl->map_length) < 0) {
	//将lfd指定的文件截断为ctl->map_length对应大小
		pr_perror("Fail to truncate memfd for parasite");
		goto err_cure;
	}

	ctl->remote_map = remote_mmap(ctl, NULL, size,
				      PROT_READ | PROT_WRITE | PROT_EXEC,
				      MAP_FILE | MAP_SHARED, fd, 0);//远程内存映射,调用compel_syscall,实际上就是调用mmap函数,获取指令指针size,将fd(被感染进程的文件描述)中的内容映射到被感染进程的内存空间。
	if (!ctl->remote_map) {
		pr_err("Can't rmap memfd for parasite blob\n");
		goto err_curef;
	}

	ctl->local_map = mmap(NULL, size, PROT_READ | PROT_WRITE,
			      MAP_SHARED | MAP_FILE, lfd, 0);
			/*映射lfd文件内容到系统内存(由于start为空,所以系统自行分配内存,)将被感染进程的文件描述符映射到本地内存
			 *void* mmap(void* start,size_t length,int prot,int flags,int fd,off_t offset);
			 *start:映射区的开始地址,设置为0时表示由系统决定映射区的起始地址。
            *length:映射区的长度。//长度单位是 以字节为单位,不足一内存页按一内存页处理
            *prot:期望的内存保护标志,不能与文件的打开模式冲突。是以下的某个值,可以通过or运算合理地组合在一起
            *flags:指定映射对象的类型,映射选项和映射页是否可以共享。它的值可以是一个或者多个以下位的组合体
	        *fd:有效的文件描述词。一般是由open()函数返回,其值也可以设置为-1,此时需要指定flags参数中的MAP_ANON,表明进行的是匿名映射。
            *off_toffset:被映射对象内容的起点
            */
	if (ctl->local_map == MAP_FAILED) {
		ctl->local_map = NULL;
		pr_perror("Can't lmap memfd for parasite blob");
		goto err_curef;
	}
    //关闭指定pid进程中的fd文件
	compel_syscall(ctl, __NR(close, compat_task), &sret, fd, 0, 0, 0, 0, 0);
	close(lfd);//关闭本地fd文件

	pr_info("Set up parasite blob using memfd\n");
	return 0;

err_curef:
	close(lfd);
err_cure:
	compel_syscall(ctl, __NR(close, compat_task), &sret, fd, 0, 0, 0, 0, 0);
	return -1;
}

/*
*交换指定pid进程的src内存地址和dst内存地址部分的数据,交换数据大小为bytes
*
*/
int ptrace_swap_area(pid_t pid, void *dst, void *src, long bytes)
{
	void *t = alloca(bytes);

	if (ptrace_peek_area(pid, t, dst, bytes))//将内存中地址为dst的内容逐字节读取到t所指向的内存地址中去,留作备份
		return -1;

	if (ptrace_poke_area(pid, src, dst, bytes)) {
	//将内存地址为src中的内容写入到dst所指向的内存地址中去
		if (ptrace_poke_area(pid, t, dst, bytes))
		//如果写入失败则恢复原dst中的内容
			return -2;
		return -1;
	}

	memcpy(src, t, bytes);//将t中备份的dst中的原数据拷贝到内存地址为src的地方,完成src与dst的数据交换

	return 0;
}

/*
*该函数相当于是本地进程实现对指定被感染pid进程的系统调用操作,返回系统调用成功次数ret
*先初始化一个寄存器结构体,然后通过调用compel_execute_syscall来获取指定pid的寄存器信息
*并将信息填充到寄存器结构体,然后返回寄存器的累加器的值,即为成功系统调用的次数
*/
int compel_syscall(struct parasite_ctl *ctl, int nr, long *ret,
		unsigned long arg1,
		unsigned long arg2,
		unsigned long arg3,
		unsigned long arg4,
		unsigned long arg5,
		unsigned long arg6)
{
	user_regs_struct_t regs = ctl->orig.regs;
	int err;

	if (user_regs_native(&regs)) {
	//64位机器,将参数args放到各指定寄存器。
		user_regs_struct64 *r = &regs.native;

		r->ax  = (uint64_t)nr;
		r->di  = arg1;
		r->si  = arg2;
		r->dx  = arg3;
		r->r10 = arg4;
		r->r8  = arg5;
		r->r9  = arg6;

		err = compel_execute_syscall(ctl, &regs, code_syscall);//获取进程的寄存器信息,填充到regs结构体
	} else {
		user_regs_struct32 *r = &regs.compat;

		r->ax  = (uint32_t)nr;
		r->bx  = arg1;
		r->cx  = arg2;
		r->dx  = arg3;
		r->si  = arg4;
		r->di  = arg5;
		r->bp  = arg6;

		err = compel_execute_syscall(ctl, &regs, code_int_80);
	}

	*ret = get_user_reg(&regs, ax);//获取寄存器ax的值,累加器
	return err;
}

/*
*将感染的入口地址ictx.syscall_ip的内容改写为code_syscall+MEMFD_FNAME(即CRIUMFD),并设置pid的寄存器
*地址为该入口地址,然后重启该pid进程,并且屏蔽所有信号,然后wait4等待该pid进程返回状态信息,最后获取该pid的信号
*信息和寄存器信息
*/
int compel_execute_syscall(struct parasite_ctl *ctl,
		user_regs_struct_t *regs, const char *code_syscall)
{
	pid_t pid = ctl->rpid;
	int err;
	uint8_t code_orig[BUILTIN_SYSCALL_SIZE];

	/*
	 * Inject syscall instruction and remember original code,
	 * we will need it to restore original program content.
	 */
	memcpy(code_orig, code_syscall, sizeof(code_orig));//将code_syscall中的内容复制到code_orig原始代码地址中去
	if (ptrace_swap_area(pid, (void *)ctl->ictx.syscall_ip,
			     (void *)code_orig, sizeof(code_orig))) {
			     //交换ctl->ictx.syscall_ip感染的入口和code_orig中的内存内容完成系统调用原子注入,这里code_orig大小正好为BUILTIN_SYSCALL_SIZE,故正好和之前的一次内存交换完成拼接,也就是相当于将code_syscall放到ctl->ictx.syscall_ip的前缀位置,拼接后的结果为code_syscall+MEMFD_FNAME(即CRIUMFD),而code_orig存放的 为ictx.syscall_ip原始数据
		pr_err("Can't inject syscall blob (pid: %d)\n", pid);
		return -1;
	}

	err = parasite_run(pid, PTRACE_CONT, ctl->ictx.syscall_ip, 0, regs, &ctl->orig);//重启被跟踪的进程,并且屏蔽掉所有的信号
	if (!err)
	//重启成功
		err = parasite_trap(ctl, pid, regs, &ctl->orig);//等待进程返回状态,若出现异常根据ctl->orig恢复线程上下文,若返回状态正常,则获取该进程的信号信息和寄存器信息

	if (ptrace_poke_area(pid, (void *)code_orig,
			     (void *)ctl->ictx.syscall_ip, sizeof(code_orig))) {
		//code_orig存放的 为ictx.syscall_ip原始数据,将code_orig中的内容写入到ictx.syscall_ip位置。恢复syscall blob
		pr_err("Can't restore syscall blob (pid: %d)\n", ctl->rpid);
		err = -1;
	}

	return err;
}

/*
*重启被跟踪的进程
*并且屏蔽掉所有发送给该进程的信号,同时将寄存器内数据初始化
*/

static int parasite_run(pid_t pid, int cmd, unsigned long ip, void *stack,
		user_regs_struct_t *regs, struct thread_ctx *octx)
{
	k_rtsigset_t block;

	ksigfillset(&block);//设置信号掩码屏蔽所有的信号
	if (ptrace(PTRACE_SETSIGMASK, pid, sizeof(k_rtsigset_t), &block)) {
	//将block信号设置到指定pid的信号掩码中去,来为pid屏蔽所有信号
		pr_perror("Can't block signals for %d", pid);
		goto err_sig;
	}

	parasite_setup_regs(ip, stack, regs);//stack为0,初始化regs寄存器,且寄存器的指令指针地址为ip,也就是ictx.syscall_ip,感染的入口地址
	if (ptrace_set_regs(pid, regs)) {
	//调用ptrace(PTRACE_SETREGSET),将regs设置为指定pid的寄存器
		pr_perror("Can't set registers for %d", pid);
		goto err_regs;
	}

	if (ptrace(cmd, pid, NULL, NULL)) {
	//这里的cmd为PTRACE_CONT,重启被跟踪的进程,data为NULL表示所有发送给被跟踪的进程都不会被传输,屏蔽所有信号
		pr_perror("Can't run parasite at %d", pid);
		goto err_cont;
	}

	return 0;

err_cont:
	if (ptrace_set_regs(pid, &octx->regs))
		pr_perror("Can't restore regs for %d", pid);
err_regs:
	if (ptrace(PTRACE_SETSIGMASK, pid, sizeof(k_rtsigset_t), &octx->sigmask))
		pr_perror("Can't restore sigmask for %d", pid);
err_sig:
	return -1;
}

/* we run at @regs->ip */
static int parasite_trap(struct parasite_ctl *ctl, pid_t pid,
				user_regs_struct_t *regs,
				struct thread_ctx *octx)
{
	siginfo_t siginfo;
	int status;
	int ret = -1;

	/*
	 * Most ideas are taken from Tejun Heo's parasite thread
	 * https://code.google.com/p/ptrace-parasite/
	 */

	if (wait4(pid, &status, __WALL, NULL) != pid) {
	  //等待pid进程返回状态
		pr_perror("Waited pid mismatch (pid: %d)", pid);
		goto err;
	}

	if (!WIFSTOPPED(status)) {
	//分析指定进程返回状态是否停止运行
		pr_err("Task is still running (pid: %d)\n", pid);
		goto err;
	}

	if (ptrace(PTRACE_GETSIGINFO, pid, NULL, &siginfo)) {
	//获取指定pid的信号信息
		pr_perror("Can't get siginfo (pid: %d)", pid);
		goto err;
	}

	if (ptrace_get_regs(pid, regs)) {
	//获取指定pid的寄存器信息
		pr_perror("Can't obtain registers (pid: %d)", pid);
			goto err;
	}

	if (WSTOPSIG(status) != SIGTRAP || siginfo.si_code != ARCH_SI_TRAP) {
		pr_debug("** delivering signal %d si_code=%d\n",
			 siginfo.si_signo, siginfo.si_code);

		pr_err("Unexpected %d task interruption, aborting\n", pid);
		goto err;
	}

	/*
	 * We've reached this point if int3 is triggered inside our
	 * parasite code. So we're done.
	 */
	ret = 0;
err:
	if (restore_thread_ctx(pid, octx))
		ret = -1;

	return ret;
}

第三部分:寄生进程启动后台服务模式,先绑定sock并对其监听,然后重启进程实现在子进程handler中处理寄生进程的异常,等待从指定sock接受到的消息

static int parasite_start_daemon(struct parasite_ctl *ctl)
{
	pid_t pid = ctl->rpid;
	struct infect_ctx *ictx = &ctl->ictx;

	/*
	 * Get task registers before going daemon, since the
	 * compel_get_task_regs needs to call ptrace on _stopped_ task,
	 * while in daemon it is not such.
	 */

	if (get_task_regs(pid, &ctl->orig.regs, ictx->save_regs, ictx->regs_arg)) {
		//获取指定进程的寄存器信息
		pr_err("Can't obtain regs for thread %d\n", pid);
		return -1;
	}

	if (ictx->make_sigframe(ictx->regs_arg, ctl->sigframe, ctl->rsigframe, &ctl->orig.sigmask))
		//构建信号帧
		return -1;

	if (parasite_init_daemon(ctl))
		return -1;

	return 0;
}

static int parasite_init_daemon(struct parasite_ctl *ctl)
{
	struct parasite_init_args *args;
	pid_t pid = ctl->rpid;
	user_regs_struct_t regs;
	struct ctl_msg m = { };

	*ctl->addr_cmd = PARASITE_CMD_INIT_DAEMON;

	args = compel_parasite_args(ctl, struct parasite_init_args);//获取ctl->addr_args参数

	args->sigframe = (uintptr_t)ctl->rsigframe;
	args->log_level = compel_log_get_loglevel();

	futex_set(&args->daemon_connected, 0);//futex(fast userspace mutex)用于在用户空间实现锁操作

	if (prepare_tsock(ctl, pid, args))
	//根据指定pid的socket 信息来获取socket名称并实现socket绑定和监听
		goto err;

	/* after this we can catch parasite errors in chld handler */
	if (setup_child_handler(ctl))
	//首先设置信号处理方式sa,再sigemptyset清空信号集sa.sa_mask,sigaddset添加SIGCHLD信号到该信号集,并sigaction设置SIGCHLD的处理方式为sa,成功设置在子进程handler中处理寄生进程的异常
		goto err;

	regs = ctl->orig.regs;
	if (parasite_run(pid, PTRACE_CONT, ctl->parasite_ip, ctl->rstack, &regs, &ctl->orig))
		//重启被感染的pid进程
		goto err;

	futex_wait_while_eq(&args->daemon_connected, 0);//等待直到daemon_connected变量的锁值为0
	if (futex_get(&args->daemon_connected) != 1) {
	//获得锁
		errno = -(int)futex_get(&args->daemon_connected);
		pr_perror("Unable to connect a transport socket");
		goto err;
	}

	if (accept_tsock(ctl) < 0)
	//接受套接字sock收到的数据
		goto err;

	if (compel_util_send_fd(ctl, ctl->ictx.log_fd))
	   //首先获取套接字对象sock,再向该套接字发送文件描述符
		goto err;

	pr_info("Wait for parasite being daemonized...\n");
	//等待寄生进程后台模式化

	if (parasite_wait_ack(ctl->tsock, PARASITE_CMD_INIT_DAEMON, &m))
	 {
	 //进程等待指定socket接受到的消息
		pr_err("Can't switch parasite %d to daemon mode %d\n",
		       pid, m.err);
		goto err;
	}

	ctl->sigreturn_addr = (void*)(uintptr_t)args->sigreturn_addr;
	ctl->daemonized = true;
	pr_info("Parasite %d has been switched to daemon mode\n", pid);
	//寄生进程已经被后台模式化
	return 0;
err:
	return -1;
}