"criu dump重要步骤的深入研究七"

  "criu checkpoint十"

Posted by Xu on June 22, 2017

CRIU,DUMP备份步骤解析

通过研究dump_one_task部分源码,我们已经了解了如何感染进程,同时通过控制并监视感染进程获取我们所需要的信息,上一小节研究到感染进程文件的的备份实现。这一小节将沿着文件备份之后的过程展开分析!

延续dump_one_task未分析源码部分:该部分只对内存页面的备份进行分析,这也是热迁移过程中最需要关注的一部分,虽然只有一行代码,当备份过程相当复杂,主要分为5个阶段:

  • 1.准备阶段:页面缓存的初始化,管道的建立,传输单元结构的创建等
  • 2.内存记录:将所有的虚拟内存块存与管道对象中的ppb->iov管道页面缓存区单元相对应,并且在pp->iovs中统一管理内存页面包括脏页面holes和普通页面pages
  • 3.管道发送:将缓冲区管理器pp->iovs中的内存页面分别对应写入每一个缓冲区ppb->p[1]管道的写端发送。
  • 4.将所有的内存页面打包成镜像
  • 5.重置脏页面bit,通过向clear_refs文件写入4重置dirty bit

代码逻辑如下:

memory dump

ret = parasite_dump_pages_seized(item, &vmas, &mdc, parasite_ctl);
	if (ret)
		goto err_cure;

parasite_dump_pages_seized()

int parasite_dump_pages_seized(struct pstree_item *item,
		struct vm_area_list *vma_area_list,
		struct mem_dump_ctl *mdc,
		struct parasite_ctl *ctl)
{
	int ret;
	struct parasite_dump_pages_args *pargs;//寄生内存备份参数

	pargs = prep_dump_pages_args(ctl, vma_area_list, mdc->pre_dump);//定义寄生虚拟内存入口p_vma为ctl->addr_args+1(也就是相当于虚拟内存的入口在args结构体后面),根据传入的vmas的虚拟内存区域(判断为私有,共享或异步读取环内存区域)迭代设置从该入口进入的虚拟内存区域p_vma,然后设置ctl->addr_args->nr_vmas虚拟内存区域个数并返回

	/*
	 * Add PROT_READ protection for all VMAs we're about to
	 * dump if they don't have one. Otherwise we'll not be
	 * able to read the memory contents.
	 *
	 * Afterwards -- reprotect memory back.
	 */

	pargs->add_prot = PROT_READ;//加入PROT_READ可以读取内存内容
	ret = compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl);//通发送命令信息,使感染进程调用mprotect_vmas(args);该函数的功能就是用来修改感染进程的虚拟存储区域的读写属性为PROT_READ
	if (ret) {
		pr_err("Can't dump unprotect vmas with parasite\n");
		return ret;
	}

	if (fault_injected(FI_DUMP_PAGES)) {
	    //返回restorer.c中的枚举变量fi_strategy是否等于FI_DUMP_PAGES
		pr_err("fault: Dump VMA pages failure!\n");
		return -1;
	}

	ret = __parasite_dump_pages_seized(item, pargs, vma_area_list, mdc, ctl);
	if (ret) {
		pr_err("Can't dump page with parasite\n");
		/* Parasite will unprotect VMAs after fail in fini() */
		return ret;
	}

	pargs->add_prot = 0;
	if (compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl)) {
		pr_err("Can't rollback unprotected vmas with parasite\n");
		ret = -1;
	}

	return ret;
}

__parasite_dump_pages_seized()备份寄生进程内存数据

static int __parasite_dump_pages_seized(struct pstree_item *item,
		struct parasite_dump_pages_args *args,
		struct vm_area_list *vma_area_list,
		struct mem_dump_ctl *mdc,
		struct parasite_ctl *ctl)
{
	pmc_t pmc = PMC_INIT;//pmc:pagemap-cache内存映射缓存初始化
	struct page_pipe *pp;//内存管道结构体信息
	struct vma_area *vma_area;//虚拟内存区域
	struct page_xfer xfer = { .parent = NULL };//用于传输内存到镜像文件,可实现两个镜像文件union:本地镜像文件和page-server镜像文件
	int ret = -1;
	unsigned cpp_flags = 0;
	unsigned long pmc_size;

	pr_info("\n");
	pr_info("Dumping pages (type: %d pid: %d)\n", CR_FD_PAGES, item->pid->real);
	pr_info("----------------------------------------\n");

	timing_start(TIME_MEMDUMP);//开始内存备份计时

	pr_debug("   Private vmas %lu/%lu pages\n",
			vma_area_list->priv_longest, vma_area_list->priv_size);

	/*
	 * Step 0 -- prepare
	 */

	pmc_size = max(vma_area_list->priv_longest,
		vma_area_list->shared_longest);
	if (pmc_init(&pmc, item->pid->real, &vma_area_list->h,
			 pmc_size * PAGE_SIZE))//初始化内存页面缓存,进行一系列的设置
		return -1;

	ret = -1;
	if (!mdc->pre_dump)
		/*
		 * Chunk mode pushes pages portion by portion. This mode
		 * only works when we don't need to keep pp for later
		 * use, i.e. on non-lazy non-predump.
		 */
		cpp_flags |= PP_CHUNK_MODE;
	pp = create_page_pipe(vma_area_list->priv_size,
					    pargs_iovs(args), cpp_flags);//创建内存页面管道单元,相当于pp初始化,且pp->iovs数组的起始地址为args+1+a->nr_vmas之后保存,同时初始化pp->iovs与ppb->iov起始地址相同
	if (!pp)
		goto out;

	if (!mdc->pre_dump) {
		/*
		 * Regular dump -- create xfer object and send pages to it
		 * right here. For pre-dumps the pp will be taken by the
		 * caller and handled later.
		 */
		ret = open_page_xfer(&xfer, CR_FD_PAGEMAP, vpid(item));//常规备份,根据选项是否使用page-server,设置xfer内存传输单元
		if (ret < 0)
			goto out_pp;
	} else {
		ret = check_parent_page_xfer(CR_FD_PAGEMAP, vpid(item));
		if (ret < 0)
			goto out_pp;

		if (ret)
			xfer.parent = NULL + 1;
	}

	/*
	 * Step 1 -- generate the pagemap
	 */
	args->off = 0;
	list_for_each_entry(vma_area, &vma_area_list->h, list) {
		bool has_parent = !!xfer.parent;
		u64 off = 0;
		u64 *map;

		if (!vma_area_is_private(vma_area, kdat.task_size) &&
				!vma_area_is(vma_area, VMA_ANON_SHARED))
			continue;
		if (vma_entry_is(vma_area->e, VMA_AREA_AIORING)) {
			if (mdc->pre_dump)
				continue;
			has_parent = false;
		}
//判断该虚拟存储链表是否为私有的,VMA_ANON_SHARED共享的,VMA_AREA_AIORING异步读取环
		map = pmc_get_map(&pmc, vma_area);//返回该虚拟内存区域所在页号映射的实际地址,map为该虚拟内存页号与实际地址之间的映射关系,存放的是该虚拟页面在pagemap中对应的64位值
		if (!map)
			goto out_xfer;
		if (vma_area_is(vma_area, VMA_ANON_SHARED))
			ret = add_shmem_area(item->pid->real, vma_area->e, map);//如果是共享内存数据,添加到共享内存信息映射shmem_map中去,并进行数据更新。
		else {
again:
			ret = generate_iovs(vma_area, pp, map, &off,
				has_parent);//将虚拟内存区域的内存页面(有hole和pages的区分)添加到管道中去,以iovs的向量形式保存
			if (ret == -EAGAIN) {
				BUG_ON(!(pp->flags & PP_CHUNK_MODE));

				ret = drain_pages(pp, ctl, args);
				if (!ret)
					ret = xfer_pages(pp, &xfer);
				if (!ret) {
					page_pipe_reinit(pp);
					goto again;
				}
			}
		}
		if (ret < 0)
			goto out_xfer;
	}

	ret = drain_pages(pp, ctl, args);//,第二个步骤:将所有记录在pp->iovs的内存页面,通过管道pp发送
	if (!ret && !mdc->pre_dump)
	//ret==0表示成功映射到管道,进行第三个步骤,将内存页面写入镜像
		ret = xfer_pages(pp, &xfer);
	if (ret)
		goto out_xfer;

	timing_stop(TIME_MEMDUMP);

	/*
	 * Step 4 -- clean up
	 */

	ret = task_reset_dirty_track(item->pid->real);
out_xfer:
	if (!mdc->pre_dump)
		xfer.close(&xfer);
out_pp:
	if (ret || !mdc->pre_dump)
		destroy_page_pipe(pp);
	else
		dmpi(item)->mem_pp = pp;
out:
	pmc_fini(&pmc);
	pr_info("----------------------------------------\n");
	return ret;
}

1.准备阶段:打开内存传输服务器单元xfer

//打开内存页面传输器,向page-server-socket写入数据看是否成功返回响应。
static int open_page_server_xfer(struct page_xfer *xfer, int fd_type, long id)
{
	char has_parent;

	xfer->sk = page_server_sk;
	xfer->write_pagemap = write_pagemap_to_server;
	xfer->write_pages = write_pages_to_server;
	xfer->write_hole = write_hole_to_server;
	xfer->close = close_server_xfer;
	xfer->dst_id = encode_pm_id(fd_type, id);
	xfer->parent = NULL;
//对内存传输控制器进行一系列设置
	if (send_psi(xfer->sk, PS_IOV_OPEN2, 0, 0, xfer->dst_id)) {
	//向传输目的发送page_server_iov I/O容器,调用write,向page_server_sk(socket)写入page_server_iov,实现内存页面传输
		pr_perror("Can't write to page server");
		return -1;
	}

	/* Push the command NOW */
	tcp_nodelay(xfer->sk, true);//设置套接字选项setsockopt(sk, SOL_TCP, TCP_NODELAY, &val, sizeof(val));
	/*
	*int setsockopt(int sock, int level, int optname, const void *optval, socklen_t optlen);
    参数:  
    *sock:将要被设置或者获取选项的套接字。
    *level:选项所在的协议层。
    *optname:需要访问的选项名。
    *optval:对于setsockopt(),指向包含新选项值的缓冲。
    *optlen:对于setsockopt(),现选项的长度。
    */

	if (read(xfer->sk, &has_parent, 1) != 1) {
	//读取page-server套接字返回的消息
		pr_perror("The page server doesn't answer");
		return -1;
	}

	if (has_parent)
		xfer->parent = (void *) 1; /* This is required for generate_iovs() */

	return 0;
}

获取pagemap内存映射

u64 *pmc_get_map(pmc_t *pmc, const struct vma_area *vma)
{
	/* Hit */
	if (likely(pmc->start <= vma->e->start && pmc->end >= vma->e->end))
	//第一次pmc->start,pmc->end为空未命中,命中后根据pmc->map(即/proc/pid/pagemap)中的映射关系得到对应地址并返回。
		return __pmc_get_map(pmc, vma->e->start);

	/* Miss, refill the cache */
	if (pmc_fill_cache(pmc, vma)) {
	//第一次进入该函数会未命中,见后面详解
		pr_err("Failed to fill cache for %d (%lx-%lx)\n",
		       pmc->pid, (long)vma->e->start, (long)vma->e->end);
		return NULL;
	}

	/* Hit for sure */
	return __pmc_get_map(pmc, vma->e->start);
}
//重填缓存,设置好了pmc->start为最低地址和pmc->end为最高地址,并将pmc->fd中的内容填写到pmc->map中去
static int pmc_fill_cache(pmc_t *pmc, const struct vma_area *vma)
{
	unsigned long low = vma->e->start & PMC_MASK;//PMC_MASK为(2M,10.(21个0)..0),相当于地址末尾21位全部清0。为起始最低地址
	unsigned long high = low + PMC_SIZE;//PMC_SIZE为(2M,1uL<<21),最高地址为,最低地址+PMC_SIZE(2M的任务大小)
	size_t len = vma_area_len(vma);//vma->end - vma->start
	size_t size_map;

	if (high > kdat.task_size)
		high = kdat.task_size;//kdat.task_size为内核对单个任务的大小限制

	pmc->start = vma->e->start;//设置pmc->start和pmc->end
	pmc->end = vma->e->end;

	pr_debug("filling VMA %lx-%lx (%zuK) [l:%lx h:%lx]\n",
		 (long)vma->e->start, (long)vma->e->end, len >> 10, low, high);

	/*
	 * If we meet a small VMA, lets try to fit 2M cache
	 * window at least 75% full, otherwise left as a plain
	 * "one vma at a time" read. Note the VMAs in cache must
	 * fit in solid manner, iow -- either the whole vma fits
	 * the cache window, either plain read is used.
	 *
	 * The benefit (apart redusing the number of read() calls)
	 * is to walk page tables less.
	 */
	if (!pagemap_cache_disabled &&
            len < PMC_SIZE && (vma->e->start - low) < PMC_SIZE_GAP) {
            //PMC_SIZE_GAP为2m/4=0.5Mbit
		size_t size_cov = len;
		size_t nr_vmas = 1;

		pr_debug("\t%16lx-%-16lx nr:%-5zu cov:%zu\n",
			 (long)vma->e->start, (long)vma->e->end, nr_vmas, size_cov);

		list_for_each_entry_continue(vma, pmc->vma_head, list) {
		//同样是遍历vma_area_list链表中所有的虚拟内存块
			if (vma->e->start > high || vma->e->end > high)
				break;

			BUG_ON(vma->e->start < low);
			size_cov += vma_area_len(vma);//遍历一个加一个虚拟存储区域块大小
			nr_vmas++;//记录虚拟内存块个数

			pr_debug("\t%16lx-%-16lx nr:%-5zu cov:%zu\n",
				 (long)vma->e->start, (long)vma->e->end, nr_vmas, size_cov);
		}

		if (nr_vmas > 1) {
			/*
			 * Note we don't touch low bound since it's set
			 * to first VMA start already and not updating it
			 * allows us to save a couple of code bytes.
			 */
			pmc->end = high;//如果有多于一个虚拟内存块,则将pmc->end设置为最高可能的地址
			pr_debug("\tcache  mode [l:%lx h:%lx]\n", pmc->start, pmc->end);
		} else
			pr_debug("\tsimple mode [l:%lx h:%lx]\n", pmc->start, pmc->end);
	}

	size_map = PAGEMAP_LEN(pmc->end - pmc->start);
	BUG_ON(pmc->map_len < size_map);
	BUG_ON(pmc->fd < 0);

	if (pread(pmc->fd, pmc->map, size_map, PAGEMAP_PFN_OFF(pmc->start)) != size_map) {
	//将pmc->fd指向的文件中的数据读取到pmc->map中去,读取内容大小为pmc的大小,偏移量为PAGEMAP_PFN_OFF(pmc->start):先计算pmc->start页帧号(地址/页大小(这里是4kbit))然后与u64(8字节)相乘得到偏移量,此处的pmc->fd为打开/proc/pid/pagemap得到的文件描述符
		pmc_zap(pmc);
		pr_perror("Can't read %d's pagemap file", pmc->pid);
		return -1;
	}

	return 0;
}

共享存储区域的存储

int add_shmem_area(pid_t pid, VmaEntry *vma, u64 *map)
{
	struct shmem_info *si;
	unsigned long size = vma->pgoff + (vma->end - vma->start);//vma->pgoff是指该虚拟内存起始地址在虚拟内存文件中的偏移量

	if (vma_entry_is(vma, VMA_AREA_SYSVIPC))//判断是否为SYSV IPC通信机制
		pid = SYSVIPC_SHMEM_PID;

	si = shmem_find(vma->shmid);//根据之前收集的共享内存信息shmems_hash来获取指定shmid的内存链表头,遍历该链表获取该shmid对应的链表节点,返回该共享内存区域信息结构体shmem_info si
	if (si) {
		if (si->size < size) {
			if (expand_shmem(si, size))//把该共享内存区域扩展到size大小,且扩展的部分全部置为0.
				return -1;
		}
		update_shmem_pmaps(si, map, vma);//更新共享内存信息,设置diryt重写标志位,见下面详解

		return 0;
	}
//这一部分就是没有找到对应该shmid的共享内存信息,进行重新分配设置并添加到共享内存映射结构中去,然后进行更新
	si = xzalloc(sizeof(*si));//再分配一个*si大小内存空间
	if (!si)
		return -1;

	si->pid = pid;
	si->start = vma->start;
	si->end = vma->end;
	si->shmid = vma->shmid;
	shmem_hash_add(si);

	if (expand_shmem(si, size))
		return -1;
	update_shmem_pmaps(si, map, vma);//更新共享内存信息,设置diryt重写标志位,见下面详解


	return 0;
}
//更新共享内存信息,设置diryt重写标志位
static void update_shmem_pmaps(struct shmem_info *si, u64 *map, VmaEntry *vma)
{
	unsigned long shmem_pfn, vma_pfn, vma_pgcnt;

	if (!is_shmem_tracking_en())
		return;

	vma_pgcnt = DIV_ROUND_UP(si->size - vma->pgoff, PAGE_SIZE);
	//获取虚拟内存区域一共有多少页vma_pgcnt,这个是从该虚拟内存区域开始计算页号
	for (vma_pfn = 0; vma_pfn < vma_pgcnt; ++vma_pfn) {
		if (!should_dump_page(vma, map[vma_pfn]))
			continue;

		shmem_pfn = vma_pfn + DIV_ROUND_UP(vma->pgoff, PAGE_SIZE);//共享内存页数为虚拟内存页数加上虚拟内存偏移量所占的页数,也就是从该虚拟文件开始计算页号。
		if (map[vma_pfn] & PME_SOFT_DIRTY)//查看该页号对应的物理地址的重写标志位是否被标记
			set_pstate(si->pstate_map, shmem_pfn, PST_DIRTY);//PST_DIRTY为1
		else
			set_pstate(si->pstate_map, shmem_pfn, PST_DUMP);//PST_DUMP为3
	}
}

static void set_pstate(unsigned long *pstate_map, unsigned long pfn,
		unsigned int pstate)
{

	if (pstate & 1)
	//说明为PST_DIRTY,
		set_bit(PST_BIT0_IX(pfn), pstate_map);//#define PST_BIT0_IX(pfn) ((pfn) * PST_BITS),即为pfn*2,将pstate_map中的第pfn*2位设为1
	if (pstate & 2)
		set_bit(PST_BIT1_IX(pfn), pstate_map);//将pstate_map中的第pfn*2+1位设为1
}

/*
*Static __inline__ void __set_bit(int nr, volatile void * addr)
  *{
  *        __asm__(
  *                       "btsl %1,%0"
  *                       :"=m" (ADDR)
  *                       :"Ir" (nr));
  *}
  *后 例功能是将(*addr)的第nr位设为1。第一个占位符%0与C 语言变量ADDR对应,第二个占位
  *符%1与C语言变量nr对应。因此上面的汇编语句代码与下面的伪代码等价:btsl nr, ADDR,
   *该指令的两个操作数不能全是内存变量,因此将nr的限定字符串指定为"Ir",将nr 与立即数或
    *者寄存器相关联,这样两个操作数中只有ADDR为内存变量
*/

2.内存记录阶段:生成io向量,向管道添加holes(脏页面)或pages,以iovs的形式保存

static int generate_iovs(struct vma_area *vma, struct page_pipe *pp, u64 *map, u64 *off, bool has_parent)
{
//这里off等于0,has_parent根据该虚拟内存区域是否为异步I/O环确定,一般的为true,map为虚拟内存映射关系,pp为只做初始化的内存管道
	u64 *at = &map[PAGE_PFN(*off)];//获取0页号对应的物理地址,at指向该映射map[0]地址,at相当于map,获取该虚拟页在pagemap中对应的值
	unsigned long pfn, nr_to_scan;
	unsigned long pages[2] = {};

	nr_to_scan = (vma_area_len(vma) - *off) / PAGE_SIZE;//需要浏览的数据页数

	for (pfn = 0; pfn < nr_to_scan; pfn++) {
		unsigned long vaddr;
		int ret;

		if (!should_dump_page(vma->e, at[pfn]))//检查是否需要备份
			continue;

		vaddr = vma->e->start + *off + pfn * PAGE_SIZE;//计算这一页的虚拟地址,off为0

		/*
		 * If we're doing incremental dump (parent images
		 * specified) and page is not soft-dirty -- we dump
		 * hole and expect the parent images to contain this
		 * page. The latter would be checked in page-xfer.
		 */

		if (has_parent && page_in_parent(at[pfn] & PME_SOFT_DIRTY)) {
		//page_in_parent函数返回是否追踪内存&&是否opts.img_parent&&是否at[pfn]为脏页面,has_parent如果不为异步io环均为true,所以hole其实就是代表脏页面
			ret = page_pipe_add_hole(pp, vaddr);//添加holes如果是同一内存区域则直接添加,否则添加一个新的iovec对象存放该内存块,pp->holes
			pages[0]++;
		} else {
			ret = page_pipe_add_page(pp, vaddr);//添加一个虚拟内存页面到管道中去,同上
			pages[1]++;
		}

		if (ret) {
			*off += pfn * PAGE_SIZE;
			return ret;
		}
	}

	*off += pfn * PAGE_SIZE;

	cnt_add(CNT_PAGES_SCANNED, nr_to_scan);
	cnt_add(CNT_PAGES_SKIPPED_PARENT, pages[0]);
	cnt_add(CNT_PAGES_WRITTEN, pages[1]);

	pr_info("Pagemap generated: %lu pages %lu holes\n", pages[1], pages[0]);//pages[0]为添加的hole个数,pages[1]为添加的pages页数
	return 0;
}

//将内存管道的hole的io向量的起始地址添加至该页的虚拟地址
int page_pipe_add_hole(struct page_pipe *pp, unsigned long addr)
{
//传进来的是内存通道和这一页的虚拟地址
	if (pp->free_hole >= pp->nr_holes) {//如果正在使用的hole大于已经分配内存的hole个数
		pp->holes = xrealloc(pp->holes,
				(pp->nr_holes + PP_HOLES_BATCH) * sizeof(struct iovec));
				//则重新分配pp->holes(iovec IO向量)的大小,增加一批holes(32个),
		if (!pp->holes)
			return -1;

		pp->nr_holes += PP_HOLES_BATCH;//PP_HOLES_BATCH为32个
	}

	if (pp->free_hole &&
			iov_grow_page(&pp->holes[pp->free_hole - 1], addr))//正在使用的且为最后一个hole的io向量的起始地址是否等于该页虚拟地址,等于说明属于同一内存块直接添加,不等于说明是两个不同内存块需要添加一个iovec对象
		goto out;

	iov_init(&pp->holes[pp->free_hole++], addr);//不等于则将添加一个hole的起始
地址为该页虚拟地址
out:
	return 0;
}


int page_pipe_add_page(struct page_pipe *pp, unsigned long addr)
{
	int ret;

	ret = try_add_page(pp, addr);//先根据pp->buf得到管道内存页面缓冲区ppb,再判断该缓冲区的页数是否超过它的容量,超过则扩大两倍,并重新分配缓冲区大小,然后看添加的内存页面是否和最后添加进去的内存页面相连,若是则属于同一虚拟内存区域,扩大最后一次添加的iovec的长度即可,若不相连,则重新构造一个iovec添加到ppb->iov数组中去
	if (ret <= 0)
	//这个表示内存页面成功添加添加到相应的缓冲区
		return ret;

	ret = page_pipe_grow(pp);//内存页面添加失败,因为当缓存区增加到一定大小时不能继续增加,需要新建一个缓冲区来存放内存页面,所以添加一个管道ppb,并对ppb进行初始化,且ppb->iov[0]=pp->iovs[pp->free_iov],将不同buf存储iov的地址相连统一在pp->iovs中管理
	if (ret < 0)
		return ret;

	ret = try_add_page(pp, addr);//添加管道成功后,继续向缓冲区进行内存页面的添加。
	BUG_ON(ret > 0);
	return ret;
}

3.管道发送阶段:将内存页面放到管道中去

static int drain_pages(struct page_pipe *pp, struct parasite_ctl *ctl,
		      struct parasite_dump_pages_args *args)
{
	struct page_pipe_buf *ppb;//定义管道内存页面缓冲区
	int ret = 0;

	debug_show_page_pipe(pp);//将管道中的信息显示出来

	/* Step 2 -- grab pages into page-pipe */
	list_for_each_entry(ppb, &pp->bufs, l) {
		args->nr_segs = ppb->nr_segs;
		args->nr_pages = ppb->pages_in;
		pr_debug("PPB: %d pages %d segs %u pipe %d off\n",
				args->nr_pages, args->nr_segs, ppb->pipe_size, args->off);

		ret = compel_rpc_call(PARASITE_CMD_DUMPPAGES, ctl);//调用dump_pages对内存页面数据进行备份,主要是sys_vmsplice的使用
		if (ret < 0)
			return -1;
		ret = compel_util_send_fd(ctl, ppb->p[1]);//发送每一个ppb的写端口文件描述符
		if (ret)
			return -1;

		ret = compel_rpc_sync(PARASITE_CMD_DUMPPAGES, ctl);
		if (ret < 0)
			return -1;

		args->off += args->nr_segs;
	}

	return 0;
}

static int dump_pages(struct parasite_dump_pages_args *args)
{
	int p, ret, tsock;
	struct iovec *iovs;

	tsock = parasite_get_rpc_sock();//获取tsock套接字对象
	p = recv_fd(tsock);//从tsock接收消息,并将消息存放在cmsghdr对象中,返回cmsghdr所指向的文件描述符对象
	if (p < 0)
		return -1;

	iovs = pargs_iovs(args);//返回args+1+a->nr_vmas(虚拟内存入口地址+所有虚拟内存的偏移量)得到的地址,而该地址正是在管道创建之初就设定好的pp->iovs的启示地址,根据iovs[]数组可得到所有需要传输的内存地址向量。
	ret = sys_vmsplice(p, &iovs[args->off], args->nr_segs,
				SPLICE_F_GIFT | SPLICE_F_NONBLOCK);//sys_vmsplice是将用户空间的内存映射到管道,映射的内容为iovs io向量,映射的个数正是args->nr_segs,这里的p为ppb->p[1]
	if (ret != PAGE_SIZE * args->nr_pages) {
		sys_close(p);
		pr_err("Can't splice pages to pipe (%d/%d)\n", ret, args->nr_pages);
		return -1;
	}

	sys_close(p);
	return 0;
}

4.写入镜像阶段:将内存页面写入镜像中

static int xfer_pages(struct page_pipe *pp, struct page_xfer *xfer)
{
	int ret;

	/*
	 * Step 3 -- write pages into image (or delay writing for
	 *           pre-dump action (see pre_dump_one_task)
	 */
	timing_start(TIME_MEMWRITE);
	ret = page_xfer_dump_pages(xfer, pp, 0);
	timing_stop(TIME_MEMWRITE);

	return ret;
}

int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp,
		unsigned long off)
{
//off等于0
	struct page_pipe_buf *ppb;
	unsigned int cur_hole = 0;
	int ret;

	pr_debug("Transferring pages:\n");

	list_for_each_entry(ppb, &pp->bufs, l) {
	//遍历所有的管道内存缓冲块
		unsigned int i;

		pr_debug("\tbuf %d/%d\n", ppb->pages_in, ppb->nr_segs);

		for (i = 0; i < ppb->nr_segs; i++) {
		//从ppb->iov[]遍历该缓冲块上每一个内存页面
			struct iovec iov = ppb->iov[i];

			ret = dump_holes(xfer, pp, &cur_hole, iov.iov_base, off);
			//将hole向量对象通过xfer工具写入镜像
			if (ret)
				return ret;

			BUG_ON(iov.iov_base < (void *)off);
			iov.iov_base -= off;
			pr_debug("\tp %p [%u]\n", iov.iov_base,
					(unsigned int)(iov.iov_len / PAGE_SIZE));

			if (xfer->write_pagemap(xfer, &iov))//page-server通过socket发送内存向量,本地则是pb_write_one写入镜像
				return -1;
			if (xfer->write_pages(xfer, ppb->p[0], iov.iov_len))//page-server通过socket发送内存向量,本地则是pb_write_one写入镜像
				return -1;
		}
	}

	return dump_holes(xfer, pp, &cur_hole, NULL, off);//备份所有的holes对象信息,没有限制
}

static int dump_holes(struct page_xfer *xfer, struct page_pipe *pp,
		      unsigned int *cur_hole, void *limit, unsigned long off)
{
	int ret;

	for (; *cur_hole < pp->free_hole ; (*cur_hole)++) {
	//遍历管道中所有正在使用中的holes对象
		struct iovec hole = pp->holes[*cur_hole];

		if (limit && hole.iov_base >= limit)
			break;

		ret = page_xfer_dump_hole(xfer, &hole, off);//将每一个hole经由xfer对象写入镜像
		if (ret)
			return ret;
	}

	return 0;
}
//实现对holes对象的备份
static int page_xfer_dump_hole(struct page_xfer *xfer,
		struct iovec *hole, unsigned long off)
{
	BUG_ON(hole->iov_base < (void *)off);
	hole->iov_base -= off;
	pr_debug("\th %p [%u]\n", hole->iov_base,
			(unsigned int)(hole->iov_len / PAGE_SIZE));

	if (xfer->write_hole(xfer, hole))
	//由于本地保存和通过page-server有所区别,所以write_hole函数也有所区别,本地保存函数为write_pagehole_loc,page-server则是write_hole_to_server
		return -1;

	return 0;
}

//本地保存写holes到镜像函数
static int write_pagehole_loc(struct page_xfer *xfer, struct iovec *iov)
{
	PagemapEntry pe = PAGEMAP_ENTRY__INIT;//设置一个内存页面入口对象

	if (xfer->parent != NULL) {
		int ret;

		ret = check_pagehole_in_parent(xfer->parent, iov);
		if (ret) {
			pr_err("Hole %p/%zu not found in parent\n",
					iov->iov_base, iov->iov_len);
			return -1;
		}
	}

	pe.vaddr = encode_pointer(iov->iov_base);//设置内存地址
	pe.nr_pages = iov->iov_len / PAGE_SIZE;//设置内存页数
	pe.has_in_parent = true;
	pe.in_parent = true;

	if (pb_write_one(xfer->pmi, &pe, PB_PAGEMAP) < 0)//写入镜像数据
		return -1;

	return 0;
}
//page-server通过socket发送数据
static int write_hole_to_server(struct page_xfer *xfer, struct iovec *iov)
{
	return send_iov(xfer->sk, PS_IOV_HOLE, xfer->dst_id, iov);
}