Skip to content

Commit 1a5f191

Browse files
cfreeamdfdavid-amd
authored and committed
plugin/amdgpu: Support for open dmabuf handles
Add handling for open dmabuf fds. These are file descriptors meant to be imported by a driver to create shared memory. Usually, amdgpu runtimes close them right after importing them, but CRIU may checkpoint a process during the brief window in which they are still open. Signed-off-by: David Francis <David.Francis@amd.com>
1 parent f5a9936 commit 1a5f191

File tree

12 files changed

+304
-42
lines changed

12 files changed

+304
-42
lines changed

criu/files-ext.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -87,14 +87,14 @@ struct collect_image_info ext_file_cinfo = {
8787
.collect = collect_one_ext,
8888
};
8989

90-
int dump_unsupp_fd(struct fd_parms *p, int lfd, char *more, char *info, FdinfoEntry *e)
90+
int dump_unsupp_fd(struct fd_parms *p, int lfd, char *more, char *info, FdinfoEntry *e, bool force)
9191
{
9292
int ret;
9393

94-
ret = do_dump_gen_file(p, lfd, &ext_dump_ops, e);
94+
ret = do_dump_gen_file(p, lfd, &ext_dump_ops, e, force);
9595
if (ret == 0)
9696
return 0;
9797
if (ret == -ENOTSUP)
9898
pr_err("Can't dump file %d of that type [%o] (%s %s)\n", p->fd, p->stat.st_mode, more, info);
99-
return -1;
99+
return ret;
100100
}

criu/files.c

Lines changed: 46 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -329,7 +329,8 @@ uint32_t make_gen_id(uint32_t st_dev, uint32_t st_ino, uint64_t pos)
329329
return st_dev ^ st_ino ^ pos_hi ^ pos_low;
330330
}
331331

332-
int do_dump_gen_file(struct fd_parms *p, int lfd, const struct fdtype_ops *ops, FdinfoEntry *e)
332+
int do_dump_gen_file(struct fd_parms *p, int lfd,
333+
const struct fdtype_ops *ops, FdinfoEntry *e, bool force)
333334
{
334335
int ret = -1;
335336

@@ -339,7 +340,7 @@ int do_dump_gen_file(struct fd_parms *p, int lfd, const struct fdtype_ops *ops,
339340
e->flags = p->fd_flags;
340341

341342
ret = fd_id_generate(p->pid, e, p);
342-
if (ret == 1) /* new ID generated */
343+
if (ret == 1 || force) /* new ID generated */
343344
ret = ops->dump(lfd, e->id, p);
344345
else
345346
/* Remove locks generated by the fd before going to the next */
@@ -484,19 +485,19 @@ static int dump_chrdev(struct fd_parms *p, int lfd, FdinfoEntry *e)
484485
}
485486

486487
sprintf(more, "%d:%d", maj, minor(p->stat.st_rdev));
487-
err = dump_unsupp_fd(p, lfd, "chr", more, e);
488+
err = dump_unsupp_fd(p, lfd, "chr", more, e, false);
488489
p->link = link_old;
489490
return err;
490491
}
491492
}
492493

493-
err = do_dump_gen_file(p, lfd, ops, e);
494+
err = do_dump_gen_file(p, lfd, ops, e, false);
494495
p->link = link_old;
495496
return err;
496497
}
497498

498499
static int dump_one_file(struct pid *pid, int fd, int lfd, struct fd_opts *opts, struct parasite_ctl *ctl,
499-
FdinfoEntry *e, struct parasite_drain_fd *dfds)
500+
FdinfoEntry *e, struct parasite_drain_fd *dfds, bool force)
500501
{
501502
struct fd_parms p = FD_PARMS_INIT;
502503
const struct fdtype_ops *ops;
@@ -552,14 +553,14 @@ static int dump_one_file(struct pid *pid, int fd, int lfd, struct fd_opts *opts,
552553
ops = &bpfmap_dump_ops;
553554
#endif
554555
else
555-
return dump_unsupp_fd(&p, lfd, "anon", link, e);
556+
return dump_unsupp_fd(&p, lfd, "anon", link, e, force);
556557

557-
return do_dump_gen_file(&p, lfd, ops, e);
558+
return do_dump_gen_file(&p, lfd, ops, e, force);
558559
}
559560

560561
if (p.fs_type == PID_FS_MAGIC) {
561562
ops = &pidfd_dump_ops;
562-
return do_dump_gen_file(&p, lfd, ops, e);
563+
return do_dump_gen_file(&p, lfd, ops, e, force);
563564
}
564565

565566
if (S_ISREG(p.stat.st_mode) || S_ISDIR(p.stat.st_mode) || S_ISLNK(p.stat.st_mode)) {
@@ -576,9 +577,9 @@ static int dump_one_file(struct pid *pid, int fd, int lfd, struct fd_opts *opts,
576577
else if (check_ns_proc(&link))
577578
ops = &nsfile_dump_ops;
578579
else
579-
return dump_unsupp_fd(&p, lfd, "reg", link.name + 1, e);
580+
return dump_unsupp_fd(&p, lfd, "reg", link.name + 1, e, force);
580581

581-
return do_dump_gen_file(&p, lfd, ops, e);
582+
return do_dump_gen_file(&p, lfd, ops, e, force);
582583
}
583584

584585
if (S_ISFIFO(p.stat.st_mode)) {
@@ -587,7 +588,7 @@ static int dump_one_file(struct pid *pid, int fd, int lfd, struct fd_opts *opts,
587588
else
588589
ops = &fifo_dump_ops;
589590

590-
return do_dump_gen_file(&p, lfd, ops, e);
591+
return do_dump_gen_file(&p, lfd, ops, e, force);
591592
}
592593

593594
/*
@@ -598,7 +599,7 @@ static int dump_one_file(struct pid *pid, int fd, int lfd, struct fd_opts *opts,
598599
if (fill_fdlink(lfd, &p, &link))
599600
memzero(&link, sizeof(link));
600601

601-
return dump_unsupp_fd(&p, lfd, "unknown", link.name + 1, e);
602+
return dump_unsupp_fd(&p, lfd, "unknown", link.name + 1, e, force);
602603
}
603604

604605
int dump_my_file(int lfd, u32 *id, int *type)
@@ -610,7 +611,7 @@ int dump_my_file(int lfd, u32 *id, int *type)
610611
me.real = getpid();
611612
me.ns[0].virt = -1; /* FIXME */
612613

613-
if (dump_one_file(&me, lfd, lfd, &fdo, NULL, &e, NULL))
614+
if (dump_one_file(&me, lfd, lfd, &fdo, NULL, &e, NULL, false))
614615
return -1;
615616

616617
*id = e.id;
@@ -625,6 +626,8 @@ int dump_task_files_seized(struct parasite_ctl *ctl, struct pstree_item *item, s
625626
struct fd_opts *opts = NULL;
626627
int i, ret = -1;
627628
int off, nr_fds = min((int)PARASITE_MAX_FDS, dfds->nr_fds);
629+
int *retry_indices = NULL;
630+
int retry_count = 0;
628631

629632
pr_info("\n");
630633
pr_info("Dumping opened files (pid: %d)\n", item->pid->real);
@@ -642,6 +645,10 @@ int dump_task_files_seized(struct parasite_ctl *ctl, struct pstree_item *item, s
642645
if (!img)
643646
goto err;
644647

648+
retry_indices = xmalloc(dfds->nr_fds * sizeof(int)); /* Allocate memory for retry indices*/
649+
if (!retry_indices)
650+
goto err;
651+
645652
ret = 0; /* Don't fail if nr_fds == 0 */
646653
for (off = 0; ret == 0 && off < dfds->nr_fds; off += nr_fds) {
647654
if (nr_fds + off > dfds->nr_fds)
@@ -654,9 +661,32 @@ int dump_task_files_seized(struct parasite_ctl *ctl, struct pstree_item *item, s
654661
for (i = 0; i < nr_fds; i++) {
655662
FdinfoEntry e = FDINFO_ENTRY__INIT;
656663

657-
ret = dump_one_file(item->pid, dfds->fds[i + off], lfds[i], opts + i, ctl, &e, dfds);
664+
ret = dump_one_file(item->pid, dfds->fds[i + off], lfds[i], opts + i, ctl, &e, dfds, false);
665+
if (ret == -EAGAIN) {
666+
retry_indices[retry_count++] = i;
667+
ret = 0; //* Reset ret to continue the loop */
668+
continue;
669+
} else if (ret)
670+
break;
671+
672+
ret = pb_write_one(img, &e, PB_FDINFO);
658673
if (ret)
659674
break;
675+
}
676+
/* Dmabuf FDs cannot be dumped unless we have first dumped the device file that exported them.
677+
* For that reason, retry dump of files that failed to dump the first time. Specifying force
678+
* here makes the dump go through even if a file id has already been assigned.
679+
*/
680+
for (i = 0; i < retry_count; i++) {
681+
int idx = retry_indices[i];
682+
FdinfoEntry e = FDINFO_ENTRY__INIT;
683+
684+
ret = dump_one_file(item->pid, dfds->fds[idx + off], lfds[idx],
685+
opts + idx, ctl, &e, dfds, true);
686+
if (ret) {
687+
pr_err("Retry failed for fd index %d\n", idx);
688+
break;
689+
}
660690

661691
ret = pb_write_one(img, &e, PB_FDINFO);
662692
if (ret)
@@ -673,6 +703,8 @@ int dump_task_files_seized(struct parasite_ctl *ctl, struct pstree_item *item, s
673703
close_image(img);
674704
xfree(opts);
675705
xfree(lfds);
706+
xfree(retry_indices);
707+
676708
return ret;
677709
}
678710

criu/include/files.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,8 @@ struct fdtype_ops {
138138
struct cr_img;
139139

140140
extern int dump_my_file(int lfd, u32 *, int *type);
141-
extern int do_dump_gen_file(struct fd_parms *p, int lfd, const struct fdtype_ops *ops, FdinfoEntry *e);
141+
extern int do_dump_gen_file(struct fd_parms *p, int lfd,
142+
const struct fdtype_ops *ops, FdinfoEntry *e, bool force);
142143
struct parasite_drain_fd;
143144
int dump_task_files_seized(struct parasite_ctl *ctl, struct pstree_item *item, struct parasite_drain_fd *dfds);
144145
int predump_task_files(int pid);
@@ -177,7 +178,8 @@ extern int close_old_fds(void);
177178
extern int shared_fdt_prepare(struct pstree_item *item);
178179

179180
extern struct collect_image_info ext_file_cinfo;
180-
extern int dump_unsupp_fd(struct fd_parms *p, int lfd, char *more, char *info, FdinfoEntry *);
181+
extern int dump_unsupp_fd(struct fd_parms *p, int lfd, char *more,
182+
char *info, FdinfoEntry *, bool force);
181183

182184
extern int inherit_fd_parse(char *optarg);
183185
extern int inherit_fd_add(int fd, char *key);

criu/sockets.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -766,7 +766,7 @@ int dump_socket(struct fd_parms *p, int lfd, FdinfoEntry *e)
766766
return -1;
767767
}
768768

769-
return do_dump_gen_file(p, lfd, ops, e);
769+
return do_dump_gen_file(p, lfd, ops, e, false);
770770
}
771771

772772
static int inet_receive_one(struct nlmsghdr *h, struct ns_id *ns, void *arg)

plugins/amdgpu/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ endif
2727
criu-amdgpu.pb-c.c: criu-amdgpu.proto
2828
protoc --proto_path=. --c_out=. criu-amdgpu.proto
2929

30-
amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_drm.c amdgpu_plugin_topology.c amdgpu_plugin_util.c criu-amdgpu.pb-c.c amdgpu_socket_utils.c
30+
amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_drm.c amdgpu_plugin_dmabuf.c amdgpu_plugin_topology.c amdgpu_plugin_util.c criu-amdgpu.pb-c.c amdgpu_socket_utils.c
3131
$(CC) $(PLUGIN_CFLAGS) $(shell $(COMPEL) includes) $^ -o $@ $(PLUGIN_INCLUDE) $(PLUGIN_LDFLAGS) $(LIBDRM_INC)
3232

3333
amdgpu_plugin_clean:

plugins/amdgpu/amdgpu_plugin.c

Lines changed: 22 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
#include "rst-malloc.h"
3939

4040
#include "common/list.h"
41+
#include "amdgpu_plugin_dmabuf.h"
4142
#include "amdgpu_plugin_drm.h"
4243
#include "amdgpu_plugin_util.h"
4344
#include "amdgpu_plugin_topology.h"
@@ -46,7 +47,7 @@
4647
#include "img-streamer.h"
4748
#include "image.h"
4849
#include "cr_options.h"
49-
50+
#include "util.h"
5051
struct vma_metadata {
5152
struct list_head list;
5253
uint64_t old_pgoff;
@@ -1403,7 +1404,17 @@ int amdgpu_plugin_dump_file(int fd, int id)
14031404
return -1;
14041405
}
14051406

1406-
/* Check whether this plugin was called for kfd or render nodes */
1407+
/* Check whether this plugin was called for kfd, dmabuf or render nodes */
1408+
ret = get_dmabuf_info(fd, &st);
1409+
if (ret < 0) {
1410+
pr_perror("Failed to get dmabuf info");
1411+
return -1;
1412+
} else if (ret == 0) {
1413+
pr_info("Dumping dmabuf fd = %d\n", fd);
1414+
ret = amdgpu_plugin_dmabuf_dump(fd, id, &st);
1415+
return ret;
1416+
}
1417+
14071418
if (major(st.st_rdev) != major(st_kfd.st_rdev) || minor(st.st_rdev) != 0) {
14081419

14091420
/* This is RenderD dumper plugin, for now just save renderD
@@ -1415,9 +1426,6 @@ int amdgpu_plugin_dump_file(int fd, int id)
14151426
return ret;
14161427

14171428
ret = record_dumped_fd(fd, true);
1418-
if (ret)
1419-
return ret;
1420-
14211429
/* Need to return success here so that criu can call plugins for renderD nodes */
14221430
return ret;
14231431
}
@@ -1541,7 +1549,6 @@ static int restore_devices(struct kfd_ioctl_criu_args *args, CriuKfd *e)
15411549
int ret = 0, bucket_index = 0;
15421550

15431551
pr_debug("Restoring %d devices\n", e->num_of_gpus);
1544-
15451552
args->num_devices = e->num_of_gpus;
15461553
device_buckets = xzalloc(sizeof(*device_buckets) * args->num_devices);
15471554
if (!device_buckets)
@@ -1826,12 +1833,17 @@ int amdgpu_plugin_restore_file(int id, bool *retry_needed)
18261833
* first as we assume restore_maps is already filled. Need to fix this later.
18271834
*/
18281835
snprintf(img_path, sizeof(img_path), IMG_DRM_FILE, id);
1829-
pr_info("Restoring RenderD %s\n", img_path);
18301836

18311837
img_fp = open_img_file(img_path, false, &img_size);
1832-
if (!img_fp)
1833-
return -EINVAL;
1834-
1838+
if (!img_fp) {
1839+
ret = amdgpu_plugin_dmabuf_restore(id);
1840+
if (ret == 1) {
1841+
*retry_needed = true;
1842+
return 0;
1843+
}
1844+
return ret;
1845+
}
1846+
pr_info("Restoring RenderD %s\n", img_path);
18351847
pr_debug("RenderD Image file size:%ld\n", img_size);
18361848
buf = xmalloc(img_size);
18371849
if (!buf) {

0 commit comments

Comments
 (0)