Skip to content

Commit 456978a

Browse files
committed
plugin: Add DUMP_DEVICE_LATE callback
The amdgpu plugin was counting how many files were checkpointed to determine when it should close the device files. The number of device files is not consistent; a process may have multiple copies of the drm device files open. Instead of doing this counting, add a new callback after all files are checkpointed, so plugins can clean up their resources at an appropriate time. Signed-off-by: David Francis <David.Francis@amd.com>
1 parent c1070e0 commit 456978a

File tree

6 files changed

+58
-53
lines changed

6 files changed

+58
-53
lines changed

criu/cr-dump.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2225,6 +2225,10 @@ int cr_dump_tasks(pid_t pid)
22252225
goto err;
22262226
}
22272227

2228+
ret = run_plugins(DUMP_DEVICE_LATE, pid);
2229+
if(ret && ret != -ENOTSUP)
2230+
goto err;
2231+
22282232
if (parent_ie) {
22292233
inventory_entry__free_unpacked(parent_ie, NULL);
22302234
parent_ie = NULL;

criu/include/criu-plugin.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,8 @@ enum {
6464

6565
CR_PLUGIN_HOOK__COLLECT_FILE = 13,
6666

67+
CR_PLUGIN_HOOK__DUMP_DEVICE_LATE = 14,
68+
6769
CR_PLUGIN_HOOK__MAX
6870
};
6971

@@ -84,7 +86,7 @@ DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__PAUSE_DEVICES, int pid);
8486
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__CHECKPOINT_DEVICES, int pid);
8587
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESUME_DEVICES_EARLY, int pid);
8688
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__COLLECT_FILE, int pid, int fd);
87-
89+
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_DEVICE_LATE, int id);
8890

8991
enum {
9092
CR_PLUGIN_STAGE__DUMP,

criu/plugin.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ static cr_plugin_desc_t *cr_gen_plugin_desc(void *h, char *path)
6161
__assign_hook(CHECKPOINT_DEVICES, "cr_plugin_checkpoint_devices");
6262
__assign_hook(RESUME_DEVICES_EARLY, "cr_plugin_resume_devices_early");
6363
__assign_hook(COLLECT_FILE, "cr_plugin_collect_file");
64+
__assign_hook(DUMP_DEVICE_LATE, "cr_plugin_dump_device_late");
6465

6566
#undef __assign_hook
6667

plugins/amdgpu/amdgpu_plugin.c

Lines changed: 23 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -54,13 +54,6 @@ struct vma_metadata {
5454

5555
/************************************ Global Variables ********************************************/
5656

57-
/**
58-
* FD of KFD device used to checkpoint. On a multi-process
59-
* tree the order of checkpointing goes from parent to child
60-
* and so on - so saving the FD will not be overwritten
61-
*/
62-
static int kfd_checkpoint_fd;
63-
6457
static LIST_HEAD(update_vma_info_list);
6558

6659
static LIST_HEAD(amdgpu_processes);
@@ -1018,28 +1011,34 @@ int restore_hsakmt_shared_mem(const uint64_t shared_mem_size, const uint32_t sha
10181011
return 0;
10191012
}
10201013

1021-
static int unpause_process(int fd)
1014+
int amdgpu_unpause_processes(int pid)
10221015
{
10231016
int ret = 0;
10241017
struct kfd_ioctl_criu_args args = { 0 };
1018+
struct list_head *l = get_dumped_fds();
1019+
struct dumped_fd *st;
10251020

1026-
args.op = KFD_CRIU_OP_UNPAUSE;
1021+
list_for_each_entry(st, l, l) {
1022+
if (st->is_drm) {
1023+
close(st->fd);
1024+
} else {
1025+
args.op = KFD_CRIU_OP_UNPAUSE;
10271026

1028-
ret = kmtIoctl(fd, AMDKFD_IOC_CRIU_OP, &args);
1029-
if (ret) {
1030-
pr_perror("Failed to unpause process");
1031-
goto exit;
1027+
ret = kmtIoctl(st->fd, AMDKFD_IOC_CRIU_OP, &args);
1028+
if (ret) {
1029+
pr_perror("Failed to unpause process");
1030+
goto exit;
1031+
}
1032+
}
10321033
}
10331034

1034-
// Reset the KFD FD
1035-
kfd_checkpoint_fd = -1;
1036-
sys_close_drm_render_devices(&src_topology);
1037-
10381035
exit:
10391036
pr_info("Process unpaused %s (ret:%d)\n", ret ? "Failed" : "Ok", ret);
1037+
clear_dumped_fds();
10401038

10411039
return ret;
10421040
}
1041+
CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__DUMP_DEVICE_LATE, amdgpu_unpause_processes)
10431042

10441043
static void dmabuf_socket_name_gen(struct sockaddr_un *addr, int *len, int pid)
10451044
{
@@ -1359,9 +1358,6 @@ int amdgpu_plugin_dump_file(int fd, int id)
13591358
return -1;
13601359
}
13611360

1362-
/* Initialize number of device files that will be checkpointed */
1363-
init_gpu_count(&src_topology);
1364-
13651361
/* Check whether this plugin was called for kfd or render nodes */
13661362
if (major(st.st_rdev) != major(st_kfd.st_rdev) || minor(st.st_rdev) != 0) {
13671363

@@ -1373,11 +1369,9 @@ int amdgpu_plugin_dump_file(int fd, int id)
13731369
if (ret)
13741370
return ret;
13751371

1376-
/* Invoke unpause process if needed */
1377-
decrement_checkpoint_count();
1378-
if (checkpoint_is_complete()) {
1379-
ret = unpause_process(kfd_checkpoint_fd);
1380-
}
1372+
ret = record_dumped_fd(fd, true);
1373+
if (ret)
1374+
return ret;
13811375

13821376
/* Need to return success here so that criu can call plugins for renderD nodes */
13831377
return ret;
@@ -1475,14 +1469,11 @@ int amdgpu_plugin_dump_file(int fd, int id)
14751469

14761470
xfree(buf);
14771471

1478-
exit:
1479-
/* Restore all queues if conditions permit */
1480-
kfd_checkpoint_fd = fd;
1481-
decrement_checkpoint_count();
1482-
if (checkpoint_is_complete()) {
1483-
ret = unpause_process(fd);
1484-
}
1472+
ret = record_dumped_fd(fd, false);
1473+
if (ret)
1474+
goto exit;
14851475

1476+
exit:
14861477
xfree((void *)args.devices);
14871478
xfree((void *)args.bos);
14881479
xfree((void *)args.priv_data);

plugins/amdgpu/amdgpu_plugin_util.c

Lines changed: 24 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,7 @@
3838
#include "amdgpu_plugin_util.h"
3939
#include "amdgpu_plugin_topology.h"
4040

41-
/* Tracks number of device files that need to be checkpointed */
42-
static int dev_file_cnt = 0;
43-
41+
static LIST_HEAD(dumped_fds);
4442
static LIST_HEAD(shared_bos);
4543
static LIST_HEAD(shared_dmabuf_fds);
4644
static LIST_HEAD(completed_work);
@@ -53,23 +51,23 @@ struct tp_system dest_topology;
5351
struct device_maps checkpoint_maps;
5452
struct device_maps restore_maps;
5553

56-
bool checkpoint_is_complete()
57-
{
58-
return (dev_file_cnt == 0);
59-
}
54+
int record_dumped_fd(int fd, bool is_drm) {
55+
int newfd = dup(fd);
6056

61-
void decrement_checkpoint_count()
62-
{
63-
dev_file_cnt--;
64-
}
57+
if (newfd < 0)
58+
return newfd;
59+
struct dumped_fd *st = malloc(sizeof(struct dumped_fd));
60+
if (!st)
61+
return -1;
62+
st->fd = newfd;
63+
st->is_drm = is_drm;
64+
list_add(&st->l, &dumped_fds);
6565

66-
void init_gpu_count(struct tp_system *topo)
67-
{
68-
if (dev_file_cnt != 0)
69-
return;
66+
return 0;
67+
}
7068

71-
/* We add ONE to include checkpointing of KFD device */
72-
dev_file_cnt = 1 + topology_gpu_count(topo);
69+
struct list_head *get_dumped_fds() {
70+
return &dumped_fds;
7371
}
7472

7573
bool shared_bo_has_exporter(int handle) {
@@ -173,6 +171,15 @@ void clear_restore_state() {
173171
}
174172
}
175173

174+
void clear_dumped_fds() {
175+
while (!list_empty(&dumped_fds)) {
176+
struct dumped_fd *st = list_first_entry(&dumped_fds, struct dumped_fd, l);
177+
list_del(&st->l);
178+
close(st->fd);
179+
free(st);
180+
}
181+
}
182+
176183
int read_fp(FILE *fp, void *buf, const size_t buf_len)
177184
{
178185
size_t len_read;

plugins/amdgpu/amdgpu_plugin_util.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -128,9 +128,9 @@ int read_file(const char *file_path, void *buf, const size_t buf_len);
128128
int write_img_file(char *path, const void *buf, const size_t buf_len);
129129
FILE *open_img_file(char *path, bool write, size_t *size);
130130

131-
bool checkpoint_is_complete();
132-
void decrement_checkpoint_count();
133-
void init_gpu_count(struct tp_system *topology);
131+
int record_dumped_fd(int fd, bool is_drm);
132+
struct list_head *get_dumped_fds();
133+
void clear_dumped_fds();
134134

135135
bool shared_bo_has_exporter(int handle);
136136
int record_shared_bo(int handle, bool is_imported);

0 commit comments

Comments
 (0)