Skip to content

Commit 84fb396

Browse files
committed
plugin: Add DUMP_DEVICES_LATE callback
The amdgpu plugin was counting how many files were checkpointed to determine when it should close the device files. The number of device files is not consistent; a process may have multiple copies of the drm device files open. Instead of doing this counting, add a new callback after all files are checkpointed, so plugins can clean up their resources at an appropriate time. Signed-off-by: David Francis <David.Francis@amd.com>
1 parent 52f84f1 commit 84fb396

File tree

6 files changed

+58
-52
lines changed

6 files changed

+58
-52
lines changed

criu/cr-dump.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2228,6 +2228,10 @@ int cr_dump_tasks(pid_t pid)
22282228
goto err;
22292229
}
22302230

2231+
ret = run_plugins(DUMP_DEVICES_LATE, pid);
2232+
if(ret && ret != -ENOTSUP)
2233+
goto err;
2234+
22312235
if (parent_ie) {
22322236
inventory_entry__free_unpacked(parent_ie, NULL);
22332237
parent_ie = NULL;

criu/include/criu-plugin.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,8 @@ enum {
6464

6565
CR_PLUGIN_HOOK__RESTORE_INIT = 13,
6666

67+
CR_PLUGIN_HOOK__DUMP_DEVICES_LATE = 14,
68+
6769
CR_PLUGIN_HOOK__MAX
6870
};
6971

@@ -84,6 +86,7 @@ DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__PAUSE_DEVICES, int pid);
8486
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__CHECKPOINT_DEVICES, int pid);
8587
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__POST_FORKING, void);
8688
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_INIT, void);
89+
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_DEVICES_LATE, int id);
8790

8891
enum {
8992
CR_PLUGIN_STAGE__DUMP,

criu/plugin.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ static cr_plugin_desc_t *cr_gen_plugin_desc(void *h, char *path)
6161
__assign_hook(CHECKPOINT_DEVICES, "cr_plugin_checkpoint_devices");
6262
__assign_hook(POST_FORKING, "cr_plugin_post_forking");
6363
__assign_hook(RESTORE_INIT, "cr_plugin_restore_init");
64+
__assign_hook(DUMP_DEVICES_LATE, "cr_plugin_dump_devices_late");
6465

6566
#undef __assign_hook
6667

plugins/amdgpu/amdgpu_plugin.c

Lines changed: 23 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -58,13 +58,6 @@ struct vma_metadata {
5858

5959
/************************************ Global Variables ********************************************/
6060

61-
/**
62-
* FD of KFD device used to checkpoint. On a multi-process
63-
* tree the order of checkpointing goes from parent to child
64-
* and so on - so saving the FD will not be overwritten
65-
*/
66-
static int kfd_checkpoint_fd;
67-
6861
static LIST_HEAD(update_vma_info_list);
6962

7063
size_t kfd_max_buffer_size;
@@ -1050,28 +1043,34 @@ int restore_hsakmt_shared_mem(const uint64_t shared_mem_size, const uint32_t sha
10501043
return 0;
10511044
}
10521045

1053-
static int unpause_process(int fd)
1046+
int amdgpu_unpause_processes(int pid)
10541047
{
10551048
int ret = 0;
10561049
struct kfd_ioctl_criu_args args = { 0 };
1050+
struct list_head *l = get_dumped_fds();
1051+
struct dumped_fd *st;
10571052

1058-
args.op = KFD_CRIU_OP_UNPAUSE;
1053+
list_for_each_entry(st, l, l) {
1054+
if (st->is_drm) {
1055+
close(st->fd);
1056+
} else {
1057+
args.op = KFD_CRIU_OP_UNPAUSE;
10591058

1060-
ret = kmtIoctl(fd, AMDKFD_IOC_CRIU_OP, &args);
1061-
if (ret) {
1062-
pr_perror("Failed to unpause process");
1063-
goto exit;
1059+
ret = kmtIoctl(st->fd, AMDKFD_IOC_CRIU_OP, &args);
1060+
if (ret) {
1061+
pr_perror("Failed to unpause process");
1062+
goto exit;
1063+
}
1064+
}
10641065
}
10651066

1066-
// Reset the KFD FD
1067-
kfd_checkpoint_fd = -1;
1068-
sys_close_drm_render_devices(&src_topology);
1069-
10701067
exit:
10711068
pr_info("Process unpaused %s (ret:%d)\n", ret ? "Failed" : "Ok", ret);
1069+
clear_dumped_fds();
10721070

10731071
return ret;
10741072
}
1073+
CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__DUMP_DEVICES_LATE, amdgpu_unpause_processes)
10751074

10761075
int store_dmabuf_fd(int handle, int fd)
10771076
{
@@ -1404,9 +1403,6 @@ int amdgpu_plugin_dump_file(int fd, int id)
14041403
return -1;
14051404
}
14061405

1407-
/* Initialize number of device files that will be checkpointed */
1408-
init_gpu_count(&src_topology);
1409-
14101406
/* Check whether this plugin was called for kfd or render nodes */
14111407
if (major(st.st_rdev) != major(st_kfd.st_rdev) || minor(st.st_rdev) != 0) {
14121408

@@ -1418,11 +1414,9 @@ int amdgpu_plugin_dump_file(int fd, int id)
14181414
if (ret)
14191415
return ret;
14201416

1421-
/* Invoke unpause process if needed */
1422-
decrement_checkpoint_count();
1423-
if (checkpoint_is_complete()) {
1424-
ret = unpause_process(kfd_checkpoint_fd);
1425-
}
1417+
ret = record_dumped_fd(fd, true);
1418+
if (ret)
1419+
return ret;
14261420

14271421
/* Need to return success here so that criu can call plugins for renderD nodes */
14281422
return ret;
@@ -1520,14 +1514,11 @@ int amdgpu_plugin_dump_file(int fd, int id)
15201514

15211515
xfree(buf);
15221516

1523-
exit:
1524-
/* Restore all queues if conditions permit */
1525-
kfd_checkpoint_fd = fd;
1526-
decrement_checkpoint_count();
1527-
if (checkpoint_is_complete()) {
1528-
ret = unpause_process(fd);
1529-
}
1517+
ret = record_dumped_fd(fd, false);
1518+
if (ret)
1519+
goto exit;
15301520

1521+
exit:
15311522
xfree((void *)args.devices);
15321523
xfree((void *)args.bos);
15331524
xfree((void *)args.priv_data);

plugins/amdgpu/amdgpu_plugin_util.c

Lines changed: 24 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,7 @@
3838
#include "amdgpu_plugin_util.h"
3939
#include "amdgpu_plugin_topology.h"
4040

41-
/* Tracks number of device files that need to be checkpointed */
42-
static int dev_file_cnt = 0;
43-
41+
static LIST_HEAD(dumped_fds);
4442
static LIST_HEAD(shared_bos);
4543
static LIST_HEAD(shared_dmabuf_fds);
4644
static LIST_HEAD(completed_work);
@@ -53,23 +51,23 @@ struct tp_system dest_topology;
5351
struct device_maps checkpoint_maps;
5452
struct device_maps restore_maps;
5553

56-
bool checkpoint_is_complete()
57-
{
58-
return (dev_file_cnt == 0);
59-
}
54+
int record_dumped_fd(int fd, bool is_drm) {
55+
int newfd = dup(fd);
6056

61-
void decrement_checkpoint_count()
62-
{
63-
dev_file_cnt--;
64-
}
57+
if (newfd < 0)
58+
return newfd;
59+
struct dumped_fd *st = malloc(sizeof(struct dumped_fd));
60+
if (!st)
61+
return -1;
62+
st->fd = newfd;
63+
st->is_drm = is_drm;
64+
list_add(&st->l, &dumped_fds);
6565

66-
void init_gpu_count(struct tp_system *topo)
67-
{
68-
if (dev_file_cnt != 0)
69-
return;
66+
return 0;
67+
}
7068

71-
/* We add ONE to include checkpointing of KFD device */
72-
dev_file_cnt = 1 + topology_gpu_count(topo);
69+
struct list_head *get_dumped_fds() {
70+
return &dumped_fds;
7371
}
7472

7573
bool shared_bo_has_exporter(int handle) {
@@ -173,6 +171,15 @@ void clear_restore_state() {
173171
}
174172
}
175173

174+
void clear_dumped_fds() {
175+
while (!list_empty(&dumped_fds)) {
176+
struct dumped_fd *st = list_first_entry(&dumped_fds, struct dumped_fd, l);
177+
list_del(&st->l);
178+
close(st->fd);
179+
free(st);
180+
}
181+
}
182+
176183
int read_fp(FILE *fp, void *buf, const size_t buf_len)
177184
{
178185
size_t len_read;

plugins/amdgpu/amdgpu_plugin_util.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -123,9 +123,9 @@ int read_file(const char *file_path, void *buf, const size_t buf_len);
123123
int write_img_file(char *path, const void *buf, const size_t buf_len);
124124
FILE *open_img_file(char *path, bool write, size_t *size);
125125

126-
bool checkpoint_is_complete();
127-
void decrement_checkpoint_count();
128-
void init_gpu_count(struct tp_system *topology);
126+
int record_dumped_fd(int fd, bool is_drm);
127+
struct list_head *get_dumped_fds();
128+
void clear_dumped_fds();
129129

130130
bool shared_bo_has_exporter(int handle);
131131
int record_shared_bo(int handle, bool is_imported);

0 commit comments

Comments
 (0)