Skip to content

Commit 50b59d4

Browse files
committed
plugin: Add DUMP_DEVICE_LATE callback
Previously, the amdgpu plugin determined when to call its UNPAUSE ioctl by counting the device files that had been checkpointed. This was not reliable: there may be more or fewer device files than expected, and other processes may still be checkpointing when unpause is called. Add a new plugin callback, DUMP_DEVICE_LATE, which is called after files have finished checkpointing for all processes.
Signed-off-by: David Francis <David.Francis@amd.com>
1 parent 90fd03d commit 50b59d4

File tree

8 files changed

+81
-53
lines changed

8 files changed

+81
-53
lines changed

criu/cr-dump.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2225,6 +2225,9 @@ int cr_dump_tasks(pid_t pid)
22252225
goto err;
22262226
}
22272227

2228+
if(run_plugins(DUMP_DEVICE_LATE, pid))
2229+
goto err;
2230+
22282231
if (parent_ie) {
22292232
inventory_entry__free_unpacked(parent_ie, NULL);
22302233
parent_ie = NULL;

criu/include/criu-plugin.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,8 @@ enum {
6262

6363
CR_PLUGIN_HOOK__DMABUF_FD = 12,
6464

65+
CR_PLUGIN_HOOK__DUMP_DEVICE_LATE = 13,
66+
6567
CR_PLUGIN_HOOK__MAX
6668
};
6769

@@ -81,6 +83,7 @@ DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, int pid);
8183
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__PAUSE_DEVICES, int pid);
8284
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__CHECKPOINT_DEVICES, int pid);
8385
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DMABUF_FD, int handle, int fd);
86+
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_DEVICE_LATE, int id);
8487

8588
enum {
8689
CR_PLUGIN_STAGE__DUMP,

criu/plugin.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ static cr_plugin_desc_t *cr_gen_plugin_desc(void *h, char *path)
6060
__assign_hook(PAUSE_DEVICES, "cr_plugin_pause_devices");
6161
__assign_hook(CHECKPOINT_DEVICES, "cr_plugin_checkpoint_devices");
6262
__assign_hook(DMABUF_FD, "cr_plugin_dmabuf_fd");
63+
__assign_hook(DUMP_DEVICE_LATE, "cr_plugin_dump_device_late");
6364

6465
#undef __assign_hook
6566

plugins/amdgpu/amdgpu_plugin.c

Lines changed: 29 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -49,13 +49,6 @@ struct vma_metadata {
4949

5050
/************************************ Global Variables ********************************************/
5151

52-
/**
53-
* FD of KFD device used to checkpoint. On a multi-process
54-
* tree the order of checkpointing goes from parent to child
55-
* and so on - so saving the FD will not be overwritten
56-
*/
57-
static int kfd_checkpoint_fd;
58-
5952
static LIST_HEAD(update_vma_info_list);
6053

6154
size_t kfd_max_buffer_size;
@@ -1007,28 +1000,39 @@ int restore_hsakmt_shared_mem(const uint64_t shared_mem_size, const uint32_t sha
10071000
return 0;
10081001
}
10091002

1010-
static int unpause_process(int fd)
1003+
int amdgpu_unpause_processes(int pid)
10111004
{
10121005
int ret = 0;
10131006
struct kfd_ioctl_criu_args args = { 0 };
1007+
struct list_head *l = get_dumped_fds();
1008+
struct dumped_fd *st;
1009+
1010+
list_for_each_entry(st, l, l) {
1011+
if (st->is_drm) {
1012+
ret = amdgpu_plugin_drm_unpause_file(st->fd);
1013+
if (ret) {
1014+
pr_perror("Failed to unpause drm device file");
1015+
goto exit;
1016+
}
1017+
close(st->fd);
1018+
} else {
1019+
args.op = KFD_CRIU_OP_UNPAUSE;
10141020

1015-
args.op = KFD_CRIU_OP_UNPAUSE;
1016-
1017-
ret = kmtIoctl(fd, AMDKFD_IOC_CRIU_OP, &args);
1018-
if (ret) {
1019-
pr_perror("Failed to unpause process");
1020-
goto exit;
1021+
ret = kmtIoctl(st->fd, AMDKFD_IOC_CRIU_OP, &args);
1022+
if (ret) {
1023+
pr_perror("Failed to unpause process");
1024+
goto exit;
1025+
}
1026+
}
10211027
}
10221028

1023-
// Reset the KFD FD
1024-
kfd_checkpoint_fd = -1;
1025-
sys_close_drm_render_devices(&src_topology);
1026-
10271029
exit:
10281030
pr_info("Process unpaused %s (ret:%d)\n", ret ? "Failed" : "Ok", ret);
1031+
clear_dumped_fds();
10291032

10301033
return ret;
10311034
}
1035+
CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__DUMP_DEVICE_LATE, amdgpu_unpause_processes)
10321036

10331037
static int save_devices(int fd, struct kfd_ioctl_criu_args *args, struct kfd_criu_device_bucket *device_buckets,
10341038
CriuKfd *e)
@@ -1230,9 +1234,6 @@ int amdgpu_plugin_dump_file(int fd, int id)
12301234
return -1;
12311235
}
12321236

1233-
/* Initialize number of device files that will be checkpointed */
1234-
init_gpu_count(&src_topology);
1235-
12361237
/* Check whether this plugin was called for kfd or render nodes */
12371238
if (major(st.st_rdev) != major(st_kfd.st_rdev) || minor(st.st_rdev) != 0) {
12381239

@@ -1244,11 +1245,9 @@ int amdgpu_plugin_dump_file(int fd, int id)
12441245
if (ret)
12451246
return ret;
12461247

1247-
/* Invoke unpause process if needed */
1248-
decrement_checkpoint_count();
1249-
if (checkpoint_is_complete()) {
1250-
ret = unpause_process(kfd_checkpoint_fd);
1251-
}
1248+
ret = record_dumped_fd(fd, true);
1249+
if (ret)
1250+
return ret;
12521251

12531252
/* Need to return success here so that criu can call plugins for renderD nodes */
12541253
return ret;
@@ -1346,14 +1345,11 @@ int amdgpu_plugin_dump_file(int fd, int id)
13461345

13471346
xfree(buf);
13481347

1349-
exit:
1350-
/* Restore all queues if conditions permit */
1351-
kfd_checkpoint_fd = fd;
1352-
decrement_checkpoint_count();
1353-
if (checkpoint_is_complete()) {
1354-
ret = unpause_process(fd);
1355-
}
1348+
ret = record_dumped_fd(fd, false);
1349+
if (ret)
1350+
goto exit;
13561351

1352+
exit:
13571353
xfree((void *)args.devices);
13581354
xfree((void *)args.bos);
13591355
xfree((void *)args.priv_data);

plugins/amdgpu/amdgpu_plugin_drm.c

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -349,6 +349,22 @@ int amdgpu_plugin_drm_dump_file(int fd, int id, struct stat *drm)
349349
return ret;
350350
}
351351

352+
int amdgpu_plugin_drm_unpause_file(int fd)
353+
{
354+
struct drm_amdgpu_criu_args args = {0};
355+
int ret = 0;
356+
357+
args.op = AMDGPU_CRIU_OP_UNPAUSE;
358+
if (drmIoctl(fd, DRM_IOCTL_AMDGPU_CRIU_OP, &args) == -1) {
359+
pr_perror("Failed to call unpause ioctl");
360+
ret = -1;
361+
goto exit;
362+
}
363+
364+
exit:
365+
return ret;
366+
}
367+
352368
int amdgpu_plugin_drm_restore_file(int fd, CriuRenderNode *rd)
353369
{
354370
struct drm_amdgpu_criu_args args = {0};

plugins/amdgpu/amdgpu_plugin_drm.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ int amdgpu_plugin_drm_dump_file(int fd, int id, struct stat *drm);
2626

2727
int amdgpu_plugin_drm_restore_file(int fd, CriuRenderNode *rd);
2828

29+
int amdgpu_plugin_drm_unpause_file(int fd);
30+
2931
int get_gem_handle(amdgpu_device_handle h_dev, int dmabuf_fd);
3032

3133
int save_vma_updates(uint64_t offset, uint64_t addr, uint64_t restored_offset, int gpu_id);

plugins/amdgpu/amdgpu_plugin_util.c

Lines changed: 24 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,7 @@
3838
#include "amdgpu_plugin_util.h"
3939
#include "amdgpu_plugin_topology.h"
4040

41-
/* Tracks number of device files that need to be checkpointed */
42-
static int dev_file_cnt = 0;
43-
41+
static LIST_HEAD(dumped_fds);
4442
static LIST_HEAD(shared_bos);
4543
static LIST_HEAD(shared_dmabuf_fds);
4644
static LIST_HEAD(completed_work);
@@ -53,23 +51,23 @@ struct tp_system dest_topology;
5351
struct device_maps checkpoint_maps;
5452
struct device_maps restore_maps;
5553

56-
bool checkpoint_is_complete()
57-
{
58-
return (dev_file_cnt == 0);
59-
}
54+
int record_dumped_fd(int fd, bool is_drm) {
55+
int newfd = dup(fd);
6056

61-
void decrement_checkpoint_count()
62-
{
63-
dev_file_cnt--;
64-
}
57+
if (newfd < 0)
58+
return newfd;
59+
struct dumped_fd *st = malloc(sizeof(struct dumped_fd));
60+
if (!st)
61+
return -1;
62+
st->fd = newfd;
63+
st->is_drm = is_drm;
64+
list_add(&st->l, &dumped_fds);
6565

66-
void init_gpu_count(struct tp_system *topo)
67-
{
68-
if (dev_file_cnt != 0)
69-
return;
66+
return 0;
67+
}
7068

71-
/* We add ONE to include checkpointing of KFD device */
72-
dev_file_cnt = 1 + topology_gpu_count(topo);
69+
struct list_head *get_dumped_fds() {
70+
return &dumped_fds;
7371
}
7472

7573
bool shared_bo_has_exporter(int handle) {
@@ -174,6 +172,15 @@ void clear_completed_work_and_dmabuf_fds() {
174172
}
175173
}
176174

175+
void clear_dumped_fds() {
176+
while (!list_empty(&dumped_fds)) {
177+
struct dumped_fd *st = list_first_entry(&dumped_fds, struct dumped_fd, l);
178+
list_del(&st->l);
179+
close(st->fd);
180+
free(st);
181+
}
182+
}
183+
177184
int read_fp(FILE *fp, void *buf, const size_t buf_len)
178185
{
179186
size_t len_read;

plugins/amdgpu/amdgpu_plugin_util.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -123,9 +123,9 @@ int read_file(const char *file_path, void *buf, const size_t buf_len);
123123
int write_img_file(char *path, const void *buf, const size_t buf_len);
124124
FILE *open_img_file(char *path, bool write, size_t *size);
125125

126-
bool checkpoint_is_complete();
127-
void decrement_checkpoint_count();
128-
void init_gpu_count(struct tp_system *topology);
126+
int record_dumped_fd(int fd, bool is_drm);
127+
struct list_head *get_dumped_fds();
128+
void clear_dumped_fds();
129129

130130
bool shared_bo_has_exporter(int handle);
131131
int record_shared_bo(int handle, bool is_imported);

0 commit comments

Comments
 (0)