@@ -54,13 +54,6 @@ struct vma_metadata {
54
54
55
55
/************************************ Global Variables ********************************************/
56
56
57
- /**
58
- * FD of KFD device used to checkpoint. On a multi-process
59
- * tree the order of checkpointing goes from parent to child
60
- * and so on - so saving the FD will not be overwritten
61
- */
62
- static int kfd_checkpoint_fd ;
63
-
64
57
static LIST_HEAD (update_vma_info_list );
65
58
66
59
static LIST_HEAD (amdgpu_processes );
@@ -1018,28 +1011,34 @@ int restore_hsakmt_shared_mem(const uint64_t shared_mem_size, const uint32_t sha
1018
1011
return 0 ;
1019
1012
}
1020
1013
1021
- static int unpause_process (int fd )
1014
+ int amdgpu_unpause_processes (int pid )
1022
1015
{
1023
1016
int ret = 0 ;
1024
1017
struct kfd_ioctl_criu_args args = { 0 };
1018
+ struct list_head * l = get_dumped_fds ();
1019
+ struct dumped_fd * st ;
1025
1020
1026
- args .op = KFD_CRIU_OP_UNPAUSE ;
1021
+ list_for_each_entry (st , l , l ) {
1022
+ if (st -> is_drm ) {
1023
+ close (st -> fd );
1024
+ } else {
1025
+ args .op = KFD_CRIU_OP_UNPAUSE ;
1027
1026
1028
- ret = kmtIoctl (fd , AMDKFD_IOC_CRIU_OP , & args );
1029
- if (ret ) {
1030
- pr_perror ("Failed to unpause process" );
1031
- goto exit ;
1027
+ ret = kmtIoctl (st -> fd , AMDKFD_IOC_CRIU_OP , & args );
1028
+ if (ret ) {
1029
+ pr_perror ("Failed to unpause process" );
1030
+ goto exit ;
1031
+ }
1032
+ }
1032
1033
}
1033
1034
1034
- // Reset the KFD FD
1035
- kfd_checkpoint_fd = -1 ;
1036
- sys_close_drm_render_devices (& src_topology );
1037
-
1038
1035
exit :
1039
1036
pr_info ("Process unpaused %s (ret:%d)\n" , ret ? "Failed" : "Ok" , ret );
1037
+ clear_dumped_fds ();
1040
1038
1041
1039
return ret ;
1042
1040
}
1041
+ CR_PLUGIN_REGISTER_HOOK (CR_PLUGIN_HOOK__DUMP_DEVICE_LATE , amdgpu_unpause_processes )
1043
1042
1044
1043
static void dmabuf_socket_name_gen (struct sockaddr_un * addr , int * len , int pid )
1045
1044
{
@@ -1359,9 +1358,6 @@ int amdgpu_plugin_dump_file(int fd, int id)
1359
1358
return -1 ;
1360
1359
}
1361
1360
1362
- /* Initialize number of device files that will be checkpointed */
1363
- init_gpu_count (& src_topology );
1364
-
1365
1361
/* Check whether this plugin was called for kfd or render nodes */
1366
1362
if (major (st .st_rdev ) != major (st_kfd .st_rdev ) || minor (st .st_rdev ) != 0 ) {
1367
1363
@@ -1373,11 +1369,9 @@ int amdgpu_plugin_dump_file(int fd, int id)
1373
1369
if (ret )
1374
1370
return ret ;
1375
1371
1376
- /* Invoke unpause process if needed */
1377
- decrement_checkpoint_count ();
1378
- if (checkpoint_is_complete ()) {
1379
- ret = unpause_process (kfd_checkpoint_fd );
1380
- }
1372
+ ret = record_dumped_fd (fd , true);
1373
+ if (ret )
1374
+ return ret ;
1381
1375
1382
1376
/* Need to return success here so that criu can call plugins for renderD nodes */
1383
1377
return ret ;
@@ -1475,14 +1469,11 @@ int amdgpu_plugin_dump_file(int fd, int id)
1475
1469
1476
1470
xfree (buf );
1477
1471
1478
- exit :
1479
- /* Restore all queues if conditions permit */
1480
- kfd_checkpoint_fd = fd ;
1481
- decrement_checkpoint_count ();
1482
- if (checkpoint_is_complete ()) {
1483
- ret = unpause_process (fd );
1484
- }
1472
+ ret = record_dumped_fd (fd , false);
1473
+ if (ret )
1474
+ goto exit ;
1485
1475
1476
+ exit :
1486
1477
xfree ((void * )args .devices );
1487
1478
xfree ((void * )args .bos );
1488
1479
xfree ((void * )args .priv_data );
0 commit comments