@@ -57,13 +57,6 @@ struct vma_metadata {
57
57
58
58
/************************************ Global Variables ********************************************/
59
59
60
- /**
61
- * FD of KFD device used to checkpoint. On a multi-process
62
- * tree the order of checkpointing goes from parent to child
63
- * and so on - so saving the FD will not be overwritten
64
- */
65
- static int kfd_checkpoint_fd ;
66
-
67
60
static LIST_HEAD (update_vma_info_list );
68
61
69
62
static LIST_HEAD (amdgpu_processes );
@@ -1041,28 +1034,34 @@ int restore_hsakmt_shared_mem(const uint64_t shared_mem_size, const uint32_t sha
1041
1034
return 0 ;
1042
1035
}
1043
1036
1044
/*
 * Diff of the unpause routine: lines prefixed with '-' are the removed
 * unpause_process(fd) implementation, lines prefixed with '+' are the added
 * amdgpu_unpause_processes(pid) implementation, and the bare numbers are the
 * old/new line-number gutter of the diff view.
 *
 * Added version, as visible here:
 *   - walks the list returned by get_dumped_fds();
 *   - for an entry with is_drm set, closes the entry's fd;
 *   - for any other entry, issues AMDKFD_IOC_CRIU_OP with
 *     op = KFD_CRIU_OP_UNPAUSE on the entry's fd through kmtIoctl();
 *   - on a non-zero ioctl result, logs the error and jumps to exit;
 *   - at exit, logs the outcome, calls clear_dumped_fds() and returns ret
 *     (0 when no ioctl failed, otherwise the failing ioctl's result).
 *
 * The function is registered for CR_PLUGIN_HOOK__DUMP_DEVICE_LATE on the
 * last line of this block.
 *
 * NOTE(review): the pid parameter is not referenced anywhere in the body.
 */
- static int unpause_process (int fd )
1037
+ int amdgpu_unpause_processes (int pid )
1045
1038
{
1046
1039
int ret = 0 ;
1047
1040
struct kfd_ioctl_criu_args args = { 0 };
1041
+ struct list_head * l = get_dumped_fds ();
1042
+ struct dumped_fd * st ;
1048
1043
1049
- args .op = KFD_CRIU_OP_UNPAUSE ;
1044
/* One pass over every recorded descriptor; is_drm selects close vs. unpause ioctl. */
+ list_for_each_entry (st , l , l ) {
1045
+ if (st -> is_drm ) {
1046
+ close (st -> fd );
1047
+ } else {
1048
+ args .op = KFD_CRIU_OP_UNPAUSE ;
1050
1049
1051
- ret = kmtIoctl (fd , AMDKFD_IOC_CRIU_OP , & args );
1052
- if (ret ) {
1053
- pr_perror ("Failed to unpause process" );
1054
- goto exit ;
1050
+ ret = kmtIoctl (st -> fd , AMDKFD_IOC_CRIU_OP , & args );
1051
+ if (ret ) {
1052
+ pr_perror ("Failed to unpause process" );
1053
/*
 * NOTE(review): leaving the loop here means the remaining list entries are
 * neither closed nor unpaused by this function. Whether clear_dumped_fds()
 * closes them is not visible in this diff - confirm.
 */
+ goto exit ;
1054
+ }
1055
+ }
1055
1056
}
1056
1057
1057
- // Reset the KFD FD
1058
- kfd_checkpoint_fd = -1 ;
1059
- sys_close_drm_render_devices (& src_topology );
1060
-
1061
1058
/* Shared tail: reached on success and on ioctl failure alike. */
exit :
1062
1059
pr_info ("Process unpaused %s (ret:%d)\n" , ret ? "Failed" : "Ok" , ret );
1060
+ clear_dumped_fds ();
1063
1061
1064
1062
return ret ;
1065
1063
}
1064
+ CR_PLUGIN_REGISTER_HOOK (CR_PLUGIN_HOOK__DUMP_DEVICE_LATE , amdgpu_unpause_processes )
1066
1065
1067
1066
static void dmabuf_socket_name_gen (struct sockaddr_un * addr , int * len , int pid )
1068
1067
{
@@ -1382,9 +1381,6 @@ int amdgpu_plugin_dump_file(int fd, int id)
1382
1381
return -1 ;
1383
1382
}
1384
1383
1385
- /* Initialize number of device files that will be checkpointed */
1386
- init_gpu_count (& src_topology );
1387
-
1388
1384
/* Check whether this plugin was called for kfd or render nodes */
1389
1385
if (major (st .st_rdev ) != major (st_kfd .st_rdev ) || minor (st .st_rdev ) != 0 ) {
1390
1386
@@ -1396,11 +1392,9 @@ int amdgpu_plugin_dump_file(int fd, int id)
1396
1392
if (ret )
1397
1393
return ret ;
1398
1394
1399
- /* Invoke unpause process if needed */
1400
- decrement_checkpoint_count ();
1401
- if (checkpoint_is_complete ()) {
1402
- ret = unpause_process (kfd_checkpoint_fd );
1403
- }
1395
+ ret = record_dumped_fd (fd , true);
1396
+ if (ret )
1397
+ return ret ;
1404
1398
1405
1399
/* Need to return success here so that criu can call plugins for renderD nodes */
1406
1400
return ret ;
@@ -1498,14 +1492,11 @@ int amdgpu_plugin_dump_file(int fd, int id)
1498
1492
1499
1493
xfree (buf );
1500
1494
1501
- exit :
1502
- /* Restore all queues if conditions permit */
1503
- kfd_checkpoint_fd = fd ;
1504
- decrement_checkpoint_count ();
1505
- if (checkpoint_is_complete ()) {
1506
- ret = unpause_process (fd );
1507
- }
1495
+ ret = record_dumped_fd (fd , false);
1496
+ if (ret )
1497
+ goto exit ;
1508
1498
1499
+ exit :
1509
1500
xfree ((void * )args .devices );
1510
1501
xfree ((void * )args .bos );
1511
1502
xfree ((void * )args .priv_data );
0 commit comments