@@ -54,13 +54,6 @@ struct vma_metadata {
54
54
55
55
/************************************ Global Variables ********************************************/
56
56
57
- /**
58
- * FD of KFD device used to checkpoint. On a multi-process
59
- * tree the order of checkpointing goes from parent to child
60
- * and so on - so saving the FD will not be overwritten
61
- */
62
- static int kfd_checkpoint_fd ;
63
-
64
57
/* List head for VMA-related bookkeeping; presumably holds entries of the
 * vma_metadata kind named in the hunk header - TODO confirm against the
 * users of this list elsewhere in the file. */
static LIST_HEAD (update_vma_info_list );
65
58
66
59
/* List head; the name suggests it tracks amdgpu processes - TODO confirm,
 * no user of this list is visible in this hunk. */
static LIST_HEAD (amdgpu_processes );
@@ -1018,28 +1011,39 @@ int restore_hsakmt_shared_mem(const uint64_t shared_mem_size, const uint32_t sha
1018
1011
return 0 ;
1019
1012
}
1020
1013
1021
/*
 * Diff hunk: lines prefixed with '-' belong to the removed helper
 * unpause_process(int fd); lines prefixed with '+' plus the unprefixed
 * context lines form the new amdgpu_unpause_processes(int pid). Bare
 * numbers are old/new line numbers left over from the diff view.
 *
 * amdgpu_unpause_processes() walks the list returned by get_dumped_fds().
 * For each entry:
 *   - if is_drm is set, it calls amdgpu_plugin_drm_unpause_file() on the
 *     entry's fd and then close()s that fd;
 *   - otherwise it issues AMDKFD_IOC_CRIU_OP with op KFD_CRIU_OP_UNPAUSE
 *     on the entry's fd through kmtIoctl().
 * The first non-zero result is logged with pr_perror() and ends the walk.
 * On every path the outcome is logged and clear_dumped_fds() is called.
 *
 * Returns 0 when every entry was processed, otherwise the non-zero value
 * returned by the call that failed.
 *
 * The pid parameter is not referenced in the visible body; presumably it
 * is required by the hook signature - TODO confirm.
 */
- static int unpause_process (int fd )
1014
+ int amdgpu_unpause_processes (int pid )
1022
1015
{
1023
1016
int ret = 0 ;
1024
1017
struct kfd_ioctl_criu_args args = { 0 };
1018
+ struct list_head * l = get_dumped_fds ();
1019
+ struct dumped_fd * st ;
1020
+
1021
/* The third argument 'l' names the list member inside struct dumped_fd;
 * it only happens to share its spelling with the local list pointer. */
+ list_for_each_entry (st , l , l ) {
1022
+ if (st -> is_drm ) {
1023
+ ret = amdgpu_plugin_drm_unpause_file (st -> fd );
1024
+ if (ret ) {
1025
+ pr_perror ("Failed to unpause drm device file" );
1026
/* NOTE(review): this early exit skips the close() below for the failing
 * fd and leaves every later list entry neither unpaused nor closed.
 * Whether clear_dumped_fds() closes them is not visible here - confirm. */
+ goto exit ;
1027
+ }
1028
+ close (st -> fd );
1029
+ } else {
1030
+ args .op = KFD_CRIU_OP_UNPAUSE ;
1025
1031
1026
- args . op = KFD_CRIU_OP_UNPAUSE ;
1027
-
1028
- ret = kmtIoctl ( fd , AMDKFD_IOC_CRIU_OP , & args );
1029
- if ( ret ) {
1030
- pr_perror ( "Failed to unpause process" );
1031
- goto exit ;
1032
/* NOTE(review): unlike the DRM branch, the KFD fd is not closed in this
 * function; presumably it is owned and closed elsewhere - confirm. */
+ ret = kmtIoctl ( st -> fd , AMDKFD_IOC_CRIU_OP , & args ) ;
1033
+ if ( ret ) {
1034
+ pr_perror ( "Failed to unpause process" );
1035
+ goto exit ;
1036
+ }
1037
+ }
1032
1038
}
1033
1039
1034
- // Reset the KFD FD
1035
- kfd_checkpoint_fd = -1 ;
1036
- sys_close_drm_render_devices (& src_topology );
1037
-
1038
1040
/* Common tail for success and failure: log the result, then drop the
 * recorded fd list via clear_dumped_fds(). */
exit :
1039
1041
pr_info ("Process unpaused %s (ret:%d)\n" , ret ? "Failed" : "Ok" , ret );
1042
+ clear_dumped_fds ();
1040
1043
1041
1044
return ret ;
1042
1045
}
1046
/* Registers the function above for the CR_PLUGIN_HOOK__DUMP_DEVICE_LATE
 * plugin hook. */
+ CR_PLUGIN_REGISTER_HOOK (CR_PLUGIN_HOOK__DUMP_DEVICE_LATE , amdgpu_unpause_processes )
1043
1047
1044
1048
static void dmabuf_socket_name_gen (struct sockaddr_un * addr , int * len , int pid )
1045
1049
{
@@ -1359,9 +1363,6 @@ int amdgpu_plugin_dump_file(int fd, int id)
1359
1363
return -1 ;
1360
1364
}
1361
1365
1362
- /* Initialize number of device files that will be checkpointed */
1363
- init_gpu_count (& src_topology );
1364
-
1365
1366
/* Check whether this plugin was called for kfd or render nodes */
1366
1367
if (major (st .st_rdev ) != major (st_kfd .st_rdev ) || minor (st .st_rdev ) != 0 ) {
1367
1368
@@ -1373,11 +1374,9 @@ int amdgpu_plugin_dump_file(int fd, int id)
1373
1374
if (ret )
1374
1375
return ret ;
1375
1376
1376
- /* Invoke unpause process if needed */
1377
- decrement_checkpoint_count ();
1378
- if (checkpoint_is_complete ()) {
1379
- ret = unpause_process (kfd_checkpoint_fd );
1380
- }
1377
+ ret = record_dumped_fd (fd , true);
1378
+ if (ret )
1379
+ return ret ;
1381
1380
1382
1381
/* Need to return success here so that criu can call plugins for renderD nodes */
1383
1382
return ret ;
@@ -1475,14 +1474,11 @@ int amdgpu_plugin_dump_file(int fd, int id)
1475
1474
1476
1475
xfree (buf );
1477
1476
1478
- exit :
1479
- /* Restore all queues if conditions permit */
1480
- kfd_checkpoint_fd = fd ;
1481
- decrement_checkpoint_count ();
1482
- if (checkpoint_is_complete ()) {
1483
- ret = unpause_process (fd );
1484
- }
1477
+ ret = record_dumped_fd (fd , false);
1478
+ if (ret )
1479
+ goto exit ;
1485
1480
1481
+ exit :
1486
1482
xfree ((void * )args .devices );
1487
1483
xfree ((void * )args .bos );
1488
1484
xfree ((void * )args .priv_data );
0 commit comments