@@ -49,13 +49,6 @@ struct vma_metadata {
49
49
50
50
/************************************ Global Variables ********************************************/
51
51
52
- /**
53
- * FD of KFD device used to checkpoint. On a multi-process
54
- * tree the order of checkpointing goes from parent to child
55
- * and so on - so saving the FD will not be overwritten
56
- */
57
- static int kfd_checkpoint_fd ;
58
-
59
52
static LIST_HEAD (update_vma_info_list );
60
53
61
54
size_t kfd_max_buffer_size ;
@@ -1007,28 +1000,39 @@ int restore_hsakmt_shared_mem(const uint64_t shared_mem_size, const uint32_t sha
1007
1000
return 0 ;
1008
1001
}
1009
1002
1010
- static int unpause_process (int fd )
1003
+ int amdgpu_unpause_processes (int pid )
1011
1004
{
1012
1005
int ret = 0 ;
1013
1006
struct kfd_ioctl_criu_args args = { 0 };
1007
+ struct list_head * l = get_dumped_fds ();
1008
+ struct dumped_fd * st ;
1009
+
1010
+ list_for_each_entry (st , l , l ) {
1011
+ if (st -> is_drm ) {
1012
+ ret = amdgpu_plugin_drm_unpause_file (st -> fd );
1013
+ if (ret ) {
1014
+ pr_perror ("Failed to unpause drm device file" );
1015
+ goto exit ;
1016
+ }
1017
+ close (st -> fd );
1018
+ } else {
1019
+ args .op = KFD_CRIU_OP_UNPAUSE ;
1014
1020
1015
- args . op = KFD_CRIU_OP_UNPAUSE ;
1016
-
1017
- ret = kmtIoctl ( fd , AMDKFD_IOC_CRIU_OP , & args );
1018
- if ( ret ) {
1019
- pr_perror ( "Failed to unpause process" );
1020
- goto exit ;
1021
+ ret = kmtIoctl ( st -> fd , AMDKFD_IOC_CRIU_OP , & args ) ;
1022
+ if ( ret ) {
1023
+ pr_perror ( "Failed to unpause process" );
1024
+ goto exit ;
1025
+ }
1026
+ }
1021
1027
}
1022
1028
1023
- // Reset the KFD FD
1024
- kfd_checkpoint_fd = -1 ;
1025
- sys_close_drm_render_devices (& src_topology );
1026
-
1027
1029
exit :
1028
1030
pr_info ("Process unpaused %s (ret:%d)\n" , ret ? "Failed" : "Ok" , ret );
1031
+ clear_dumped_fds ();
1029
1032
1030
1033
return ret ;
1031
1034
}
1035
+ CR_PLUGIN_REGISTER_HOOK (CR_PLUGIN_HOOK__DUMP_DEVICE_LATE , amdgpu_unpause_processes )
1032
1036
1033
1037
static int save_devices (int fd , struct kfd_ioctl_criu_args * args , struct kfd_criu_device_bucket * device_buckets ,
1034
1038
CriuKfd * e )
@@ -1230,9 +1234,6 @@ int amdgpu_plugin_dump_file(int fd, int id)
1230
1234
return -1 ;
1231
1235
}
1232
1236
1233
- /* Initialize number of device files that will be checkpointed */
1234
- init_gpu_count (& src_topology );
1235
-
1236
1237
/* Check whether this plugin was called for kfd or render nodes */
1237
1238
if (major (st .st_rdev ) != major (st_kfd .st_rdev ) || minor (st .st_rdev ) != 0 ) {
1238
1239
@@ -1244,11 +1245,9 @@ int amdgpu_plugin_dump_file(int fd, int id)
1244
1245
if (ret )
1245
1246
return ret ;
1246
1247
1247
- /* Invoke unpause process if needed */
1248
- decrement_checkpoint_count ();
1249
- if (checkpoint_is_complete ()) {
1250
- ret = unpause_process (kfd_checkpoint_fd );
1251
- }
1248
+ ret = record_dumped_fd (fd , true);
1249
+ if (ret )
1250
+ return ret ;
1252
1251
1253
1252
/* Need to return success here so that criu can call plugins for renderD nodes */
1254
1253
return ret ;
@@ -1346,14 +1345,11 @@ int amdgpu_plugin_dump_file(int fd, int id)
1346
1345
1347
1346
xfree (buf );
1348
1347
1349
- exit :
1350
- /* Restore all queues if conditions permit */
1351
- kfd_checkpoint_fd = fd ;
1352
- decrement_checkpoint_count ();
1353
- if (checkpoint_is_complete ()) {
1354
- ret = unpause_process (fd );
1355
- }
1348
+ ret = record_dumped_fd (fd , false);
1349
+ if (ret )
1350
+ goto exit ;
1356
1351
1352
+ exit :
1357
1353
xfree ((void * )args .devices );
1358
1354
xfree ((void * )args .bos );
1359
1355
xfree ((void * )args .priv_data );
0 commit comments