@@ -58,13 +58,6 @@ struct vma_metadata {
58
58
59
59
/************************************ Global Variables ********************************************/
60
60
61
- /**
62
- * FD of KFD device used to checkpoint. On a multi-process
63
- * tree the order of checkpointing goes from parent to child
64
- * and so on - so saving the FD will not be overwritten
65
- */
66
- static int kfd_checkpoint_fd ;
67
-
68
61
static LIST_HEAD (update_vma_info_list );
69
62
70
63
size_t kfd_max_buffer_size ;
@@ -1050,28 +1043,34 @@ int restore_hsakmt_shared_mem(const uint64_t shared_mem_size, const uint32_t sha
1050
1043
return 0 ;
1051
1044
}
1052
1045
1053
- static int unpause_process (int fd )
1046
+ int amdgpu_unpause_processes (int pid )
1054
1047
{
1055
1048
int ret = 0 ;
1056
1049
struct kfd_ioctl_criu_args args = { 0 };
1050
+ struct list_head * l = get_dumped_fds ();
1051
+ struct dumped_fd * st ;
1057
1052
1058
- args .op = KFD_CRIU_OP_UNPAUSE ;
1053
+ list_for_each_entry (st , l , l ) {
1054
+ if (st -> is_drm ) {
1055
+ close (st -> fd );
1056
+ } else {
1057
+ args .op = KFD_CRIU_OP_UNPAUSE ;
1059
1058
1060
- ret = kmtIoctl (fd , AMDKFD_IOC_CRIU_OP , & args );
1061
- if (ret ) {
1062
- pr_perror ("Failed to unpause process" );
1063
- goto exit ;
1059
+ ret = kmtIoctl (st -> fd , AMDKFD_IOC_CRIU_OP , & args );
1060
+ if (ret ) {
1061
+ pr_perror ("Failed to unpause process" );
1062
+ goto exit ;
1063
+ }
1064
+ }
1064
1065
}
1065
1066
1066
- // Reset the KFD FD
1067
- kfd_checkpoint_fd = -1 ;
1068
- sys_close_drm_render_devices (& src_topology );
1069
-
1070
1067
exit :
1071
1068
pr_info ("Process unpaused %s (ret:%d)\n" , ret ? "Failed" : "Ok" , ret );
1069
+ clear_dumped_fds ();
1072
1070
1073
1071
return ret ;
1074
1072
}
1073
+ CR_PLUGIN_REGISTER_HOOK (CR_PLUGIN_HOOK__DUMP_DEVICES_LATE , amdgpu_unpause_processes )
1075
1074
1076
1075
int store_dmabuf_fd (int handle , int fd )
1077
1076
{
@@ -1404,9 +1403,6 @@ int amdgpu_plugin_dump_file(int fd, int id)
1404
1403
return -1 ;
1405
1404
}
1406
1405
1407
- /* Initialize number of device files that will be checkpointed */
1408
- init_gpu_count (& src_topology );
1409
-
1410
1406
/* Check whether this plugin was called for kfd or render nodes */
1411
1407
if (major (st .st_rdev ) != major (st_kfd .st_rdev ) || minor (st .st_rdev ) != 0 ) {
1412
1408
@@ -1418,11 +1414,9 @@ int amdgpu_plugin_dump_file(int fd, int id)
1418
1414
if (ret )
1419
1415
return ret ;
1420
1416
1421
- /* Invoke unpause process if needed */
1422
- decrement_checkpoint_count ();
1423
- if (checkpoint_is_complete ()) {
1424
- ret = unpause_process (kfd_checkpoint_fd );
1425
- }
1417
+ ret = record_dumped_fd (fd , true);
1418
+ if (ret )
1419
+ return ret ;
1426
1420
1427
1421
/* Need to return success here so that criu can call plugins for renderD nodes */
1428
1422
return ret ;
@@ -1520,14 +1514,11 @@ int amdgpu_plugin_dump_file(int fd, int id)
1520
1514
1521
1515
xfree (buf );
1522
1516
1523
- exit :
1524
- /* Restore all queues if conditions permit */
1525
- kfd_checkpoint_fd = fd ;
1526
- decrement_checkpoint_count ();
1527
- if (checkpoint_is_complete ()) {
1528
- ret = unpause_process (fd );
1529
- }
1517
+ ret = record_dumped_fd (fd , false);
1518
+ if (ret )
1519
+ goto exit ;
1530
1520
1521
+ exit :
1531
1522
xfree ((void * )args .devices );
1532
1523
xfree ((void * )args .bos );
1533
1524
xfree ((void * )args .priv_data );
0 commit comments