/* Contains implementation for gpu_cg_solver functions. */

#include <stdio.h>
#include "ckernels.h"

extern "C"
{
#include "utils.h"
#include "gpu_utils.h"
#include "cg_colver_gpu.h"
#include <cuda_runtime.h>
#include <cusparse_v2.h>
#include <cublas_v2.h>
}

#define threadsPerBlock 256
#define CHECK_FOR_STATUS(status) printf("cublas status = %s\n", cublasGetErrorString(status))

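/* Note: cuBLAS itself does not ship a cublasGetErrorString() function (newer toolkits
 * expose cublasGetStatusString() instead), so the helper used by CHECK_FOR_STATUS is
 * assumed to be declared in gpu_utils.h. A minimal sketch of such a helper
 * (hypothetical, for illustration only) could look like:
 *
 *   static const char *cublasGetErrorString(cublasStatus_t status)
 *   {
 *       switch (status) {
 *           case CUBLAS_STATUS_SUCCESS:          return "CUBLAS_STATUS_SUCCESS";
 *           case CUBLAS_STATUS_NOT_INITIALIZED:  return "CUBLAS_STATUS_NOT_INITIALIZED";
 *           case CUBLAS_STATUS_ALLOC_FAILED:     return "CUBLAS_STATUS_ALLOC_FAILED";
 *           case CUBLAS_STATUS_INVALID_VALUE:    return "CUBLAS_STATUS_INVALID_VALUE";
 *           case CUBLAS_STATUS_ARCH_MISMATCH:    return "CUBLAS_STATUS_ARCH_MISMATCH";
 *           case CUBLAS_STATUS_MAPPING_ERROR:    return "CUBLAS_STATUS_MAPPING_ERROR";
 *           case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
 *           case CUBLAS_STATUS_INTERNAL_ERROR:   return "CUBLAS_STATUS_INTERNAL_ERROR";
 *           default:                             return "unknown cuBLAS status";
 *       }
 *   }
 */
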
#define FREE_DEVICE_STACK \
    cudaFree(d_r);\
    /* @@ -29,21 +31,20 @@ (intervening lines not shown in this commit view) */ \
    cudaFree(d_beta);\
    cudaFree(d_alfa);\
    cudaFree(d_alpha_zero);\
    cudaFree(d_dot_new);\
    cudaFree(d_norm);\
    cudaFree(d_dot_zero);\
    cudaFree(d_dot_old);\
    cudaFree(d_dTq);


int gpu_conjugate_gradient_solver(Matrix *matrix, double *x_vec, double *rhs, double *res_vec, GPU_data *gpu_data){
    /* Single GPU CG solver using cublas */
    double *h_dot, *h_dot_zero;
    int *d_I = NULL, *d_J = NULL;
    const double tol = 1e-2f;
    double *d_alfa, *d_beta, *d_alpha_zero;
    double *d_Ax, *d_x, *d_d, *d_q, *d_rhs, *d_r, *d_helper, *d_norm, *d_dot_new, *d_dot_zero, *d_dot_old, *d_dTq, *d_val;
    int k, max_iter;

    k = 0;
    /* @@ -52,41 +53,55 @@ (intervening lines not shown in this commit view) */
    max_iter = 200;

    size_t size = matrix->size * sizeof(double);
    size_t d_size = sizeof(double);

    cusparseHandle_t cusparseHandle = 0;
    cusparseCreate(&cusparseHandle);

    cusparseMatDescr_t descr = 0;
    cusparseCreateMatDescr(&descr);

    cublasHandle_t cublasHandle = 0;
    cublasCreate(&cublasHandle);

    cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);

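    /* Note (assumption): cublasDdot() below writes its results to device pointers
     * (d_dot_new, d_dTq), and d_alfa/d_beta are passed to cuBLAS as device-side scalars.
     * That is only valid when the handle is in device pointer mode, e.g.:
     *
     *   cublasSetPointerMode(cublasHandle, CUBLAS_POINTER_MODE_DEVICE);
     *
     * Host-side scalars such as &one and &minus_one instead require the default
     * CUBLAS_POINTER_MODE_HOST, so the pointer mode presumably has to be toggled
     * around the corresponding calls. */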
    cublasStatus_t cublasStatus;

    printf("Mallocing CUDA device memory\n");
    cudaMalloc((void **)&d_r, size);
    cudaMalloc((void **)&d_helper, size);
    cudaMalloc((void **)&d_x, size);
    cudaMalloc((void **)&d_rhs, size);
    cudaMalloc((void **)&d_d, size);
    cudaMalloc((void **)&d_Ax, size);
    cudaMalloc((void **)&d_q, size);
    cudaMalloc((void **)&d_val, matrix->non_zero * sizeof(double));
    cudaMalloc((void **)&d_J, matrix->non_zero * sizeof(int));
    cudaMalloc((void **)&d_I, (matrix->size + 1) * sizeof(int));

    cudaMalloc((void **)&d_beta, d_size);
    cudaMalloc((void **)&d_alfa, d_size);
    cudaMalloc((void **)&d_alpha_zero, d_size);
    cudaMalloc((void **)&d_dot_new, d_size);
    cudaMalloc((void **)&d_dot_zero, d_size);
    cudaMalloc((void **)&d_norm, d_size);

    cudaMemset(d_beta, 0, d_size);
    cudaMemset(d_alfa, 0, d_size);
    cudaMemset(d_alpha_zero, 0, d_size);
    cudaMemset(d_dot_new, 0, d_size);
    cudaMemset(d_dot_zero, 0, d_size);
    cudaMemset(d_norm, 0, d_size);

    printf("Copying to device\n");
    cudaMemcpy(d_val, matrix->val, matrix->non_zero * sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(d_J, matrix->J_row, matrix->non_zero * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_I, matrix->I_column, (matrix->size + 1) * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_x, x_vec, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_rhs, rhs, size, cudaMemcpyHostToDevice);

    int blocksPerGrid = ((matrix->size + threadsPerBlock - 1) / threadsPerBlock);
    while (blocksPerGrid % threadsPerBlock != 0){
        blocksPerGrid++;
    }
    /* @@ -96,43 +111,60 @@ (intervening lines not shown in this commit view) */
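    /* The sDdiv and axpy kernels launched below are assumed to be declared in ckernels.h.
     * Hypothetical sketches of what they are expected to do (a single scalar division on
     * the device, and y = a*x + y), for illustration only:
     *
     *   __global__ void sDdiv(double *res, const double *num, const double *den)
     *   {
     *       if (blockIdx.x == 0 && threadIdx.x == 0)
     *           *res = *num / *den;
     *   }
     *
     *   __global__ void axpy(int n, double a, const double *x, double *y)
     *   {
     *       int i = blockIdx.x * blockDim.x + threadIdx.x;
     *       if (i < n)
     *           y[i] += a * x[i];
     *   }
     */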
    const double one = 1.0;
    const double minus_one = -1.0;

    /* Calculate Ax = A*x */
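    /* Note: cusparseDcsrmv() is the legacy cuSPARSE SpMV interface; it was deprecated and
     * later removed in newer CUDA toolkits, where cusparseSpMV() replaces it. The calls
     * below therefore assume a toolkit version that still ships the legacy API. */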
    cusparseDcsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, matrix->size, matrix->size, matrix->non_zero,
                   &alpha, descr, d_val, d_J, d_I, d_x, &beta, d_Ax);

    /* Calculate rhs = rhs - Ax */
    cublasStatus = cublasDaxpy(cublasHandle, matrix->size, &minus_one, d_Ax, 1, d_rhs, 1);
    CHECK_FOR_STATUS(cublasStatus);

    /* CG: copy the updated rhs (residual) into the d vector */
    cublasStatus = cublasDcopy(cublasHandle, matrix->size, d_rhs, 1, d_d, 1);
    CHECK_FOR_STATUS(cublasStatus);

    /* CG: calculate the dot product r'*r and store it in d_dot_new */
    cublasStatus = cublasDdot(cublasHandle, matrix->size, d_rhs, 1, d_rhs, 1, d_dot_new);
    CHECK_FOR_STATUS(cublasStatus);

    /* Assign dot_zero = dot_new (squared norm of the initial residual) */
    cudaMemcpy(d_dot_zero, d_dot_new, sizeof(double), cudaMemcpyDeviceToDevice);
    cudaMemcpy(h_dot, d_dot_new, sizeof(double), cudaMemcpyDeviceToHost);
    cudaMemcpy(h_dot_zero, d_dot_zero, sizeof(double), cudaMemcpyDeviceToHost);
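    /* Iterate until the squared residual norm falls below tol^2 times its initial
     * value, or until max_iter iterations have been performed. */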
    while ((*h_dot > tol * tol * *h_dot_zero) && (k < max_iter)) {
        /* Calculate q = A*d */
        cusparseDcsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, matrix->size, matrix->size, matrix->non_zero,
                       &alpha, descr, d_val, d_J, d_I, d_d, &beta, d_q);

        /* Calculate alpha = dot_new / (d'*q) */
        cublasStatus = cublasDdot(cublasHandle, matrix->size, d_d, 1, d_q, 1, d_dTq);
        CHECK_FOR_STATUS(cublasStatus);

        sDdiv<<<1, gpu_data->devices[0].warp_size>>>(d_alfa, d_dot_new, d_dTq);

        /* Calculate x = x + alpha*d */
        cublasStatus = cublasDaxpy(cublasHandle, matrix->size, d_alfa, d_d, 1, d_x, 1);
        CHECK_FOR_STATUS(cublasStatus);

        /* Calculate r = r - alpha*q */
        axpy<<<blocksPerGrid, threadsPerBlock>>>(matrix->size, -1, d_q, d_rhs);

        /* Assign dot_old = dot_new */
        cublasStatus = cublasDcopy(cublasHandle, 1, d_dot_new, 1, d_dot_old, 1);
        CHECK_FOR_STATUS(cublasStatus);

        /* CG: assign dot_new = r'*r */
        cublasStatus = cublasDdot(cublasHandle, matrix->size, d_rhs, 1, d_rhs, 1, d_dot_new);
        CHECK_FOR_STATUS(cublasStatus);

        /* Calculate beta = dot_new / dot_old */
        sDdiv<<<1, gpu_data->devices[0].warp_size>>>(d_beta, d_dot_new, d_dot_old);

        /* Scale d by beta */
        cublasStatus = cublasDscal(cublasHandle, matrix->size, d_beta, d_d, 1);
        CHECK_FOR_STATUS(cublasStatus);

        /* CG: calculate d = r + beta*d */
        cublasStatus = cublasDaxpy(cublasHandle, matrix->size, &one, d_rhs, 1, d_d, 1);
        CHECK_FOR_STATUS(cublasStatus);

        /* Refresh the host copy of dot_new so the loop condition sees the current residual */
        cudaMemcpy(h_dot, d_dot_new, sizeof(double), cudaMemcpyDeviceToHost);
        k++;
    }
    cusparseDestroy(cusparseHandle);
    cudaDeviceReset();
    FREE_DEVICE_STACK
    return k;
}
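
/* Hypothetical usage sketch (types assumed from this file; how Matrix and GPU_data get
 * populated is outside this file):
 *
 *   Matrix A;            with CSR fields val, J_row, I_column, size, non_zero filled in
 *   GPU_data gpu_data;   with devices[0].warp_size filled in
 *   double *x, *b, *res; host vectors of length A.size
 *
 *   int iters = gpu_conjugate_gradient_solver(&A, x, b, res, &gpu_data);
 *   printf("CG stopped after %d iterations\n", iters);
 */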