Commit 5b56eec

Debugging GPU implementation
1 parent e94ff3f commit 5b56eec

12 files changed, +171 -66 lines changed


Makefile

Lines changed: 6 additions & 2 deletions
@@ -40,11 +40,12 @@ LDFLAGS := -Wl,--start-group $(LIBRARY_DIRS)/libmkl_intel_ilp64.a $(LIBRARY_DIRS
 LDFLAGS += -liomp5 -lpthread -lm -ldl
 
 # add cuda flags
-LDFLAGS += -L$(CUDALIBPATH) -lcuda -lcudart -lcublas -lcusparse
+# -DMKL_ILP64 sets int to 64, has to be added to both gcc and nvcc
+CUDAFLAGS := -L$(CUDALIBPATH) -lcuda -lcudart -lcublas -lcusparse -m64 -DMKL_ILP64
 
 $(TARGET): $(OBJECTS)
 	@echo " Linking..."
-	$(CC) $^ -o $(TARGET) $(LFLAGS) $(LDFLAGS)
+	$(CC) $^ -o $(TARGET) $(LFLAGS) $(LDFLAGS) $(CUDAFLAGS)
 
 $(BUILDDIR)/%.o: $(SRCDIR)/%.$(NVSRCEXT)
 	$(NVCC) $(CUDAFLAGS) -c -o $@ $<
@@ -60,4 +61,7 @@ clean:
 	@echo " Cleaning...";
 	@echo " $(RM) -r $(BUILDDIR) $(TARGET)"; $(RM) -r $(BUILDDIR) $(TARGET)
 
+cuda_info:
+	nvidia-smi -a
+
 .PHONY: default all clean
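
A note on the new ``-DMKL_ILP64`` define in ``CUDAFLAGS``: under ILP64, MKL's integer type is 64-bit, so the flag has to reach both gcc and nvcc or the two halves of the build disagree on index widths. A minimal sketch of what the flag changes (not part of this commit; ``MKL_INT`` is defined by ``mkl.h``):

    #include <stdio.h>
    #include <mkl.h>  /* with -DMKL_ILP64 on the compile line, MKL_INT is 64-bit */

    int main(void) {
        /* If gcc-compiled and nvcc-compiled objects are built with different
           settings, index arrays passed across the boundary get misread. */
        printf("sizeof(MKL_INT) = %zu\n", sizeof(MKL_INT));
        return 0;
    }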

README.rst

Lines changed: 49 additions & 17 deletions
@@ -128,39 +128,71 @@ Examples can be found in ``scripts/ConjugateGradients/demo.py``
 Required Python 3.5+
 
 
-C implementation - TBA
+CPU/GPU implementation
 ----------------------
 
-MKL implementation
-~~~~~~~~~~~~~~~~~~
+Libraries and compilation
+~~~~~~~~~~~~~~~~~~~~~~~~~
 
-Required - Intel MKL library for BLAS operations. Implementation was tested on version 2017 though older should work as well.
-By default MKL will be installed in directory ``/opt/intel/mkl/``.
-
-To compile:
+Before compiling the code, make sure you have installed:
 
 ::
 
-    $ source mkl_setup.sh
-    $ make
+    1. Intel MKL library
+    2. Nvidia CUDA with the NVCC compiler
 
-``mkl_setup.sh`` sources MKL env configuration.
+The Intel MKL library is used for BLAS operations. The implementation was tested on version 2017, though older versions should work as well.
+By default MKL is installed in ``/opt/intel/mkl/``. Before compiling, make sure ``prepare_env.sh`` points to the proper
+MKL and CUDA library paths.
+
+In the Makefile, set accordingly:
 
 ::
 
-    If you have installed MKL in a different directory, then you will have to adjust mkl_setup.sh
-    script to point to proper paths.
-    Also please note that in makefile there is MKLROOT variable which points to MKL installation directory.
+    1. MKLROOT
+    2. NVCC
+    3. CUDALIBPATH
 
-By default MKL will be compiled as a static library.
-Since there are many dependencies it is good to set ``CFLAGS`` and ``LDFLAGS`` accordingly to MKL link line advisor:
+By default MKL is compiled in as a static library; CUDA is linked dynamically.
+``LDFLAGS`` sets the dependencies for MKL; please refer to the MKL link line advisor to make sure it is set properly:
 
 https://software.intel.com/en-us/articles/intel-mkl-link-line-advisor
 
+``CUDAFLAGS`` sets the CUDA libraries.
+
+``GCC`` is used to compile .c files, ``NVCC`` to compile .cu files. The whole project is linked by ``GCC``.
+
+To compile:
+
+::
+
+    $ source prepare_env.sh
+    $ make
+
 Use ``make clean`` command to delete compiled build.
 
-CUDA implementation - TBA
--------------------------
+Running ConjugateGradient
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Running the single-core CPU MKL implementation:
+
+``./ConjugateGradient -i input_matrix.txt``
+
+Running the multi-core CPU MKL implementation:
+
+``./ConjugateGradient -i input_matrix.txt -mt 4``
+
+Running the GPU implementation (only a single device is currently used):
+
+``./ConjugateGradient -i input_matrix.txt --gpu``
+
+::
+
+    If there are no CUDA devices, the CPU implementation will be launched.
+
+``input_matrix.txt`` is expected to contain a CSR-formatted matrix; various examples can be generated by the Python scripts.
+
 
 Conjugate Gradients description
 -------------------------------
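
For readers unfamiliar with the CSR layout mentioned above, here is a small illustration of the three arrays involved (generic names; the exact text-file format produced by the Python scripts is not reproduced here):

    /* CSR form of the 3x3 matrix
     *   [ 4 0 1 ]
     *   [ 0 3 0 ]
     *   [ 2 0 5 ]
     */
    double val[]     = {4.0, 1.0, 3.0, 2.0, 5.0};  /* non-zero values, row by row       */
    int    col_ind[] = {0, 2, 1, 0, 2};            /* column index of each value        */
    int    row_ptr[] = {0, 2, 3, 5};               /* start of each row in val, rows+1  */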

mkl_setup.sh

Lines changed: 0 additions & 2 deletions
This file was deleted.

prepare_env.sh

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
+source /opt/intel/mkl/bin/mklvars.sh intel64
+source /opt/intel/compilers_and_libraries/linux/bin/compilervars.sh intel64
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64
+export PATH=$PATH:/usr/local/cuda/bin

src/cg_colver_gpu.cu

Lines changed: 69 additions & 37 deletions
@@ -1,19 +1,21 @@
 /*Contains implementation for gpu_cg_solver functions.*/
 
-
-#include "cg_colver_gpu.h"
+#include <stdio.h>
 #include "ckernels.h"
 
+
 extern "C"
 {
-#include "gpu_utils.h"
 #include "utils.h"
-
+#include "gpu_utils.h"
+#include "cg_colver_gpu.h"
 #include <cuda_runtime.h>
 #include <cusparse_v2.h>
 #include <cublas_v2.h>
+}
 
 #define threadsPerBlock 256
+#define CHECK_FOR_STATUS(status) printf("cublas status = %s\n", cublasGetErrorString(status))
 
 #define FREE_DEVICE_STACK \
 	cudaFree(d_r);\
@@ -29,21 +31,20 @@ extern "C"
 	cudaFree(d_beta);\
 	cudaFree(d_alfa);\
 	cudaFree(d_alpha_zero);\
-	cudaFree(d_dot);\
+	cudaFree(d_dot_new);\
 	cudaFree(d_norm);\
 	cudaFree(d_dot_zero);\
 	cudaFree(d_dot_old);\
 	cudaFree(d_dTq);
 
 
-int gpu_conjugate_gradient_solver(Matrix *matrix, double *x_vec, double *rhs, double *res_vec, GPU_data gpu_data){
+int gpu_conjugate_gradient_solver(Matrix *matrix, double *x_vec, double *rhs, double *res_vec, GPU_data *gpu_data){
 	/*Single GPU CG solver using cublas*/
-
 	double *h_dot, *h_dot_zero;
 	int *d_I = NULL, *d_J = NULL;
 	const double tol = 1e-2f;
 	double *d_alfa, *d_beta, *d_alpha_zero;
-	double *d_Ax, *d_x, *d_d, *d_q, *d_rhs, *d_r, *d_helper, *d_norm, *d_dot, *d_dot_zero, *d_dot_old, *d_dTq, *d_val;
+	double *d_Ax, *d_x, *d_d, *d_q, *d_rhs, *d_r, *d_helper, *d_norm, *d_dot_new, *d_dot_zero, *d_dot_old, *d_dTq, *d_val;
 	int k, max_iter;
 
 	k = 0;
@@ -52,41 +53,55 @@ int gpu_conjugate_gradient_solver(Matrix *matrix, double *x_vec, double *rhs, do
 	max_iter = 200;
 
 	size_t size = matrix->size * sizeof(double);
+	size_t d_size = sizeof(double);
 
 	cusparseHandle_t cusparseHandle = 0;
-	cublasHandle_t cublasHandle = 0;
+	cusparseCreate(&cusparseHandle);
+
 	cusparseMatDescr_t descr = 0;
+	cusparseCreateMatDescr(&descr);
+
+	cublasHandle_t cublasHandle = 0;
+	cublasCreate(&cublasHandle);
 
 	cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
 	cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);
 
+	cublasStatus_t cublasStatus;
+
+	printf("Mallocing CUDA divice memory\n");
 	cudaMalloc((void **)&d_r, size);
 	cudaMalloc((void **)&d_helper, size);
 	cudaMalloc((void **)&d_x, size);
 	cudaMalloc((void **)&d_rhs, size);
 	cudaMalloc((void **)&d_d, size);
 	cudaMalloc((void **)&d_Ax, size);
 	cudaMalloc((void **)&d_q, size);
-
 	cudaMalloc((void **)&d_val, matrix->non_zero * sizeof(double));
-	cudaMalloc((void **)&d_J, matrix->non_zero * sizeof(double));
+	cudaMalloc((void **)&d_J, matrix->non_zero * sizeof(int));
 	cudaMalloc((void **)&d_I, (matrix->size + 1) * sizeof(int));
 
-	cudaMalloc((void **)&d_beta, sizeof(double));
-	cudaMalloc((void **)&d_alfa, sizeof(double));
-	cudaMalloc((void **)&d_alpha_zero, sizeof(double));
-	cudaMalloc((void **)&d_dot, sizeof(double));
-	cudaMalloc((void **)&d_dot_zero, sizeof(double));
-	cudaMalloc((void **)&d_norm, sizeof(double));
-
+	cudaMalloc((void **)&d_beta, d_size);
+	cudaMalloc((void **)&d_alfa, d_size);
+	cudaMalloc((void **)&d_alpha_zero, d_size);
+	cudaMalloc((void **)&d_dot_new, d_size);
+	cudaMalloc((void **)&d_dot_zero, d_size);
+	cudaMalloc((void **)&d_norm, d_size);
+
+	cudaMemset(d_beta, 0, d_size);
+	cudaMemset(d_alfa, 0, d_size);
+	cudaMemset(d_alpha_zero, 0, d_size);
+	cudaMemset(d_dot_new, 0, d_size);
+	cudaMemset(d_dot_zero, 0, d_size);
+	cudaMemset(d_norm, 0, d_size);
+
+	printf("Copying to device\n");
 	cudaMemcpy(d_val, matrix->val, matrix->non_zero * sizeof(double), cudaMemcpyHostToDevice);
 	cudaMemcpy(d_J, matrix->J_row, matrix->non_zero * sizeof(int), cudaMemcpyHostToDevice);
 	cudaMemcpy(d_I, matrix->I_column, (matrix->size + 1) * sizeof(int), cudaMemcpyHostToDevice);
-
 	cudaMemcpy(d_x, x_vec, size, cudaMemcpyHostToDevice);
-	cudaMemcpy(d_rhs, rhs, size, cudaMemcpyHostToDevice);
 
-	int blocksPerGrid = ((matrix->size + threadsPerBlock -1) / threadsPerBlock );
+	int blocksPerGrid = ((matrix->size + threadsPerBlock - 1) / threadsPerBlock );
 	while (blocksPerGrid % threadsPerBlock != 0){
 		blocksPerGrid++;
 	}
@@ -96,43 +111,60 @@ int gpu_conjugate_gradient_solver(Matrix *matrix, double *x_vec, double *rhs, do
 	const double one = 1.0;
 	const double minus_one = -1.0;
 	/*Calculate Ax matrix*/
+
 	cusparseDcsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, matrix->size, matrix->size, matrix->non_zero,
-			&alpha, descr, d_val, d_J, d_I, d_x, &beta, d_Ax);
+				   &alpha, descr, d_val, d_J, d_I, d_x, &beta, d_Ax);
 	/*Calculate rhs=rhs-Ax matrix*/
-	cublasDaxpy(cublasHandle, matrix->size, &minus_one, d_Ax, 1, d_rhs, 1);
+	cublasStatus = cublasDaxpy(cublasHandle, matrix->size, &minus_one, d_Ax, 1, d_rhs, 1);
+	CHECK_FOR_STATUS(cublasStatus);
+
 	/*CG: Copy updated rhs (residuum) to d vector*/
-	cublasDcopy(cublasHandle, matrix->size, d_d, 1, d_rhs, 1);
-	/*CG: calculate dot r'*r, assign it to dot_new */
-	cublasDdot(cublasHandle, matrix->size, d_rhs, 1, d_rhs, 1, d_dot);
+	cublasStatus = cublasDcopy(cublasHandle, matrix->size, d_d, 1, d_rhs, 1);
+	CHECK_FOR_STATUS(cublasStatus);
+
+	/*CG: calculate dot r'*r, assign it to d_dot_new */
+	cublasStatus = cublasDdot(cublasHandle, matrix->size, d_rhs, 1, d_rhs, 1, d_dot_new);
+	CHECK_FOR_STATUS(cublasStatus);
+
 	/*assign dot_new to dot_zero*/
-	d_dot_zero = d_dot;
-	cudaMemcpy(h_dot, d_dot, sizeof(double), cudaMemcpyDeviceToHost);
+	d_dot_zero = d_dot_new;
+	cudaMemcpy(h_dot, d_dot_new, sizeof(double), cudaMemcpyDeviceToHost);
 	cudaMemcpy(h_dot_zero, d_dot_zero, sizeof(double), cudaMemcpyDeviceToHost);
 	while ((*h_dot > tol * tol * *h_dot_zero) && (k < max_iter)) {
 		/*Calculate q=A*d vector*/
 		cusparseDcsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, matrix->size, matrix->size, matrix->non_zero,
 					   &alpha, descr, d_val, d_J, d_I, d_x, &beta, d_Ax);
 		/*Calculate alpha:*/
-		cublasDdot(cublasHandle, matrix->size, d_d, 1, d_q, 1, d_dTq);
-		sDdiv<<<1, gpu_data.devices[0].warp_size>>>(d_alfa, d_dot, d_dTq);
+		cublasStatus = cublasDdot(cublasHandle, matrix->size, d_d, 1, d_q, 1, d_dTq);
+		CHECK_FOR_STATUS(cublasStatus);
+
+		sDdiv<<<1, gpu_data->devices[0].warp_size>>>(d_alfa, d_dot_new, d_dTq);
 		/*Calculate x=x+alpha*d*/
-		cublasDaxpy(cublasHandle, matrix->size, d_alfa, d_x, 1, d_d, 1);
+		cublasStatus = cublasDaxpy(cublasHandle, matrix->size, d_alfa, d_x, 1, d_d, 1);
+		CHECK_FOR_STATUS(cublasStatus);
+
		/*Calculate r=r-alpha*q*/
 		axpy<<<blocksPerGrid, threadsPerBlock>>>(matrix->size, -1, d_q, d_rhs);
 		/*Assign dot_old = dot_new*/
-		cublasDcopy(cublasHandle, 1, d_dot_old, 1, d_dot, 1);
+		cublasStatus = cublasDcopy(cublasHandle, 1, d_dot_old, 1, d_dot_new, 1);
+		CHECK_FOR_STATUS(cublasStatus);
+
 		/*CG:Assign dot_new = r'*r*/
-		cublasDdot(cublasHandle, matrix->size, d_rhs, 1, d_rhs, 1, d_dot);
-		sDdiv<<<1, gpu_data.devices[0].warp_size>>>(d_beta, d_dot, d_dot_old);
+		cublasStatus = cublasDdot(cublasHandle, matrix->size, d_rhs, 1, d_rhs, 1, d_dot_new);
+		CHECK_FOR_STATUS(cublasStatus);
+
+		sDdiv<<<1, gpu_data->devices[0].warp_size>>>(d_beta, d_dot_new, d_dot_old);
 		/*Scale beta*d*/
-		cublasDscal(cublasHandle, matrix->size, d_beta, d_d, 1);
+		cublasStatus = cublasDscal(cublasHandle, matrix->size, d_beta, d_d, 1);
+		CHECK_FOR_STATUS(cublasStatus);
+
 		/*CG:Calculate d=r+beta*d*/
-		cublasDaxpy(cublasHandle, matrix->size, &one, d_rhs, 1, d_d, 1);
+		cublasStatus = cublasDaxpy(cublasHandle, matrix->size, &one, d_rhs, 1, d_d, 1);
+		CHECK_FOR_STATUS(cublasStatus);
 		k++;
 	}
 	cusparseDestroy(cusparseHandle);
 	cudaDeviceReset();
 	FREE_DEVICE_STACK
 	return k;
 }
-}
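
For reference, the cublas/cusparse calls and the ``sDdiv``/``axpy`` kernels in the loop above are meant to realize the standard conjugate gradient recurrence, written here in LaTeX (``d_dot_new``/``d_dot_old`` correspond to the delta values):

    \begin{align*}
    q_k &= A d_k \\
    \alpha_k &= \delta_{\mathrm{new}} / (d_k^T q_k) \\
    x_{k+1} &= x_k + \alpha_k d_k \\
    r_{k+1} &= r_k - \alpha_k q_k \\
    \delta_{\mathrm{old}} &= \delta_{\mathrm{new}}, \qquad \delta_{\mathrm{new}} = r_{k+1}^T r_{k+1} \\
    \beta_k &= \delta_{\mathrm{new}} / \delta_{\mathrm{old}} \\
    d_{k+1} &= r_{k+1} + \beta_k d_k
    \end{align*}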

src/cg_colver_gpu.h

Lines changed: 6 additions & 1 deletion
@@ -1,5 +1,10 @@
 /*Contains prototypes for gpu_cg_solver functions.*/
 
+#ifndef CG_SOLVER_GPU_H
+#define CG_SOLVER_GPU_H
 #include "utils.h"
+#include "gpu_utils.h"
 
-int gpu_conjugate_gradient_solver(Matrix *matrix, double *x_vec, double *rhs, double *res_vec);
+int gpu_conjugate_gradient_solver(Matrix *matrix, double *x_vec, double *rhs, double *res_vec, GPU_data *gpu_data);
+
+#endif

src/ckernels.cu

Lines changed: 18 additions & 0 deletions
@@ -1,6 +1,23 @@
 /*Contains implementation of custom kernels for CUDA devices.*/
 
 #include "ckernels.h"
+#include <cublas_v2.h>
+
+const char* cublasGetErrorString(cublasStatus_t status)
+{
+	switch(status)
+	{
+		case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
+		case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
+		case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
+		case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
+		case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
+		case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
+		case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
+		case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
+	}
+	return "unknown error";
+}
 
 
 __global__ void sDdiv(double *res, double *divided, double *divider) {
@@ -17,3 +34,4 @@ __global__ void axpy(int num_elements, double alpha, double *x, double *y) {
 		y[i] = y[i] + alpha * x[i];
 	}
 }
+

src/ckernels.h

Lines changed: 3 additions & 0 deletions
@@ -1,4 +1,7 @@
 /*Contains prototypes of custom kernels for CUDA devices.*/
 
+#include <cublas_v2.h>
+
+const char* cublasGetErrorString(cublasStatus_t status);
 __global__ void sDdiv(double *res, double *divided, double *divider);
 __global__ void axpy(int num_elements, double alpha, double *x, double *y);
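
As committed, the ``CHECK_FOR_STATUS`` macro in ``cg_colver_gpu.cu`` prints the status string after every cuBLAS call, successes included. A stricter variant that reports failures only could look like this (a sketch, not part of the commit; it reuses the ``cublasGetErrorString`` helper added above):

    /* Hypothetical alternative to the committed CHECK_FOR_STATUS macro:
       print only when the preceding cuBLAS call did not succeed. */
    #define CHECK_CUBLAS_FAILURE(status)                                         \
        do {                                                                     \
            if ((status) != CUBLAS_STATUS_SUCCESS)                               \
                printf("cublas error = %s\n", cublasGetErrorString(status));     \
        } while (0)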

src/gpu_utils.cu

Lines changed: 7 additions & 1 deletion
@@ -1,5 +1,7 @@
 /*Contains implementation of gpu_utils functions and structs.*/
 
+#include <stdio.h>
+
 extern "C" {
 #include <cuda_runtime.h>
 #include "gpu_utils.h"
@@ -10,7 +12,11 @@ GPU_data *get_gpu_devices_data(){
 	GPU_data *gpu_data;
 	gpu_data = (GPU_data *)malloc(sizeof(GPU_data));
 	gpu_data->devices_number = 0;
-	cudaGetDeviceCount(&gpu_data->devices_number);
+	cudaError_t device_error;
+	device_error = cudaGetDeviceCount(&gpu_data->devices_number);
+	if (device_error != cudaSuccess)
+		printf("Error - could not read properly number of device, err=[%s] \n", cudaGetErrorString(device_error));
+
 	if (gpu_data->devices_number != 0){
 		gpu_data->devices = (GPU_device *)malloc(gpu_data->devices_number * sizeof(GPU_device));
 		for (int i = 0; i < gpu_data->devices_number; i ++){
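
A hedged sketch of how a caller might wire the device query above to the GPU solver, based only on the signatures visible in this commit (the CPU fallback is implied by the README note; ``solve_on_gpu_if_available`` is a hypothetical helper, not code from the repository):

    #include <stdio.h>
    #include "utils.h"          /* Matrix                          */
    #include "gpu_utils.h"      /* GPU_data, get_gpu_devices_data  */
    #include "cg_colver_gpu.h"  /* gpu_conjugate_gradient_solver   */

    /* Returns the iteration count, or -1 when no CUDA device is present
       and the caller should fall back to the CPU (MKL) implementation. */
    static int solve_on_gpu_if_available(Matrix *matrix, double *x_vec,
                                         double *rhs, double *res_vec) {
        GPU_data *gpu_data = get_gpu_devices_data();
        if (gpu_data->devices_number == 0)
            return -1;
        int iterations = gpu_conjugate_gradient_solver(matrix, x_vec, rhs, res_vec, gpu_data);
        printf("GPU solver finished after %d iterations\n", iterations);
        return iterations;
    }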
