Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 12 additions & 5 deletions Makefile
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
CC := gcc
SRCEXT := c
HEADEREXT := h
CFLAGS := -Wall -Werror -Wmissing-prototypes -std=c11 -O2

DEBUG_FLAGS := -g -DDEBUG
CFLAGS := -Wall -Wmissing-prototypes -Wno-unused-result -std=c11 -O2 $(DEBUG_FLAGS) -fopenmp

NVCC := /usr/local/cuda/bin/nvcc
CUDAFLAGS :=
Expand All @@ -10,8 +12,10 @@ NVSRCEXT := cu
NVHEADEREXT := h

TARGET = ConjugateGradient
TARGET_DEBUG = debug_ConjugateGradient
SRCDIR := src
BUILDDIR := build
BUILDDIR_DEBUG = build_debug

SOURCES := $(shell find $(SRCDIR) -type f -name *.$(SRCEXT))
HEADERS = $(wildcard $(SRCDIR)/*.$(HEADEREXT))
Expand All @@ -35,23 +39,25 @@ LIBRARY_DIRS := $(MKLROOT)/lib/intel64
LFLAGS := $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir))
CFLAGS += $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir))
CFLAGS += -DMKL_ILP64 -m64 -I${MKLROOT}/include

# static liking to MKL
LDFLAGS := -Wl,--start-group $(LIBRARY_DIRS)/libmkl_intel_ilp64.a $(LIBRARY_DIRS)/libmkl_intel_thread.a $(LIBRARY_DIRS)/libmkl_core.a -Wl,--end-group
LDFLAGS += -liomp5 -lpthread -lm -ldl
LDFLAGS += -liomp5 -lpthread -lm -ldl -lstdc++

# add cuda flags
# -DMKL_ILP64 sets int to 64, has to be added to both gcc and nvcc
CUDAFLAGS := -L$(CUDALIBPATH) -lcuda -lcudart -lcublas -lcusparse -m64 -DMKL_ILP64
CUDAFLAGS := -L$(CUDALIBPATH) -lcuda -lcudart -lcublas -lcusparse -m64 -DMKL_ILP64 $(DEBUG_FLAGS)

$(TARGET): $(OBJECTS)
@echo " Linking..."
$(CC) $^ -o $(TARGET) $(LFLAGS) $(LDFLAGS) $(CUDAFLAGS)

$(BUILDDIR)/%.o: $(SRCDIR)/%.$(NVSRCEXT)
@mkdir -p $(@D)
$(NVCC) $(CUDAFLAGS) -c -o $@ $<

$(BUILDDIR)/%.o: $(SRCDIR)/%.$(SRCEXT)
@mkdir -p $(BUILDDIR)
@mkdir -p $(@D)
$(CC) $(CFLAGS) $(INC) -c -o $@ $<

valgrind:
Expand All @@ -60,8 +66,9 @@ valgrind:
clean:
@echo " Cleaning...";
@echo " $(RM) -r $(BUILDDIR) $(TARGET)"; $(RM) -r $(BUILDDIR) $(TARGET)
@echo " $(RM) -r $(BUILDDIR_DEBUG) $(TARGET_DEBUG)"; $(RM) -r $(BUILDDIR_DEBUG) $(TARGET_DEBUG)

cuda_info:
nvidia-smi -a

.PHONY: default all clean
.PHONY: default all clean debug cuda_info
Empty file modified README.rst
100644 → 100755
Empty file.
Empty file modified doc/arn_matrix.png
100644 → 100755
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Empty file modified doc/cg_conv_visual.png
100644 → 100755
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Empty file modified doc/comparison.png
100644 → 100755
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Empty file modified doc/crn_matrix.png
100644 → 100755
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Empty file modified doc/rnqa_matrix.png
100644 → 100755
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
11 changes: 8 additions & 3 deletions src/cg_solver.c → src/CPU/cg_solver.c
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
#include <stdlib.h>
#include "cg_solver.h"
#include "mkl.h"
#include "../misc/logger.h"
#include "../misc/math_functions.h"

#define FREE_STACK \
mkl_free(Ax);\
Expand All @@ -22,7 +24,7 @@ int conjugate_gradient_solver(Matrix *matrix, double *x_vec, double *rhs, double
* */

int k, max_iter;
double beta, tol, alpha, dTq, dot_zero, dot_new, dot_old;
double beta, tol, alpha, dTq, dot_zero, dot_new, dot_old, norm;
double *Ax, *x, *d, *q;
k = 0;
beta = 0.0;
Expand All @@ -33,21 +35,23 @@ int conjugate_gradient_solver(Matrix *matrix, double *x_vec, double *rhs, double
x = (double *)mkl_malloc(matrix->size * sizeof(double), 64);
q = (double *)mkl_malloc(matrix->size * sizeof(double), 64);
Ax = (double *)mkl_malloc(matrix->size * sizeof(double), 64);

/*Calculate Ax matrix*/
mkl_cspblas_dcsrgemv("N", &(matrix->size), matrix->val, matrix->I_column, matrix->J_row, x_vec, Ax);
norm = euclidean_norm(matrix->size, Ax); logger(LEVEL_DEBUG, "x_vec norm value: %f", norm);
/*Calculate rhs=rhs-Ax matrix*/
cblas_daxpy(matrix->size, -1.0, Ax, 1 , rhs, 1);
/*CG: Copy updated rhs (residuum) to d vector*/
cblas_dcopy(matrix->size, rhs, 1, d, 1);
norm = euclidean_norm(sizeof(double), d); logger(LEVEL_DEBUG, "d norm value: %f", norm);
/*CG: calculate dot r'*r, assign it to dot_new */
dot_new = cblas_ddot(matrix->size, rhs, 1 ,rhs, 1);
norm = euclidean_norm(sizeof(double), &dot_new); logger(LEVEL_DEBUG, "dot_new norm value: %f", dot_new);
/*assign dot_new to dot_zero*/
dot_zero = dot_new;
while (dot_new > tol * tol * dot_zero && k < max_iter) {
/*Calculate q=A*d vector*/
mkl_cspblas_dcsrgemv("N", &(matrix->size), matrix->val, matrix->I_column, matrix->J_row, d, q);
/*Calculate alpha:*/
/*Calculate alpha = dot_new/dTq */
dTq = cblas_ddot(matrix->size, d, 1, q, 1);
alpha = dot_new / dTq;
/*Calculate x=x+alpha*d*/
Expand All @@ -58,6 +62,7 @@ int conjugate_gradient_solver(Matrix *matrix, double *x_vec, double *rhs, double
dot_old = dot_new;
/*CG:Assign dot_new = r'*r*/
dot_new = cblas_ddot(matrix->size, rhs, 1, rhs, 1);
/* beta = d_new/d_old */
beta = dot_new / dot_old;
/*Scale beta*d*/
cblas_dscal(matrix->size, beta, d, 1);
Expand Down
2 changes: 1 addition & 1 deletion src/cg_solver.h → src/CPU/cg_solver.h
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/* Header file for Conjugate Gradient method */

#include "utils.h"
#include "../misc/utils.h"

int conjugate_gradient_solver(Matrix *matrix, double *x_vec, double *b_vec, double *res_vec);
3 changes: 2 additions & 1 deletion src/pcg_solver.c → src/CPU/pcg_solver.c
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include <stdlib.h>
#include "pcg_solver.h"
#include "mkl.h"
#include "../misc/logger.h"

#define FREE_STACK mkl_free(Ax);\
mkl_free(q);\
Expand Down Expand Up @@ -52,7 +53,7 @@ double* perform_preconditioning(Matrix *matrix, double *vec, pc_cfg *pc_config){
res[i] = vec[i] * pc_config->inverted_diagonal[i];
}
} else {
printf("No such preconditioner: %s, exiting", pc_config->preconditioner);
logger(LEVEL_ERROR, "No such preconditioner: %s, exiting", pc_config->preconditioner);
exit(1);
}
return res;
Expand Down
2 changes: 1 addition & 1 deletion src/pcg_solver.h → src/CPU/pcg_solver.h
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/* Header file for Preconditioned Conjugate Gradient method */

#include "utils.h"
#include "../misc/utils.h"

typedef struct {
char *preconditioner;
Expand Down
179 changes: 179 additions & 0 deletions src/GPU/cg_solver_gpu.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
/*Contains implementation for gpu_cg_solver functions.*/

#include <stdio.h>
#include <stdint.h>
#include "ckernels.h"
#include "mkl.h"

extern "C"
{
#include "../misc/utils.h"
#include "gpu_utils.h"
#include "cg_solver_gpu.h"
#include <cuda_runtime.h>
#include <cusparse_v2.h>
#include <cublas_v2.h>
#include "../misc/logger.h"
#include "../misc/math_functions.h"
}

#define threadsPerBlock 128
#define CHECK_FOR_STATUS(status) logger(LEVEL_DEBUG, "cublas status = %s", cublasGetErrorString(status))

#define COPY_TO_DEVICE(dest, source, size) cudaMemcpy(dest, source, size, cudaMemcpyHostToDevice)
#define COPY_TO_HOST(dest, source, size) cudaMemcpy(dest, source, size, cudaMemcpyDeviceToHost)

#define FREE_DEVICE_STACK \
cudaFree(d_r);\
cudaFree(d_helper);\
cudaFree(d_x);\
cudaFree(d_rhs);\
cudaFree(d_d);\
cudaFree(d_Ax);\
cudaFree(d_q);\
cudaFree(d_val);\
cudaFree(d_I);\
cudaFree(d_J);\
cudaFree(d_beta);\
cudaFree(d_alfa);\
cudaFree(d_alpha_zero);\
cudaFree(d_dot_new);\
cudaFree(d_norm);\
cudaFree(d_dot_zero);\
cudaFree(d_dot_old);\
cudaFree(d_dTq);


int gpu_conjugate_gradient_solver(MatrixInt *matrix, double *x_vec, double *rhs, double *res_vec, GPU_data *gpu_data){
/*Single GPU CG solver using (mostly) cublas and custom kernels*/
int *d_I = NULL, *d_J = NULL;
int k, max_iter;
double *d_beta, *d_alpha_zero, *d_alfa;
double *d_Ax, *d_x, *d_d, *d_q, *d_rhs, *d_r, *d_helper, *d_norm, *d_dot_new, *d_dot_zero, *d_dot_old, *d_dTq, *d_val;
const double tol = 1e-2f;

double h_dot = 0;
double h_dot_zero = 0;
double alpha = 1.0;
double alpham1 = -1.0;
double beta = 0.0;

d_alpha_zero = &alpham1;

k = 0;
max_iter = 200;

size_t size = matrix->size * sizeof(double);
size_t d_size = sizeof(double);

cusparseHandle_t cusparseHandle = 0;
cusparseCreate(&cusparseHandle);

cusparseMatDescr_t descr = 0;
cusparseCreateMatDescr(&descr);

cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);

logger(LEVEL_DEBUG, "%s", "Mallocing CUDA device memory");
cudaMalloc((double **)&d_r, size);
cudaMalloc((double **)&d_helper, size);
cudaMalloc((double **)&d_x, size);
cudaMalloc((double **)&d_rhs, size);
cudaMalloc((double **)&d_d, size);
cudaMalloc((double **)&d_Ax, size);
cudaMalloc((double **)&d_q, size);
cudaMalloc((double **)&d_alpha_zero, size);

cudaMalloc((double **)&d_val, matrix->non_zero * sizeof(double));
cudaMalloc((int **)&d_J, matrix->non_zero * sizeof(int));
cudaMalloc((int **)&d_I, (matrix->size + 1) * sizeof(int));

cudaMalloc((double **)&d_dTq, d_size);
cudaMalloc((double **)&d_beta, d_size);
cudaMalloc((double **)&d_alfa, d_size);
cudaMalloc((double **)&d_dot_new, d_size);
cudaMalloc((double **)&d_dot_zero, d_size);
cudaMalloc((double **)&d_norm, d_size);

logger(LEVEL_DEBUG, "%s", "Copying to device.");
COPY_TO_DEVICE(d_val, matrix->val, matrix->non_zero * sizeof(double));
COPY_TO_DEVICE(d_J, matrix->J_row, matrix->non_zero * sizeof(int));
COPY_TO_DEVICE(d_I, matrix->I_column, (matrix->size + 1) * sizeof(int));
COPY_TO_DEVICE(d_x, x_vec, matrix->size * sizeof(double));
COPY_TO_DEVICE(d_rhs, rhs, matrix->size * sizeof(double));

int blocksPerGrid = ((matrix->size + threadsPerBlock - 1) / threadsPerBlock );
while (blocksPerGrid % threadsPerBlock != 0){
blocksPerGrid++;
}

/*Calculate Ax matrix*/
cusparseDcsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
matrix->size, matrix->size, matrix->non_zero, &alpha, descr, d_val, d_I, d_J, d_x, &beta, d_Ax);

/*Calculate rhs=rhs-Ax matrix*/
cudaDaxpy<<<blocksPerGrid, threadsPerBlock>>>(matrix->size, d_alpha_zero, d_Ax, d_rhs);

/*CG: Copy updated rhs (residuum) to d vector*/
copy_device_vectors<<<blocksPerGrid, threadsPerBlock>>>(matrix->size, d_d, d_rhs);

/*CG: calculate dot r'*r, assign it to dot_new */
cudaDElementWiseMult<<<blocksPerGrid, threadsPerBlock>>>(matrix->size, d_rhs, d_rhs, d_helper);
cudaDvect_sum<<<blocksPerGrid, threadsPerBlock>>>(d_helper, d_dot_new, blocksPerGrid);

/*assign dot_new to dot_zero*/
copy_device_variables<<<1, gpu_data->devices[0].warp_size>>>(d_dot_zero, d_dot_new);

COPY_TO_HOST(&h_dot, d_dot_new, sizeof(double));
/*assign dot_new to dot_zero*/
h_dot_zero = h_dot;
printf("tol * tol * h_dot_zero = %f\n", tol * tol * h_dot_zero);
printf("h_dot = %f\n", h_dot);
while (h_dot > tol * tol * h_dot_zero && k < max_iter) {
/*Calculate q=A*d vector*/
cusparseDcsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, matrix->size, matrix->size, matrix->non_zero,
&alpha, descr, d_val, d_I, d_J, d_d, &beta, d_q);

/*Calculate dTq = d'*q*/
cudaDElementWiseMult<<<blocksPerGrid, threadsPerBlock>>>(matrix->size, d_d, d_q, d_helper);
/*Calculate alpha = dot_new/dTq*/
cudaDvect_sum<<<blocksPerGrid, threadsPerBlock>>>(d_helper, d_dTq, blocksPerGrid);
cudaDdiv_scalar<<<1, gpu_data->devices[0].warp_size>>>(d_alfa, d_dot_new, d_dTq);

double tmp;
double *p_tmp;
p_tmp = &tmp;
COPY_TO_HOST(p_tmp, d_dot_new, sizeof(double));
logger(LEVEL_DEBUG, "d_dot_new val = %f", tmp);
COPY_TO_HOST(p_tmp, d_dTq, sizeof(double));
logger(LEVEL_DEBUG, "d_dTq val = %f", tmp);
COPY_TO_HOST(p_tmp, d_alfa, sizeof(double));
logger(LEVEL_DEBUG, "d_alfa val = %f", tmp);

/*Calculate x=x+alpha*d*/
cudaDaxpy<<<blocksPerGrid, threadsPerBlock>>>(matrix->size, d_alfa, d_d, d_x);

/*Calculate r=r-alpha*q*/
cudaDaxpy<<<blocksPerGrid, threadsPerBlock>>>(matrix->size, d_alpha_zero, d_q, d_rhs);
/*Assign dot_old = dot_new*/
copy_device_variables<<<1, gpu_data->devices[0].warp_size>>>(d_dot_old, d_dot_new);

/*Calculate dot_new = r'*r*/
cudaDElementWiseMult<<<blocksPerGrid, threadsPerBlock>>>(matrix->size, d_rhs, d_rhs, d_helper);
cudaDvect_sum<<<blocksPerGrid, threadsPerBlock>>>(d_helper, d_dot_new, blocksPerGrid);

/*Calculate beta = d_new/d_old */
cudaDdiv_scalar<<<1, gpu_data->devices[0].warp_size>>>(d_beta, d_dot_new, d_dot_old);

/*CG:Calculate d=r+beta*d*/
cudaDaxpyz<<<blocksPerGrid, threadsPerBlock>>>(matrix->size, d_beta, d_d, d_d, d_r);
COPY_TO_HOST(&h_dot, d_dot_new, sizeof(double));
cudaThreadSynchronize();
k++;
}
cusparseDestroy(cusparseHandle);
cudaDeviceReset();
FREE_DEVICE_STACK
return k;
}
10 changes: 10 additions & 0 deletions src/GPU/cg_solver_gpu.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
/*Contains prototypes for gpu_cg_solver functions.*/

#ifndef CG_SOLVER_GPU_H
#define CG_SOLVER_GPU_H
#include "../misc/utils.h"
#include "gpu_utils.h"

int gpu_conjugate_gradient_solver(MatrixInt *matrix, double *x_vec, double *rhs, double *res_vec, GPU_data *gpu_data);

#endif
Loading