p-sto · p-sto · Dec 22, 2018
diff --git a/Makefile b/Makefile
@@ -1,7 +1,9 @@
 CC := gcc
 SRCEXT := c
 HEADEREXT := h
-CFLAGS := -Wall -Werror -Wmissing-prototypes -std=c11 -O2
+
+DEBUG_FLAGS := -g -DDEBUG
+CFLAGS := -Wall -Wmissing-prototypes -Wno-unused-result -std=c11 -O2 $(DEBUG_FLAGS) -fopenmp
 
 NVCC := /usr/local/cuda/bin/nvcc
 CUDAFLAGS :=
@@ -10,8 +12,10 @@ NVSRCEXT := cu
 NVHEADEREXT := h
 
 TARGET = ConjugateGradient
+TARGET_DEBUG = debug_ConjugateGradient
 SRCDIR := src
 BUILDDIR := build
+BUILDDIR_DEBUG = build_debug
 
 SOURCES := $(shell find $(SRCDIR) -type f -name *.$(SRCEXT))
 HEADERS = $(wildcard $(SRCDIR)/*.$(HEADEREXT))
@@ -35,23 +39,25 @@ LIBRARY_DIRS := $(MKLROOT)/lib/intel64
 LFLAGS := $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir))
 CFLAGS += $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir))
 CFLAGS += -DMKL_ILP64 -m64 -I${MKLROOT}/include
+
 # static liking to MKL
 LDFLAGS := -Wl,--start-group $(LIBRARY_DIRS)/libmkl_intel_ilp64.a $(LIBRARY_DIRS)/libmkl_intel_thread.a $(LIBRARY_DIRS)/libmkl_core.a -Wl,--end-group
-LDFLAGS +=  -liomp5 -lpthread -lm -ldl
+LDFLAGS +=  -liomp5 -lpthread -lm -ldl -lstdc++
 
 # add cuda flags
 # -DMKL_ILP64 sets int to 64, has to be added to both gcc and nvcc
-CUDAFLAGS := -L$(CUDALIBPATH) -lcuda -lcudart -lcublas -lcusparse -m64 -DMKL_ILP64
+CUDAFLAGS := -L$(CUDALIBPATH) -lcuda -lcudart -lcublas -lcusparse -m64 -DMKL_ILP64 $(DEBUG_FLAGS)
 
 $(TARGET): $(OBJECTS)
 	@echo " Linking..."
 	$(CC) $^ -o $(TARGET) $(LFLAGS) $(LDFLAGS) $(CUDAFLAGS)
 
 $(BUILDDIR)/%.o: $(SRCDIR)/%.$(NVSRCEXT)
+	@mkdir -p $(@D)
 	$(NVCC) $(CUDAFLAGS) -c -o $@ $<
 
 $(BUILDDIR)/%.o: $(SRCDIR)/%.$(SRCEXT)
-	@mkdir -p $(BUILDDIR)
+	@mkdir -p $(@D)
 	$(CC) $(CFLAGS) $(INC) -c -o $@ $<
 
 valgrind:
@@ -60,8 +66,9 @@ valgrind:
 clean:
 	@echo " Cleaning...";
 	@echo " $(RM) -r $(BUILDDIR) $(TARGET)"; $(RM) -r $(BUILDDIR) $(TARGET)
+	@echo " $(RM) -r $(BUILDDIR_DEBUG) $(TARGET_DEBUG)"; $(RM) -r $(BUILDDIR_DEBUG) $(TARGET_DEBUG)
 
 cuda_info:
 	nvidia-smi -a
 
-.PHONY: default all clean
+.PHONY: default all clean debug cuda_info
diff --git a/README.rst b/README.rst
diff --git a/doc/arn_matrix.png b/doc/arn_matrix.png
diff --git a/doc/cg_conv_visual.png b/doc/cg_conv_visual.png
diff --git a/doc/comparison.png b/doc/comparison.png
diff --git a/doc/crn_matrix.png b/doc/crn_matrix.png
diff --git a/doc/rnqa_matrix.png b/doc/rnqa_matrix.png
diff --git a/src/cg_solver.c → src/CPU/cg_solver.c b/src/cg_solver.c → src/CPU/cg_solver.c
@@ -8,6 +8,8 @@
 #include <stdlib.h>
 #include "cg_solver.h"
 #include "mkl.h"
+#include "../misc/logger.h"
+#include "../misc/math_functions.h"
 
 #define FREE_STACK \
 	mkl_free(Ax);\
@@ -22,7 +24,7 @@ int conjugate_gradient_solver(Matrix *matrix, double *x_vec, double *rhs, double
 	 * */
 
 	int k, max_iter;
-	double beta, tol, alpha, dTq, dot_zero, dot_new, dot_old;
+	double beta, tol, alpha, dTq, dot_zero, dot_new, dot_old, norm;
 	double *Ax, *x, *d, *q;
 	k = 0;
 	beta = 0.0;
@@ -33,21 +35,23 @@ int conjugate_gradient_solver(Matrix *matrix, double *x_vec, double *rhs, double
 	x = (double *)mkl_malloc(matrix->size * sizeof(double), 64);
 	q = (double *)mkl_malloc(matrix->size * sizeof(double), 64);
 	Ax = (double *)mkl_malloc(matrix->size * sizeof(double), 64);
-
 	/*Calculate Ax matrix*/
 	mkl_cspblas_dcsrgemv("N", &(matrix->size), matrix->val, matrix->I_column, matrix->J_row, x_vec, Ax);
+	norm = euclidean_norm(matrix->size, Ax); logger(LEVEL_DEBUG, "x_vec norm value: %f", norm);
 	/*Calculate rhs=rhs-Ax matrix*/
 	cblas_daxpy(matrix->size, -1.0, Ax, 1 , rhs, 1);
 	/*CG: Copy updated rhs (residuum) to d vector*/
 	cblas_dcopy(matrix->size, rhs, 1, d, 1);
+    norm = euclidean_norm(sizeof(double), d); logger(LEVEL_DEBUG, "d norm value: %f", norm);
 	/*CG: calculate dot r'*r, assign it to dot_new */
 	dot_new = cblas_ddot(matrix->size, rhs, 1 ,rhs, 1);
+    norm = euclidean_norm(sizeof(double), &dot_new); logger(LEVEL_DEBUG, "dot_new norm value: %f", dot_new);
 	/*assign dot_new to dot_zero*/
 	dot_zero = dot_new;
 	while (dot_new >  tol * tol * dot_zero && k < max_iter) {
 		/*Calculate q=A*d vector*/
 		mkl_cspblas_dcsrgemv("N", &(matrix->size), matrix->val, matrix->I_column, matrix->J_row, d, q);
-		/*Calculate alpha:*/
+		/*Calculate alpha = dot_new/dTq */
 		dTq = cblas_ddot(matrix->size, d, 1, q, 1);
 		alpha = dot_new / dTq;
 		/*Calculate x=x+alpha*d*/
@@ -58,6 +62,7 @@ int conjugate_gradient_solver(Matrix *matrix, double *x_vec, double *rhs, double
 		dot_old = dot_new;
 		/*CG:Assign dot_new = r'*r*/
 		dot_new = cblas_ddot(matrix->size, rhs, 1, rhs, 1);
+		/* beta = d_new/d_old */
 		beta = dot_new / dot_old;
 		/*Scale beta*d*/
 		cblas_dscal(matrix->size, beta, d, 1);

diff --git a/src/cg_solver.h → src/CPU/cg_solver.h b/src/cg_solver.h → src/CPU/cg_solver.h
@@ -1,5 +1,5 @@
 /* Header file for Conjugate Gradient method */
 
-#include "utils.h"
+#include "../misc/utils.h"
 
 int conjugate_gradient_solver(Matrix *matrix, double *x_vec, double *b_vec, double *res_vec);
diff --git a/src/pcg_solver.c → src/CPU/pcg_solver.c b/src/pcg_solver.c → src/CPU/pcg_solver.c
@@ -9,6 +9,7 @@
 #include <stdlib.h>
 #include "pcg_solver.h"
 #include "mkl.h"
+#include "../misc/logger.h"
 
 #define FREE_STACK mkl_free(Ax);\
 	mkl_free(q);\
@@ -52,7 +53,7 @@ double* perform_preconditioning(Matrix *matrix, double *vec, pc_cfg *pc_config){
 			res[i] = vec[i] * pc_config->inverted_diagonal[i];
 		}
 	} else {
-		printf("No such preconditioner: %s, exiting", pc_config->preconditioner);
+		logger(LEVEL_ERROR, "No such preconditioner: %s, exiting", pc_config->preconditioner);
 		exit(1);
 	}
 	return res;

diff --git a/src/pcg_solver.h → src/CPU/pcg_solver.h b/src/pcg_solver.h → src/CPU/pcg_solver.h
@@ -1,6 +1,6 @@
 /* Header file for Preconditioned Conjugate Gradient method */
 
-#include "utils.h"
+#include "../misc/utils.h"
 
 typedef struct {
 	char *preconditioner;

diff --git a/src/GPU/cg_solver_gpu.cu b/src/GPU/cg_solver_gpu.cu
@@ -0,0 +1,179 @@
+/*Contains implementation for gpu_cg_solver functions.*/
+
+#include <stdio.h>
+#include <stdint.h>
+#include "ckernels.h"
+#include "mkl.h"
+
+extern "C"
+{
+#include "../misc/utils.h"
+#include "gpu_utils.h"
+#include "cg_solver_gpu.h"
+#include <cuda_runtime.h>
+#include <cusparse_v2.h>
+#include <cublas_v2.h>
+#include "../misc/logger.h"
+#include "../misc/math_functions.h"
+}
+
+#define threadsPerBlock 128
+#define CHECK_FOR_STATUS(status) logger(LEVEL_DEBUG, "cublas status = %s", cublasGetErrorString(status))
+
+#define COPY_TO_DEVICE(dest, source, size) cudaMemcpy(dest, source, size, cudaMemcpyHostToDevice)
+#define COPY_TO_HOST(dest, source, size) cudaMemcpy(dest, source, size, cudaMemcpyDeviceToHost)
+
+#define FREE_DEVICE_STACK \
+	cudaFree(d_r);\
+	cudaFree(d_helper);\
+	cudaFree(d_x);\
+	cudaFree(d_rhs);\
+	cudaFree(d_d);\
+	cudaFree(d_Ax);\
+	cudaFree(d_q);\
+	cudaFree(d_val);\
+	cudaFree(d_I);\
+	cudaFree(d_J);\
+	cudaFree(d_beta);\
+	cudaFree(d_alfa);\
+	cudaFree(d_alpha_zero);\
+	cudaFree(d_dot_new);\
+	cudaFree(d_norm);\
+	cudaFree(d_dot_zero);\
+	cudaFree(d_dot_old);\
+	cudaFree(d_dTq);
+
+
+int gpu_conjugate_gradient_solver(MatrixInt *matrix, double *x_vec, double *rhs, double *res_vec, GPU_data *gpu_data){
+	/*Single GPU CG solver using (mostly) cublas and custom kernels*/
+    int *d_I = NULL, *d_J = NULL;
+    int k, max_iter;
+	double *d_beta, *d_alpha_zero, *d_alfa;
+	double *d_Ax, *d_x, *d_d, *d_q, *d_rhs, *d_r, *d_helper, *d_norm, *d_dot_new, *d_dot_zero, *d_dot_old, *d_dTq, *d_val;
+    const double tol = 1e-2f;
+
+	double h_dot = 0;
+	double h_dot_zero = 0;
+	double alpha = 1.0;
+	double alpham1 = -1.0;
+	double beta = 0.0;
+
+    d_alpha_zero = &alpham1;
+
+    k = 0;
+	max_iter = 200;
+
+	size_t size = matrix->size * sizeof(double);
+	size_t d_size = sizeof(double);
+
+	cusparseHandle_t cusparseHandle = 0;
+	cusparseCreate(&cusparseHandle);
+
+	cusparseMatDescr_t descr = 0;
+	cusparseCreateMatDescr(&descr);
+
+	cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
+	cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);
+
+	logger(LEVEL_DEBUG, "%s", "Mallocing CUDA device memory");
+	cudaMalloc((double **)&d_r, size);
+	cudaMalloc((double **)&d_helper, size);
+	cudaMalloc((double **)&d_x, size);
+	cudaMalloc((double **)&d_rhs,	size);
+	cudaMalloc((double **)&d_d, size);
+	cudaMalloc((double **)&d_Ax, size);
+	cudaMalloc((double **)&d_q, size);
+    cudaMalloc((double **)&d_alpha_zero, size);
+
+	cudaMalloc((double **)&d_val, matrix->non_zero * sizeof(double));
+	cudaMalloc((int **)&d_J, matrix->non_zero * sizeof(int));
+	cudaMalloc((int **)&d_I, (matrix->size + 1) * sizeof(int));
+
+    cudaMalloc((double **)&d_dTq, d_size);
+    cudaMalloc((double **)&d_beta, d_size);
+	cudaMalloc((double **)&d_alfa, d_size);
+	cudaMalloc((double **)&d_dot_new, d_size);
+	cudaMalloc((double **)&d_dot_zero, d_size);
+	cudaMalloc((double **)&d_norm, d_size);
+
+    logger(LEVEL_DEBUG, "%s", "Copying to device.");
+	COPY_TO_DEVICE(d_val, matrix->val, matrix->non_zero * sizeof(double));
+	COPY_TO_DEVICE(d_J, matrix->J_row, matrix->non_zero * sizeof(int));
+	COPY_TO_DEVICE(d_I, matrix->I_column, (matrix->size + 1) * sizeof(int));
+	COPY_TO_DEVICE(d_x, x_vec, matrix->size * sizeof(double));
+	COPY_TO_DEVICE(d_rhs, rhs, matrix->size * sizeof(double));
+
+	int blocksPerGrid = ((matrix->size + threadsPerBlock - 1) / threadsPerBlock );
+	while (blocksPerGrid % threadsPerBlock != 0){
+		blocksPerGrid++;
+	}
+
+    /*Calculate Ax matrix*/
+	cusparseDcsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+                   matrix->size, matrix->size, matrix->non_zero,  &alpha, descr, d_val, d_I, d_J, d_x, &beta, d_Ax);
+
+    /*Calculate rhs=rhs-Ax matrix*/
+    cudaDaxpy<<<blocksPerGrid, threadsPerBlock>>>(matrix->size, d_alpha_zero, d_Ax, d_rhs);
+
+    /*CG: Copy updated rhs (residuum) to d vector*/
+    copy_device_vectors<<<blocksPerGrid, threadsPerBlock>>>(matrix->size, d_d, d_rhs);
+
+    /*CG: calculate dot r'*r, assign it to dot_new */
+	cudaDElementWiseMult<<<blocksPerGrid, threadsPerBlock>>>(matrix->size, d_rhs, d_rhs, d_helper);
+	cudaDvect_sum<<<blocksPerGrid, threadsPerBlock>>>(d_helper, d_dot_new, blocksPerGrid);
+
+    /*assign dot_new to dot_zero*/
+    copy_device_variables<<<1, gpu_data->devices[0].warp_size>>>(d_dot_zero, d_dot_new);
+
+    COPY_TO_HOST(&h_dot, d_dot_new, sizeof(double));
+    /*assign dot_new to dot_zero*/
+    h_dot_zero = h_dot;
+    printf("tol * tol * h_dot_zero = %f\n", tol * tol * h_dot_zero);
+    printf("h_dot = %f\n", h_dot);
+	while (h_dot >  tol * tol * h_dot_zero && k < max_iter) {
+		/*Calculate q=A*d vector*/
+		cusparseDcsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, matrix->size, matrix->size, matrix->non_zero,
+						  &alpha, descr, d_val, d_I, d_J, d_d, &beta, d_q);
+
+		/*Calculate dTq = d'*q*/
+        cudaDElementWiseMult<<<blocksPerGrid, threadsPerBlock>>>(matrix->size, d_d, d_q, d_helper);
+		/*Calculate alpha = dot_new/dTq*/
+        cudaDvect_sum<<<blocksPerGrid, threadsPerBlock>>>(d_helper, d_dTq, blocksPerGrid);
+        cudaDdiv_scalar<<<1, gpu_data->devices[0].warp_size>>>(d_alfa, d_dot_new, d_dTq);
+
+        double tmp;
+        double *p_tmp;
+        p_tmp = &tmp;
+		COPY_TO_HOST(p_tmp, d_dot_new, sizeof(double));
+		logger(LEVEL_DEBUG, "d_dot_new val = %f", tmp);
+		COPY_TO_HOST(p_tmp, d_dTq, sizeof(double));
+		logger(LEVEL_DEBUG, "d_dTq val = %f", tmp);
+		COPY_TO_HOST(p_tmp, d_alfa, sizeof(double));
+		logger(LEVEL_DEBUG, "d_alfa val = %f", tmp);
+
+		/*Calculate x=x+alpha*d*/
+		cudaDaxpy<<<blocksPerGrid, threadsPerBlock>>>(matrix->size, d_alfa, d_d, d_x);
+
+		/*Calculate r=r-alpha*q*/
+		cudaDaxpy<<<blocksPerGrid, threadsPerBlock>>>(matrix->size, d_alpha_zero, d_q, d_rhs);
+		/*Assign dot_old = dot_new*/
+        copy_device_variables<<<1, gpu_data->devices[0].warp_size>>>(d_dot_old, d_dot_new);
+
+		/*Calculate dot_new = r'*r*/
+        cudaDElementWiseMult<<<blocksPerGrid, threadsPerBlock>>>(matrix->size, d_rhs, d_rhs, d_helper);
+        cudaDvect_sum<<<blocksPerGrid, threadsPerBlock>>>(d_helper, d_dot_new, blocksPerGrid);
+
+        /*Calculate beta = d_new/d_old */
+        cudaDdiv_scalar<<<1, gpu_data->devices[0].warp_size>>>(d_beta, d_dot_new, d_dot_old);
+
+		/*CG:Calculate d=r+beta*d*/
+		cudaDaxpyz<<<blocksPerGrid, threadsPerBlock>>>(matrix->size, d_beta, d_d, d_d, d_r);
+        COPY_TO_HOST(&h_dot, d_dot_new, sizeof(double));
+        cudaThreadSynchronize();
+		k++;
+	}
+	cusparseDestroy(cusparseHandle);
+	cudaDeviceReset();
+	FREE_DEVICE_STACK
+	return k;
+}
diff --git a/src/GPU/cg_solver_gpu.h b/src/GPU/cg_solver_gpu.h
@@ -0,0 +1,10 @@
+/*Contains prototypes for gpu_cg_solver functions.*/
+
+#ifndef CG_SOLVER_GPU_H
+#define CG_SOLVER_GPU_H
+#include "../misc/utils.h"
+#include "gpu_utils.h"
+
+int gpu_conjugate_gradient_solver(MatrixInt *matrix, double *x_vec, double *rhs, double *res_vec, GPU_data *gpu_data);
+
+#endif