diff --git a/Makefile b/Makefile old mode 100644 new mode 100755 index 1e8a99a..aa6dc15 --- a/Makefile +++ b/Makefile @@ -1,7 +1,9 @@ CC := gcc SRCEXT := c HEADEREXT := h -CFLAGS := -Wall -Werror -Wmissing-prototypes -std=c11 -O2 + +DEBUG_FLAGS := -g -DDEBUG +CFLAGS := -Wall -Wmissing-prototypes -Wno-unused-result -std=c11 -O2 $(DEBUG_FLAGS) -fopenmp NVCC := /usr/local/cuda/bin/nvcc CUDAFLAGS := @@ -10,8 +12,10 @@ NVSRCEXT := cu NVHEADEREXT := h TARGET = ConjugateGradient +TARGET_DEBUG = debug_ConjugateGradient SRCDIR := src BUILDDIR := build +BUILDDIR_DEBUG = build_debug SOURCES := $(shell find $(SRCDIR) -type f -name *.$(SRCEXT)) HEADERS = $(wildcard $(SRCDIR)/*.$(HEADEREXT)) @@ -35,23 +39,25 @@ LIBRARY_DIRS := $(MKLROOT)/lib/intel64 LFLAGS := $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir)) CFLAGS += $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir)) CFLAGS += -DMKL_ILP64 -m64 -I${MKLROOT}/include + # static liking to MKL LDFLAGS := -Wl,--start-group $(LIBRARY_DIRS)/libmkl_intel_ilp64.a $(LIBRARY_DIRS)/libmkl_intel_thread.a $(LIBRARY_DIRS)/libmkl_core.a -Wl,--end-group -LDFLAGS += -liomp5 -lpthread -lm -ldl +LDFLAGS += -liomp5 -lpthread -lm -ldl -lstdc++ # add cuda flags # -DMKL_ILP64 sets int to 64, has to be added to both gcc and nvcc -CUDAFLAGS := -L$(CUDALIBPATH) -lcuda -lcudart -lcublas -lcusparse -m64 -DMKL_ILP64 +CUDAFLAGS := -L$(CUDALIBPATH) -lcuda -lcudart -lcublas -lcusparse -m64 -DMKL_ILP64 $(DEBUG_FLAGS) $(TARGET): $(OBJECTS) @echo " Linking..." $(CC) $^ -o $(TARGET) $(LFLAGS) $(LDFLAGS) $(CUDAFLAGS) $(BUILDDIR)/%.o: $(SRCDIR)/%.$(NVSRCEXT) + @mkdir -p $(@D) $(NVCC) $(CUDAFLAGS) -c -o $@ $< $(BUILDDIR)/%.o: $(SRCDIR)/%.$(SRCEXT) - @mkdir -p $(BUILDDIR) + @mkdir -p $(@D) $(CC) $(CFLAGS) $(INC) -c -o $@ $< valgrind: @@ -60,8 +66,9 @@ valgrind: clean: @echo " Cleaning..."; @echo " $(RM) -r $(BUILDDIR) $(TARGET)"; $(RM) -r $(BUILDDIR) $(TARGET) + @echo " $(RM) -r $(BUILDDIR_DEBUG) $(TARGET_DEBUG)"; $(RM) -r $(BUILDDIR_DEBUG) $(TARGET_DEBUG) cuda_info: nvidia-smi -a -.PHONY: default all clean \ No newline at end of file +.PHONY: default all clean debug cuda_info \ No newline at end of file diff --git a/README.rst b/README.rst old mode 100644 new mode 100755 diff --git a/doc/arn_matrix.png b/doc/arn_matrix.png old mode 100644 new mode 100755 diff --git a/doc/cg_conv_visual.png b/doc/cg_conv_visual.png old mode 100644 new mode 100755 diff --git a/doc/comparison.png b/doc/comparison.png old mode 100644 new mode 100755 diff --git a/doc/crn_matrix.png b/doc/crn_matrix.png old mode 100644 new mode 100755 diff --git a/doc/rnqa_matrix.png b/doc/rnqa_matrix.png old mode 100644 new mode 100755 diff --git a/src/cg_solver.c b/src/CPU/cg_solver.c old mode 100644 new mode 100755 similarity index 80% rename from src/cg_solver.c rename to src/CPU/cg_solver.c index c948c18..8909df6 --- a/src/cg_solver.c +++ b/src/CPU/cg_solver.c @@ -8,6 +8,8 @@ #include #include "cg_solver.h" #include "mkl.h" +#include "../misc/logger.h" +#include "../misc/math_functions.h" #define FREE_STACK \ mkl_free(Ax);\ @@ -22,7 +24,7 @@ int conjugate_gradient_solver(Matrix *matrix, double *x_vec, double *rhs, double * */ int k, max_iter; - double beta, tol, alpha, dTq, dot_zero, dot_new, dot_old; + double beta, tol, alpha, dTq, dot_zero, dot_new, dot_old, norm; double *Ax, *x, *d, *q; k = 0; beta = 0.0; @@ -33,21 +35,23 @@ int conjugate_gradient_solver(Matrix *matrix, double *x_vec, double *rhs, double x = (double *)mkl_malloc(matrix->size * sizeof(double), 64); q = (double *)mkl_malloc(matrix->size * sizeof(double), 64); Ax = (double *)mkl_malloc(matrix->size * sizeof(double), 64); - /*Calculate Ax matrix*/ mkl_cspblas_dcsrgemv("N", &(matrix->size), matrix->val, matrix->I_column, matrix->J_row, x_vec, Ax); + norm = euclidean_norm(matrix->size, Ax); logger(LEVEL_DEBUG, "x_vec norm value: %f", norm); /*Calculate rhs=rhs-Ax matrix*/ cblas_daxpy(matrix->size, -1.0, Ax, 1 , rhs, 1); /*CG: Copy updated rhs (residuum) to d vector*/ cblas_dcopy(matrix->size, rhs, 1, d, 1); + norm = euclidean_norm(sizeof(double), d); logger(LEVEL_DEBUG, "d norm value: %f", norm); /*CG: calculate dot r'*r, assign it to dot_new */ dot_new = cblas_ddot(matrix->size, rhs, 1 ,rhs, 1); + norm = euclidean_norm(sizeof(double), &dot_new); logger(LEVEL_DEBUG, "dot_new norm value: %f", dot_new); /*assign dot_new to dot_zero*/ dot_zero = dot_new; while (dot_new > tol * tol * dot_zero && k < max_iter) { /*Calculate q=A*d vector*/ mkl_cspblas_dcsrgemv("N", &(matrix->size), matrix->val, matrix->I_column, matrix->J_row, d, q); - /*Calculate alpha:*/ + /*Calculate alpha = dot_new/dTq */ dTq = cblas_ddot(matrix->size, d, 1, q, 1); alpha = dot_new / dTq; /*Calculate x=x+alpha*d*/ @@ -58,6 +62,7 @@ int conjugate_gradient_solver(Matrix *matrix, double *x_vec, double *rhs, double dot_old = dot_new; /*CG:Assign dot_new = r'*r*/ dot_new = cblas_ddot(matrix->size, rhs, 1, rhs, 1); + /* beta = d_new/d_old */ beta = dot_new / dot_old; /*Scale beta*d*/ cblas_dscal(matrix->size, beta, d, 1); diff --git a/src/cg_solver.h b/src/CPU/cg_solver.h old mode 100644 new mode 100755 similarity index 84% rename from src/cg_solver.h rename to src/CPU/cg_solver.h index eb2bb18..3e5dc04 --- a/src/cg_solver.h +++ b/src/CPU/cg_solver.h @@ -1,5 +1,5 @@ /* Header file for Conjugate Gradient method */ -#include "utils.h" +#include "../misc/utils.h" int conjugate_gradient_solver(Matrix *matrix, double *x_vec, double *b_vec, double *res_vec); diff --git a/src/pcg_solver.c b/src/CPU/pcg_solver.c old mode 100644 new mode 100755 similarity index 97% rename from src/pcg_solver.c rename to src/CPU/pcg_solver.c index 5824fc4..ca87f9f --- a/src/pcg_solver.c +++ b/src/CPU/pcg_solver.c @@ -9,6 +9,7 @@ #include #include "pcg_solver.h" #include "mkl.h" +#include "../misc/logger.h" #define FREE_STACK mkl_free(Ax);\ mkl_free(q);\ @@ -52,7 +53,7 @@ double* perform_preconditioning(Matrix *matrix, double *vec, pc_cfg *pc_config){ res[i] = vec[i] * pc_config->inverted_diagonal[i]; } } else { - printf("No such preconditioner: %s, exiting", pc_config->preconditioner); + logger(LEVEL_ERROR, "No such preconditioner: %s, exiting", pc_config->preconditioner); exit(1); } return res; diff --git a/src/pcg_solver.h b/src/CPU/pcg_solver.h old mode 100644 new mode 100755 similarity index 94% rename from src/pcg_solver.h rename to src/CPU/pcg_solver.h index e19f6e6..e08e65c --- a/src/pcg_solver.h +++ b/src/CPU/pcg_solver.h @@ -1,6 +1,6 @@ /* Header file for Preconditioned Conjugate Gradient method */ -#include "utils.h" +#include "../misc/utils.h" typedef struct { char *preconditioner; diff --git a/src/GPU/cg_solver_gpu.cu b/src/GPU/cg_solver_gpu.cu new file mode 100755 index 0000000..c76da11 --- /dev/null +++ b/src/GPU/cg_solver_gpu.cu @@ -0,0 +1,179 @@ +/*Contains implementation for gpu_cg_solver functions.*/ + +#include +#include +#include "ckernels.h" +#include "mkl.h" + +extern "C" +{ +#include "../misc/utils.h" +#include "gpu_utils.h" +#include "cg_solver_gpu.h" +#include +#include +#include +#include "../misc/logger.h" +#include "../misc/math_functions.h" +} + +#define threadsPerBlock 128 +#define CHECK_FOR_STATUS(status) logger(LEVEL_DEBUG, "cublas status = %s", cublasGetErrorString(status)) + +#define COPY_TO_DEVICE(dest, source, size) cudaMemcpy(dest, source, size, cudaMemcpyHostToDevice) +#define COPY_TO_HOST(dest, source, size) cudaMemcpy(dest, source, size, cudaMemcpyDeviceToHost) + +#define FREE_DEVICE_STACK \ + cudaFree(d_r);\ + cudaFree(d_helper);\ + cudaFree(d_x);\ + cudaFree(d_rhs);\ + cudaFree(d_d);\ + cudaFree(d_Ax);\ + cudaFree(d_q);\ + cudaFree(d_val);\ + cudaFree(d_I);\ + cudaFree(d_J);\ + cudaFree(d_beta);\ + cudaFree(d_alfa);\ + cudaFree(d_alpha_zero);\ + cudaFree(d_dot_new);\ + cudaFree(d_norm);\ + cudaFree(d_dot_zero);\ + cudaFree(d_dot_old);\ + cudaFree(d_dTq); + + +int gpu_conjugate_gradient_solver(MatrixInt *matrix, double *x_vec, double *rhs, double *res_vec, GPU_data *gpu_data){ + /*Single GPU CG solver using (mostly) cublas and custom kernels*/ + int *d_I = NULL, *d_J = NULL; + int k, max_iter; + double *d_beta, *d_alpha_zero, *d_alfa; + double *d_Ax, *d_x, *d_d, *d_q, *d_rhs, *d_r, *d_helper, *d_norm, *d_dot_new, *d_dot_zero, *d_dot_old, *d_dTq, *d_val; + const double tol = 1e-2f; + + double h_dot = 0; + double h_dot_zero = 0; + double alpha = 1.0; + double alpham1 = -1.0; + double beta = 0.0; + + d_alpha_zero = &alpham1; + + k = 0; + max_iter = 200; + + size_t size = matrix->size * sizeof(double); + size_t d_size = sizeof(double); + + cusparseHandle_t cusparseHandle = 0; + cusparseCreate(&cusparseHandle); + + cusparseMatDescr_t descr = 0; + cusparseCreateMatDescr(&descr); + + cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL); + cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO); + + logger(LEVEL_DEBUG, "%s", "Mallocing CUDA device memory"); + cudaMalloc((double **)&d_r, size); + cudaMalloc((double **)&d_helper, size); + cudaMalloc((double **)&d_x, size); + cudaMalloc((double **)&d_rhs, size); + cudaMalloc((double **)&d_d, size); + cudaMalloc((double **)&d_Ax, size); + cudaMalloc((double **)&d_q, size); + cudaMalloc((double **)&d_alpha_zero, size); + + cudaMalloc((double **)&d_val, matrix->non_zero * sizeof(double)); + cudaMalloc((int **)&d_J, matrix->non_zero * sizeof(int)); + cudaMalloc((int **)&d_I, (matrix->size + 1) * sizeof(int)); + + cudaMalloc((double **)&d_dTq, d_size); + cudaMalloc((double **)&d_beta, d_size); + cudaMalloc((double **)&d_alfa, d_size); + cudaMalloc((double **)&d_dot_new, d_size); + cudaMalloc((double **)&d_dot_zero, d_size); + cudaMalloc((double **)&d_norm, d_size); + + logger(LEVEL_DEBUG, "%s", "Copying to device."); + COPY_TO_DEVICE(d_val, matrix->val, matrix->non_zero * sizeof(double)); + COPY_TO_DEVICE(d_J, matrix->J_row, matrix->non_zero * sizeof(int)); + COPY_TO_DEVICE(d_I, matrix->I_column, (matrix->size + 1) * sizeof(int)); + COPY_TO_DEVICE(d_x, x_vec, matrix->size * sizeof(double)); + COPY_TO_DEVICE(d_rhs, rhs, matrix->size * sizeof(double)); + + int blocksPerGrid = ((matrix->size + threadsPerBlock - 1) / threadsPerBlock ); + while (blocksPerGrid % threadsPerBlock != 0){ + blocksPerGrid++; + } + + /*Calculate Ax matrix*/ + cusparseDcsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, + matrix->size, matrix->size, matrix->non_zero, &alpha, descr, d_val, d_I, d_J, d_x, &beta, d_Ax); + + /*Calculate rhs=rhs-Ax matrix*/ + cudaDaxpy<<>>(matrix->size, d_alpha_zero, d_Ax, d_rhs); + + /*CG: Copy updated rhs (residuum) to d vector*/ + copy_device_vectors<<>>(matrix->size, d_d, d_rhs); + + /*CG: calculate dot r'*r, assign it to dot_new */ + cudaDElementWiseMult<<>>(matrix->size, d_rhs, d_rhs, d_helper); + cudaDvect_sum<<>>(d_helper, d_dot_new, blocksPerGrid); + + /*assign dot_new to dot_zero*/ + copy_device_variables<<<1, gpu_data->devices[0].warp_size>>>(d_dot_zero, d_dot_new); + + COPY_TO_HOST(&h_dot, d_dot_new, sizeof(double)); + /*assign dot_new to dot_zero*/ + h_dot_zero = h_dot; + printf("tol * tol * h_dot_zero = %f\n", tol * tol * h_dot_zero); + printf("h_dot = %f\n", h_dot); + while (h_dot > tol * tol * h_dot_zero && k < max_iter) { + /*Calculate q=A*d vector*/ + cusparseDcsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, matrix->size, matrix->size, matrix->non_zero, + &alpha, descr, d_val, d_I, d_J, d_d, &beta, d_q); + + /*Calculate dTq = d'*q*/ + cudaDElementWiseMult<<>>(matrix->size, d_d, d_q, d_helper); + /*Calculate alpha = dot_new/dTq*/ + cudaDvect_sum<<>>(d_helper, d_dTq, blocksPerGrid); + cudaDdiv_scalar<<<1, gpu_data->devices[0].warp_size>>>(d_alfa, d_dot_new, d_dTq); + + double tmp; + double *p_tmp; + p_tmp = &tmp; + COPY_TO_HOST(p_tmp, d_dot_new, sizeof(double)); + logger(LEVEL_DEBUG, "d_dot_new val = %f", tmp); + COPY_TO_HOST(p_tmp, d_dTq, sizeof(double)); + logger(LEVEL_DEBUG, "d_dTq val = %f", tmp); + COPY_TO_HOST(p_tmp, d_alfa, sizeof(double)); + logger(LEVEL_DEBUG, "d_alfa val = %f", tmp); + + /*Calculate x=x+alpha*d*/ + cudaDaxpy<<>>(matrix->size, d_alfa, d_d, d_x); + + /*Calculate r=r-alpha*q*/ + cudaDaxpy<<>>(matrix->size, d_alpha_zero, d_q, d_rhs); + /*Assign dot_old = dot_new*/ + copy_device_variables<<<1, gpu_data->devices[0].warp_size>>>(d_dot_old, d_dot_new); + + /*Calculate dot_new = r'*r*/ + cudaDElementWiseMult<<>>(matrix->size, d_rhs, d_rhs, d_helper); + cudaDvect_sum<<>>(d_helper, d_dot_new, blocksPerGrid); + + /*Calculate beta = d_new/d_old */ + cudaDdiv_scalar<<<1, gpu_data->devices[0].warp_size>>>(d_beta, d_dot_new, d_dot_old); + + /*CG:Calculate d=r+beta*d*/ + cudaDaxpyz<<>>(matrix->size, d_beta, d_d, d_d, d_r); + COPY_TO_HOST(&h_dot, d_dot_new, sizeof(double)); + cudaThreadSynchronize(); + k++; + } + cusparseDestroy(cusparseHandle); + cudaDeviceReset(); + FREE_DEVICE_STACK + return k; +} diff --git a/src/GPU/cg_solver_gpu.h b/src/GPU/cg_solver_gpu.h new file mode 100755 index 0000000..c672817 --- /dev/null +++ b/src/GPU/cg_solver_gpu.h @@ -0,0 +1,10 @@ +/*Contains prototypes for gpu_cg_solver functions.*/ + +#ifndef CG_SOLVER_GPU_H +#define CG_SOLVER_GPU_H +#include "../misc/utils.h" +#include "gpu_utils.h" + +int gpu_conjugate_gradient_solver(MatrixInt *matrix, double *x_vec, double *rhs, double *res_vec, GPU_data *gpu_data); + +#endif \ No newline at end of file diff --git a/src/GPU/ckernels.cu b/src/GPU/ckernels.cu new file mode 100755 index 0000000..34b25d9 --- /dev/null +++ b/src/GPU/ckernels.cu @@ -0,0 +1,124 @@ +/*Contains implementation of custom kernels for CUDA devices.*/ + +#include "ckernels.h" +#include +#include + +#define threadsPerBlock 128 + +const char* cublasGetErrorString(cublasStatus_t status) +{ + switch(status) + { + case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS"; + case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED"; + case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED"; + case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE"; + case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH"; + case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR"; + case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED"; + case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR"; + } + return "unknown error"; +} + +__global__ void cudaDdiv_scalar(double *res, double *divided, double *divider){ + /*Division of scalar elements on a single CUDA thread*/ + if (threadIdx.x == 0) { + res[0] = divided[0] / divider[0]; + } +} + +__global__ void cudaDaxpy(int num_elements, double *alpha, double *x, double *y){ + /*Perform computations of AXPY operations: y[i] = y[i] + alpha * x[i]*/ + unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < num_elements) { + y[i] = y[i] + alpha[0] * x[i]; + } +} + +__global__ void cudaDaxpyz(int num_elements, double *alpha, double *y, double *x, double *z){ + /*Compute y[i] = alpha * x[i] + z[i] */ + unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < num_elements) { + y[i] = alpha[0] * x[i] + z[i]; + } +} + +__global__ void copy_device_variables(double *destination, double *source){ + + if (threadIdx.x==0){ + *destination = *source; + } +} + +__global__ void copy_device_vectors(int numElements, double *destination, double *source){ + + unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) { + destination[i] = source[i]; + } +} + +__global__ void cudaDElementWiseMult(int numElements, double *x, double *y, double *res){ + + __shared__ double temp[threadsPerBlock]; + + int i = threadIdx.x + blockIdx.x * blockDim.x; + int tid = threadIdx.x; + temp[tid] = 0; + res[i] = 0; + int blockID = blockIdx.x; + __syncthreads(); + + if (i < numElements){ + temp[tid]= x[i] * y[i]; + } + __syncthreads(); + + for (int s=blockDim.x/2; s>32; s>>=1){ + if (tid +#include + +const char* cublasGetErrorString(cublasStatus_t status); +__global__ void cudaDdiv_scalar(double *res, double *divided, double *divider); +__global__ void cudaDaxpy(int num_elements, double *alpha, double *x, double *y); +__global__ void cudaDaxpyz(int num_elements, double *alpha, double *x, double *y, double *z); +__global__ void copy_device_variables(double *destination, double *source); +__global__ void copy_device_vectors(int numElements, double *destination, double *source); +__global__ void cudaDElementWiseMult(int numElements, double *x, double *y, double *res); +__global__ void cudaDvect_sum(double *input, double *res, int blocksPerGrid); diff --git a/src/gpu_utils.cu b/src/GPU/gpu_utils.cu old mode 100644 new mode 100755 similarity index 90% rename from src/gpu_utils.cu rename to src/GPU/gpu_utils.cu index 1b0961d..9b632c4 --- a/src/gpu_utils.cu +++ b/src/GPU/gpu_utils.cu @@ -5,6 +5,7 @@ extern "C" { #include #include "gpu_utils.h" +#include "../misc/logger.h" } GPU_data *get_gpu_devices_data(){ @@ -15,14 +16,13 @@ GPU_data *get_gpu_devices_data(){ cudaError_t device_error; device_error = cudaGetDeviceCount(&gpu_data->devices_number); if (device_error != cudaSuccess) - printf("Error - could not read properly number of device, err=[%s] \n", cudaGetErrorString(device_error)); + logger(LEVEL_WARNING, "Could not read properly number of devices, err=%s", cudaGetErrorString(device_error)) if (gpu_data->devices_number != 0){ gpu_data->devices = (GPU_device *)malloc(gpu_data->devices_number * sizeof(GPU_device)); for (int i = 0; i < gpu_data->devices_number; i ++){ cudaDeviceProp devProp; cudaGetDeviceProperties(&devProp, i); - gpu_data->devices[i].name = devProp.name; gpu_data->devices[i].warp_size = devProp.warpSize; gpu_data->devices[i].max_threads_per_block = devProp.maxThreadsPerBlock; diff --git a/src/gpu_utils.h b/src/GPU/gpu_utils.h old mode 100644 new mode 100755 similarity index 100% rename from src/gpu_utils.h rename to src/GPU/gpu_utils.h diff --git a/src/cg_colver_gpu.cu b/src/cg_colver_gpu.cu deleted file mode 100644 index 1ef32ae..0000000 --- a/src/cg_colver_gpu.cu +++ /dev/null @@ -1,170 +0,0 @@ -/*Contains implementation for gpu_cg_solver functions.*/ - -#include -#include "ckernels.h" - - -extern "C" -{ -#include "utils.h" -#include "gpu_utils.h" -#include "cg_colver_gpu.h" -#include -#include -#include -} - -#define threadsPerBlock 256 -#define CHECK_FOR_STATUS(status) printf("cublas status = %s\n", cublasGetErrorString(status)) - -#define FREE_DEVICE_STACK \ - cudaFree(d_r);\ - cudaFree(d_helper);\ - cudaFree(d_x);\ - cudaFree(d_rhs);\ - cudaFree(d_d);\ - cudaFree(d_Ax);\ - cudaFree(d_q);\ - cudaFree(d_val);\ - cudaFree(d_I);\ - cudaFree(d_J);\ - cudaFree(d_beta);\ - cudaFree(d_alfa);\ - cudaFree(d_alpha_zero);\ - cudaFree(d_dot_new);\ - cudaFree(d_norm);\ - cudaFree(d_dot_zero);\ - cudaFree(d_dot_old);\ - cudaFree(d_dTq); - - -int gpu_conjugate_gradient_solver(Matrix *matrix, double *x_vec, double *rhs, double *res_vec, GPU_data *gpu_data){ - /*Single GPU CG solver using cublas*/ - double *h_dot, *h_dot_zero; - int *d_I = NULL, *d_J = NULL; - const double tol = 1e-2f; - double *d_alfa, *d_beta, *d_alpha_zero; - double *d_Ax, *d_x, *d_d, *d_q, *d_rhs, *d_r, *d_helper, *d_norm, *d_dot_new, *d_dot_zero, *d_dot_old, *d_dTq, *d_val; - int k, max_iter; - - k = 0; - h_dot = 0; - h_dot_zero = 0; - max_iter = 200; - - size_t size = matrix->size * sizeof(double); - size_t d_size = sizeof(double); - - cusparseHandle_t cusparseHandle = 0; - cusparseCreate(&cusparseHandle); - - cusparseMatDescr_t descr = 0; - cusparseCreateMatDescr(&descr); - - cublasHandle_t cublasHandle = 0; - cublasCreate(&cublasHandle); - - cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL); - cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO); - - cublasStatus_t cublasStatus; - - printf("Mallocing CUDA divice memory\n"); - cudaMalloc((void **)&d_r, size); - cudaMalloc((void **)&d_helper, size); - cudaMalloc((void **)&d_x, size); - cudaMalloc((void **)&d_rhs, size); - cudaMalloc((void **)&d_d, size); - cudaMalloc((void **)&d_Ax, size); - cudaMalloc((void **)&d_q, size); - cudaMalloc((void **)&d_val, matrix->non_zero * sizeof(double)); - cudaMalloc((void **)&d_J, matrix->non_zero * sizeof(int)); - cudaMalloc((void **)&d_I, (matrix->size + 1) * sizeof(int)); - - cudaMalloc((void **)&d_beta, d_size); - cudaMalloc((void **)&d_alfa, d_size); - cudaMalloc((void **)&d_alpha_zero, d_size); - cudaMalloc((void **)&d_dot_new, d_size); - cudaMalloc((void **)&d_dot_zero, d_size); - cudaMalloc((void **)&d_norm, d_size); - - cudaMemset(d_beta, 0, d_size); - cudaMemset(d_alfa, 0, d_size); - cudaMemset(d_alpha_zero, 0, d_size); - cudaMemset(d_dot_new, 0, d_size); - cudaMemset(d_dot_zero, 0, d_size); - cudaMemset(d_norm, 0, d_size); - - printf("Copying to device\n"); - cudaMemcpy(d_val, matrix->val, matrix->non_zero * sizeof(double), cudaMemcpyHostToDevice); - cudaMemcpy(d_J, matrix->J_row, matrix->non_zero * sizeof(int), cudaMemcpyHostToDevice); - cudaMemcpy(d_I, matrix->I_column, (matrix->size + 1) * sizeof(int), cudaMemcpyHostToDevice); - cudaMemcpy(d_x, x_vec, size, cudaMemcpyHostToDevice); - - int blocksPerGrid = ((matrix->size + threadsPerBlock - 1) / threadsPerBlock ); - while (blocksPerGrid % threadsPerBlock != 0){ - blocksPerGrid++; - } - double alpha = 1.0; - double beta = 0.0; - - const double one = 1.0; - const double minus_one = -1.0; - /*Calculate Ax matrix*/ - - cusparseDcsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, matrix->size, matrix->size, matrix->non_zero, - &alpha, descr, d_val, d_J, d_I, d_x, &beta, d_Ax); - /*Calculate rhs=rhs-Ax matrix*/ - cublasStatus = cublasDaxpy(cublasHandle, matrix->size, &minus_one, d_Ax, 1, d_rhs, 1); - CHECK_FOR_STATUS(cublasStatus); - - /*CG: Copy updated rhs (residuum) to d vector*/ - cublasStatus = cublasDcopy(cublasHandle, matrix->size, d_d, 1, d_rhs, 1); - CHECK_FOR_STATUS(cublasStatus); - - /*CG: calculate dot r'*r, assign it to d_dot_new */ - cublasStatus = cublasDdot(cublasHandle, matrix->size, d_rhs, 1, d_rhs, 1, d_dot_new); - CHECK_FOR_STATUS(cublasStatus); - - /*assign dot_new to dot_zero*/ - d_dot_zero = d_dot_new; - cudaMemcpy(h_dot, d_dot_new, sizeof(double), cudaMemcpyDeviceToHost); - cudaMemcpy(h_dot_zero, d_dot_zero, sizeof(double), cudaMemcpyDeviceToHost); - while ((*h_dot > tol * tol * *h_dot_zero) && (k < max_iter)) { - /*Calculate q=A*d vector*/ - cusparseDcsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, matrix->size, matrix->size, matrix->non_zero, - &alpha, descr, d_val, d_J, d_I, d_x, &beta, d_Ax); - /*Calculate alpha:*/ - cublasStatus = cublasDdot(cublasHandle, matrix->size, d_d, 1, d_q, 1, d_dTq); - CHECK_FOR_STATUS(cublasStatus); - - sDdiv<<<1, gpu_data->devices[0].warp_size>>>(d_alfa, d_dot_new, d_dTq); - /*Calculate x=x+alpha*d*/ - cublasStatus = cublasDaxpy(cublasHandle, matrix->size, d_alfa, d_x, 1, d_d, 1); - CHECK_FOR_STATUS(cublasStatus); - - /*Calculate r=r-alpha*q*/ - axpy<<>>(matrix->size, -1, d_q, d_rhs); - /*Assign dot_old = dot_new*/ - cublasStatus = cublasDcopy(cublasHandle, 1, d_dot_old, 1, d_dot_new, 1); - CHECK_FOR_STATUS(cublasStatus); - - /*CG:Assign dot_new = r'*r*/ - cublasStatus = cublasDdot(cublasHandle, matrix->size, d_rhs, 1, d_rhs, 1, d_dot_new); - CHECK_FOR_STATUS(cublasStatus); - - sDdiv<<<1, gpu_data->devices[0].warp_size>>>(d_beta, d_dot_new, d_dot_old); - /*Scale beta*d*/ - cublasStatus = cublasDscal(cublasHandle, matrix->size, d_beta, d_d, 1); - CHECK_FOR_STATUS(cublasStatus); - - /*CG:Calculate d=r+beta*d*/ - cublasStatus = cublasDaxpy(cublasHandle, matrix->size, &one, d_rhs, 1, d_d, 1); - CHECK_FOR_STATUS(cublasStatus); - k++; - } - cusparseDestroy(cusparseHandle); - cudaDeviceReset(); - FREE_DEVICE_STACK - return k; -} diff --git a/src/cg_colver_gpu.h b/src/cg_colver_gpu.h deleted file mode 100644 index 9ba1e10..0000000 --- a/src/cg_colver_gpu.h +++ /dev/null @@ -1,10 +0,0 @@ -/*Contains prototypes for gpu_cg_solver functions.*/ - -#ifndef CG_SOLVER_GPU_H -#define CG_SOLVER_GPU_H -#include "utils.h" -#include "gpu_utils.h" - -int gpu_conjugate_gradient_solver(Matrix *matrix, double *x_vec, double *rhs, double *res_vec, GPU_data *gpu_data); - -#endif \ No newline at end of file diff --git a/src/ckernels.cu b/src/ckernels.cu deleted file mode 100644 index 2fb3b0d..0000000 --- a/src/ckernels.cu +++ /dev/null @@ -1,37 +0,0 @@ -/*Contains implementation of custom kernels for CUDA devices.*/ - -#include "ckernels.h" -#include - -const char* cublasGetErrorString(cublasStatus_t status) -{ - switch(status) - { - case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS"; - case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED"; - case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED"; - case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE"; - case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH"; - case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR"; - case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED"; - case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR"; - } - return "unknown error"; -} - -__global__ void sDdiv(double *res, double *divided, double *divider) { - /*Division of scalar elements on a single CUDA thread*/ - if (threadIdx.x == 0) { - res[0] = divided[0] / divider[0]; - } -} - -__global__ void axpy(int num_elements, double alpha, double *x, double *y) { - /*Perform computations of AXPY operations: y[i] = y[i] + alpha * x[i]*/ - unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; - - if (i < num_elements) { - y[i] = y[i] + alpha * x[i]; - } -} - diff --git a/src/ckernels.h b/src/ckernels.h deleted file mode 100644 index d4dbb2e..0000000 --- a/src/ckernels.h +++ /dev/null @@ -1,7 +0,0 @@ -/*Contains prototypes of custom kernels for CUDA devices.*/ - -#include - -const char* cublasGetErrorString(cublasStatus_t status); -__global__ void sDdiv(double *res, double *divided, double *divider); -__global__ void axpy(int num_elements, double alpha, double *x, double *y); diff --git a/src/conjugate_gradients.c b/src/conjugate_gradients.c new file mode 100755 index 0000000..763a690 --- /dev/null +++ b/src/conjugate_gradients.c @@ -0,0 +1,73 @@ +/* File contains implementation of Conjugate Gradient method using Intel MKL library and nVidia CUDA. + * + * Author: Pawel Stoworowicz + * + * */ + +#define FREE_STACK_COMMON \ + free(matrix);\ + free(input_cfg);\ + +#define FREE_STACK_ALL \ + FREE_STACK_COMMON\ + free(res_vec);\ + free(b_vec);\ + free(x_vec); + +//mkl_free(res_vec);\ + mkl_free(b_vec);\ + mkl_free(x_vec); + + +#include +#include +#include "stdio.h" +#include "misc/utils.h" +#include "mkl.h" +#include "misc/logger.h" + + +int main(int argc, char **argv) { + /*Main function - thanks captain obvious!*/ + logger_clean(); + logger(LEVEL_INFO, "%s", "Conjugate Gradient solver using MKL."); + InputConfig *input_cfg; + input_cfg = arg_parser(argc, argv); + logger(LEVEL_INFO, "Num of threads: %d", input_cfg->num_of_threads); + omp_set_num_threads(input_cfg->num_of_threads); + MatrixInt *matrix; + matrix = getMatrixCRSInt(input_cfg); + if (matrix == NULL) { + logger(LEVEL_ERROR, "%s", "ERROR - could not read matrix, exiting."); + FREE_STACK_COMMON; + exit(1); + } + logger(LEVEL_DEBUG, "%s", "Matrix parameters:"); + logger(LEVEL_DEBUG, "Size = %d, non-zero elements = %d", (int) matrix->size, (int) matrix->non_zero); + + double *x_vec, *b_vec, *res_vec; + double t_start, t_stop; + /* + x_vec = (double *)mkl_malloc(matrix->size * sizeof(double), 64); + b_vec = (double *)mkl_malloc(matrix->size * sizeof(double), 64); + res_vec = (double *)mkl_malloc(matrix->size * sizeof(double), 64); + */ + + x_vec = (double *) malloc(matrix->size * sizeof(double)); + b_vec = (double *) malloc(matrix->size * sizeof(double)); + res_vec = (double *) malloc(matrix->size * sizeof(double)); + + int total_iter = 0; + for (int i = 0; i < matrix->size; i++) { + x_vec[i] = 1.0; + b_vec[i] = 0.0; + res_vec[i] = 0.0; + } + t_start = get_time(); + total_iter = launch_solver(matrix, x_vec, b_vec, res_vec, input_cfg); + t_stop = get_time(); + logger(LEVEL_INFO, "Conjugate gradient method finished within %.3f [secs] in total %d iterations.", t_stop - t_start, total_iter); + + FREE_STACK_ALL; + exit(0); +} diff --git a/src/misc/logger.h b/src/misc/logger.h new file mode 100755 index 0000000..52fc570 --- /dev/null +++ b/src/misc/logger.h @@ -0,0 +1,79 @@ +/* Simple logger code. Nothing fancy here. + * + * Author: Pawel Stoworowicz + * + * */ + +#ifndef LOGGER_H +#define LOGGER_H + +#ifdef DEBUG +#define DDEBUG 1 +#else +#define DDEBUG 0 +#endif + +#define LEVEL_DEBUG -1 +#define LEVEL_INFO 0 +#define LEVEL_WARNING 1 +#define LEVEL_ERROR 2 + +#ifndef DEBUG_FLAG +#define DEBUG_FLAG 0 +#endif + +#include +#include +#include "utils.h" +#include + +#define _LOG_FIL_NAME "execution.log" + +#define logger_clean()\ + FILE *logger_fp;\ + logger_fp = fopen(_LOG_FIL_NAME, "w+");\ + fclose(logger_fp);\ + +#define log_to_file(buffer, args...){\ + FILE *logger_fp;\ + logger_fp = fopen(_LOG_FIL_NAME, "a");\ + fprintf(logger_fp, buffer, ##args);\ + fclose(logger_fp);\ +} + +/*GCC prints warning regarding args... notation - works fine for CLang!*/ +#define GCC_COMPILER (defined(__GNUC__) && !defined(__clang__)) +#if GCC_COMPILER +#pragma GCC diagnostic ignored "-Wformat-extra-args" +#endif + +#define _LOG_DATA_STDERR(buffer, fmt, args...){\ + strcat(buffer, fmt); strcat(buffer, "\n"); fprintf(stderr, buffer, ##args); log_to_file(buffer, ##args);\ + } + +#define _LOG_DATA_STDOUT(buffer, fmt, args...){\ + strcat(buffer, fmt); strcat(buffer, "\n"); fprintf(stdout, buffer, ##args); log_to_file(buffer, ##args);\ + } + + +#define logger(log_level, fmt, args...){\ + char buffer[1024*sizeof(char)];\ + struct tm* tm_info;\ + time_t timer;\ + time(&timer);\ + tm_info = localtime(&timer);\ + strftime(buffer, 1024*sizeof(char), "%T", tm_info);\ + switch(log_level){\ + case LEVEL_INFO: strcat(buffer, " [INFO] "); _LOG_DATA_STDOUT(buffer, fmt, ##args) break;\ + case LEVEL_WARNING: strcat(buffer, " [WARNING] "); _LOG_DATA_STDOUT(buffer, fmt, ##args) break;\ + case LEVEL_ERROR: strcat(buffer, " ***[ERROR]*** "); _LOG_DATA_STDERR(buffer, fmt, ##args) break;\ + case LEVEL_DEBUG: if(DDEBUG){strcat(buffer, " [DEBUG] "); _LOG_DATA_STDOUT(buffer, fmt, ##args);\ + break;}\ + else{break;}\ + default: strcat(buffer, " [ERROR] Unknown level: "); break;\ + }/*switch*/\ +}/*logger*/ +#if GCC_COMPILER +#pragma GCC diagnostic pop +#endif /*pragma*/ +#endif /*LOGGER_H*/ diff --git a/src/misc/math_functions.c b/src/misc/math_functions.c new file mode 100755 index 0000000..a410239 --- /dev/null +++ b/src/misc/math_functions.c @@ -0,0 +1,14 @@ +/*Contains implementations of custom math functions*/ + +#include +#include +#include "math_functions.h" + +double euclidean_norm(int size, double *vector){ + double res = 0.0; + #pragma omp parallel for + for (int i = 0; i < size; i++){ + res += pow(vector[i], 2); + } + return sqrt(res); +} diff --git a/src/misc/math_functions.h b/src/misc/math_functions.h new file mode 100755 index 0000000..873cf55 --- /dev/null +++ b/src/misc/math_functions.h @@ -0,0 +1,3 @@ +/*Contains prototypes for custom math functions*/ + +double euclidean_norm(int size, double *vector); diff --git a/src/utils.c b/src/misc/utils.c old mode 100644 new mode 100755 similarity index 57% rename from src/utils.c rename to src/misc/utils.c index a467e70..44b1c7f --- a/src/utils.c +++ b/src/misc/utils.c @@ -8,13 +8,14 @@ #include #include -#include "cg_solver.h" -#include "pcg_solver.h" +#include "../CPU/cg_solver.h" +#include "../CPU/pcg_solver.h" #include "utils.h" #include "mkl.h" -#include "gpu_utils.h" -#include "cg_colver_gpu.h" +#include "../GPU/gpu_utils.h" +#include "../GPU/cg_solver_gpu.h" #include +#include "logger.h" double get_time(){ /* Return ``double`` time stamp for execution time measurements. Implementation depends on OS. */ @@ -43,7 +44,7 @@ Matrix* getMatrixCRS(InputConfig *input_cfg){ FILE *pf; pf = fopen(fil_name, "r"); if(pf == NULL){ - printf("Can't open the file: %s", fil_name); + logger(LEVEL_ERROR, "Can't open the file: %s", fil_name); } double v; @@ -57,7 +58,7 @@ Matrix* getMatrixCRS(InputConfig *input_cfg){ matrix->size = matsize; matrix->non_zero = non_zero; matrix->I_column = (MKL_INT *)mkl_malloc((matsize + 1) * sizeof(MKL_INT), 64); - matrix->J_row = (MKL_INT *)mkl_malloc(non_zero * sizeof(MKL_INT), 64 ); + matrix->J_row = (MKL_INT *)mkl_malloc(non_zero * sizeof(MKL_INT), 64); matrix->val = (double *)mkl_malloc(non_zero * sizeof(double), 64); int n = 0; @@ -80,6 +81,52 @@ Matrix* getMatrixCRS(InputConfig *input_cfg){ return matrix; } +MatrixInt* getMatrixCRSInt(InputConfig *input_cfg){ + /* Load Matrix stored in CSR format from file.*/ + char *fil_name; + fil_name = input_cfg->filename; + if (fil_name == NULL) + return NULL; + FILE *pf; + pf = fopen(fil_name, "r"); + if(pf == NULL){ + logger(LEVEL_ERROR, "Can't open the file: %s", fil_name); + } + + double v; + int c, non_zero, inx; + int matsize, max_rows; + + // Load first line of file with information about size and non zero element in matrix + fscanf(pf,"%d %d %d", &matsize, &non_zero, &max_rows); + + MatrixInt *matrix = (MatrixInt *)malloc(sizeof(MatrixInt)); + matrix->size = matsize; + matrix->non_zero = non_zero; + matrix->I_column = (int *)malloc((matsize + 1) * sizeof(int)); + matrix->J_row = (int *)malloc(non_zero * sizeof(int)); + matrix->val = (double *)malloc(non_zero * sizeof(double)); + + int n = 0; + while( ! feof(pf)){ + if(n < matsize + 1){ + fscanf(pf, "%le %d %d", &v, &c, &inx); + matrix->val[n] = v; + matrix->J_row[n] = c; + matrix->I_column[n] = inx; + } + else{ + fscanf(pf, "%le %d", &v, &c); + matrix->val[n] = v; + matrix->J_row[n] = c; + } + n++; + } + fclose(pf); + pf = NULL; + return matrix; +} + InputConfig* arg_parser(int argc, char **argv){ /*Parse argument line*/ InputConfig *input_cfg = (InputConfig *)malloc(sizeof(InputConfig)); @@ -109,24 +156,24 @@ InputConfig* arg_parser(int argc, char **argv){ return input_cfg; } -int launch_solver(Matrix *matrix, double *x_vec, double *b_vec, double *res_vec, InputConfig *input_cfg){ +int launch_solver(MatrixInt *matrix, double *x_vec, double *b_vec, double *res_vec, InputConfig *input_cfg){ /*Launch proper solver based on input config data.*/ GPU_data *gpu_data; gpu_data = get_gpu_devices_data(); int total_iter = 0; if (gpu_data->devices_number > 0 && input_cfg->gpu == 1){ - printf("Launching GPU CG.\n"); - printf("Number of CUDA devices: %d\n", gpu_data->devices_number); + logger(LEVEL_INFO, "%s", "Launching GPU CG."); + logger(LEVEL_INFO, "Number of CUDA devices: %d", gpu_data->devices_number); cudaDeviceReset(); total_iter = gpu_conjugate_gradient_solver(matrix, x_vec, b_vec, res_vec, gpu_data); } else { if (input_cfg->preconditioner == NULL){ - printf("Launching CPU CG.\n"); - total_iter = conjugate_gradient_solver(matrix, x_vec, b_vec, res_vec); + logger(LEVEL_INFO, "%s", "Launching CPU CG."); + //total_iter = conjugate_gradient_solver(matrix, x_vec, b_vec, res_vec); } else { - printf("Launching CPU PCG with %s preconditioner.\n", input_cfg->preconditioner); - total_iter = pcg_solver(matrix, x_vec, b_vec, res_vec, input_cfg->preconditioner); + logger(LEVEL_INFO, "Launching CPU PCG with %s preconditioner.", input_cfg->preconditioner); + //total_iter = pcg_solver(matrix, x_vec, b_vec, res_vec, input_cfg->preconditioner); } } free(gpu_data); diff --git a/src/utils.h b/src/misc/utils.h old mode 100644 new mode 100755 similarity index 61% rename from src/utils.h rename to src/misc/utils.h index 2f1a169..ae7fb6d --- a/src/utils.h +++ b/src/misc/utils.h @@ -14,6 +14,14 @@ typedef struct { double *val; }Matrix; +typedef struct { + int size; + int non_zero; + int *I_column; + int *J_row; + double *val; +}MatrixInt; + typedef struct{ char *filename; int num_of_threads; @@ -23,7 +31,10 @@ typedef struct{ Matrix* getMatrixCRS(InputConfig *input_cfg); +MatrixInt* getMatrixCRSInt(InputConfig *input_cfg); + InputConfig* arg_parser(int argc, char **argv); -int launch_solver(Matrix *matrix, double *x_vec, double *b_vec, double *res_vec, InputConfig *input_cfg); +int launch_solver(MatrixInt *matrix, double *x_vec, double *b_vec, double *res_vec, InputConfig *input_cfg); + #endif \ No newline at end of file diff --git a/src/solver.c b/src/solver.c deleted file mode 100644 index 395c0db..0000000 --- a/src/solver.c +++ /dev/null @@ -1,60 +0,0 @@ -/* File contains implementation of Conjugate Gradient method using Intel MKL library. - * - * Author: Pawel Stoworowicz - * Contact: stoworow@gmail.com - * - * */ - -#define FREE_STACK_COMMON \ - free(matrix);\ - free(input_cfg);\ - -#define FREE_STACK_ALL \ - FREE_STACK_COMMON\ - mkl_free(res_vec);\ - mkl_free(b_vec);\ - mkl_free(x_vec) - -#include -#include -#include "stdio.h" -#include "utils.h" -#include "mkl.h" - - -int main(int argc, char** argv){ - /*Main function - thanks captain obvious!*/ - printf("Conjugate Gradient solver using MKL.\n"); - InputConfig *input_cfg; - input_cfg = arg_parser(argc, argv); - printf("Num of threads: %d\n", input_cfg->num_of_threads); - omp_set_num_threads(input_cfg->num_of_threads); - Matrix *matrix; - matrix = getMatrixCRS(input_cfg); - if (matrix == NULL){ - printf("ERROR - could not read matrix, exiting.\n"); - FREE_STACK_COMMON; - exit(1); - } - printf("Matrix parameters:\n"); - printf("Size = %d, non-zero elements = %d\n", (int)matrix->size, (int)matrix->non_zero); - - double *x_vec, *b_vec, *res_vec; - double t_start, t_stop; - x_vec = (double *)mkl_malloc(matrix->size * sizeof(double), 64); - b_vec = (double *)mkl_malloc(matrix->size * sizeof(double), 64); - res_vec = (double *)mkl_malloc(matrix->size * sizeof(double), 64); - int total_iter = 0; - for (int i = 0; i < matrix->size; i++) { - x_vec[i] = 1; - b_vec[i] = 0; - res_vec[i] = 0; - } - t_start = get_time(); - total_iter = launch_solver(matrix, x_vec, b_vec, res_vec, input_cfg); - t_stop = get_time(); - printf("Conjugate gradient method finished within %.3f [secs] in total %d iterations.\n", t_stop - t_start, total_iter); - - FREE_STACK_ALL; - exit(0); -}