diff --git a/Makefile b/Makefile
old mode 100644
new mode 100755
index 1e8a99a..aa6dc15
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,9 @@
 CC := gcc
 SRCEXT := c
 HEADEREXT := h
-CFLAGS := -Wall -Werror -Wmissing-prototypes -std=c11 -O2
+
+DEBUG_FLAGS := -g -DDEBUG
+CFLAGS := -Wall -Wmissing-prototypes -Wno-unused-result -std=c11 -O2 $(DEBUG_FLAGS) -fopenmp
 
 NVCC := /usr/local/cuda/bin/nvcc
 CUDAFLAGS :=
@@ -10,8 +12,10 @@ NVSRCEXT := cu
 NVHEADEREXT := h
 
 TARGET = ConjugateGradient
+TARGET_DEBUG = debug_ConjugateGradient
 SRCDIR := src
 BUILDDIR := build
+BUILDDIR_DEBUG = build_debug
 
 SOURCES := $(shell find $(SRCDIR) -type f -name *.$(SRCEXT))
 HEADERS = $(wildcard $(SRCDIR)/*.$(HEADEREXT))
@@ -35,23 +39,25 @@ LIBRARY_DIRS := $(MKLROOT)/lib/intel64
 LFLAGS := $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir))
 CFLAGS += $(foreach includedir,$(INCLUDE_DIRS),-I$(includedir))
 CFLAGS += -DMKL_ILP64 -m64 -I${MKLROOT}/include
+
 # static liking to MKL
 LDFLAGS := -Wl,--start-group $(LIBRARY_DIRS)/libmkl_intel_ilp64.a $(LIBRARY_DIRS)/libmkl_intel_thread.a $(LIBRARY_DIRS)/libmkl_core.a -Wl,--end-group
-LDFLAGS +=  -liomp5 -lpthread -lm -ldl
+LDFLAGS +=  -liomp5 -lpthread -lm -ldl -lstdc++
 
 # add cuda flags
 # -DMKL_ILP64 sets int to 64, has to be added to both gcc and nvcc
-CUDAFLAGS := -L$(CUDALIBPATH) -lcuda -lcudart -lcublas -lcusparse -m64 -DMKL_ILP64
+CUDAFLAGS := -L$(CUDALIBPATH) -lcuda -lcudart -lcublas -lcusparse -m64 -DMKL_ILP64 $(DEBUG_FLAGS)
 
 $(TARGET): $(OBJECTS)
 	@echo " Linking..."
 	$(CC) $^ -o $(TARGET) $(LFLAGS) $(LDFLAGS) $(CUDAFLAGS)
 
 $(BUILDDIR)/%.o: $(SRCDIR)/%.$(NVSRCEXT)
+	@mkdir -p $(@D)
 	$(NVCC) $(CUDAFLAGS) -c -o $@ $<
 
 $(BUILDDIR)/%.o: $(SRCDIR)/%.$(SRCEXT)
-	@mkdir -p $(BUILDDIR)
+	@mkdir -p $(@D)
 	$(CC) $(CFLAGS) $(INC) -c -o $@ $<
 
 valgrind:
@@ -60,8 +66,9 @@ valgrind:
 clean:
 	@echo " Cleaning...";
 	@echo " $(RM) -r $(BUILDDIR) $(TARGET)"; $(RM) -r $(BUILDDIR) $(TARGET)
+	@echo " $(RM) -r $(BUILDDIR_DEBUG) $(TARGET_DEBUG)"; $(RM) -r $(BUILDDIR_DEBUG) $(TARGET_DEBUG)
 
 cuda_info:
 	nvidia-smi -a
 
-.PHONY: default all clean
\ No newline at end of file
+.PHONY: default all clean debug cuda_info
\ No newline at end of file
diff --git a/README.rst b/README.rst
old mode 100644
new mode 100755
diff --git a/doc/arn_matrix.png b/doc/arn_matrix.png
old mode 100644
new mode 100755
diff --git a/doc/cg_conv_visual.png b/doc/cg_conv_visual.png
old mode 100644
new mode 100755
diff --git a/doc/comparison.png b/doc/comparison.png
old mode 100644
new mode 100755
diff --git a/doc/crn_matrix.png b/doc/crn_matrix.png
old mode 100644
new mode 100755
diff --git a/doc/rnqa_matrix.png b/doc/rnqa_matrix.png
old mode 100644
new mode 100755
diff --git a/src/cg_solver.c b/src/CPU/cg_solver.c
old mode 100644
new mode 100755
similarity index 80%
rename from src/cg_solver.c
rename to src/CPU/cg_solver.c
index c948c18..8909df6
--- a/src/cg_solver.c
+++ b/src/CPU/cg_solver.c
@@ -8,6 +8,8 @@
 #include <stdlib.h>
 #include "cg_solver.h"
 #include "mkl.h"
+#include "../misc/logger.h"
+#include "../misc/math_functions.h"
 
 #define FREE_STACK \
 	mkl_free(Ax);\
@@ -22,7 +24,7 @@ int conjugate_gradient_solver(Matrix *matrix, double *x_vec, double *rhs, double
 	 * */
 
 	int k, max_iter;
-	double beta, tol, alpha, dTq, dot_zero, dot_new, dot_old;
+	double beta, tol, alpha, dTq, dot_zero, dot_new, dot_old, norm;
 	double *Ax, *x, *d, *q;
 	k = 0;
 	beta = 0.0;
@@ -33,21 +35,23 @@ int conjugate_gradient_solver(Matrix *matrix, double *x_vec, double *rhs, double
 	x = (double *)mkl_malloc(matrix->size * sizeof(double), 64);
 	q = (double *)mkl_malloc(matrix->size * sizeof(double), 64);
 	Ax = (double *)mkl_malloc(matrix->size * sizeof(double), 64);
-
 	/*Calculate Ax matrix*/
 	mkl_cspblas_dcsrgemv("N", &(matrix->size), matrix->val, matrix->I_column, matrix->J_row, x_vec, Ax);
+	norm = euclidean_norm(matrix->size, Ax); logger(LEVEL_DEBUG, "x_vec norm value: %f", norm);
 	/*Calculate rhs=rhs-Ax matrix*/
 	cblas_daxpy(matrix->size, -1.0, Ax, 1 , rhs, 1);
 	/*CG: Copy updated rhs (residuum) to d vector*/
 	cblas_dcopy(matrix->size, rhs, 1, d, 1);
+    norm = euclidean_norm(sizeof(double), d); logger(LEVEL_DEBUG, "d norm value: %f", norm);
 	/*CG: calculate dot r'*r, assign it to dot_new */
 	dot_new = cblas_ddot(matrix->size, rhs, 1 ,rhs, 1);
+    norm = euclidean_norm(sizeof(double), &dot_new); logger(LEVEL_DEBUG, "dot_new norm value: %f", dot_new);
 	/*assign dot_new to dot_zero*/
 	dot_zero = dot_new;
 	while (dot_new >  tol * tol * dot_zero && k < max_iter) {
 		/*Calculate q=A*d vector*/
 		mkl_cspblas_dcsrgemv("N", &(matrix->size), matrix->val, matrix->I_column, matrix->J_row, d, q);
-		/*Calculate alpha:*/
+		/*Calculate alpha = dot_new/dTq */
 		dTq = cblas_ddot(matrix->size, d, 1, q, 1);
 		alpha = dot_new / dTq;
 		/*Calculate x=x+alpha*d*/
@@ -58,6 +62,7 @@ int conjugate_gradient_solver(Matrix *matrix, double *x_vec, double *rhs, double
 		dot_old = dot_new;
 		/*CG:Assign dot_new = r'*r*/
 		dot_new = cblas_ddot(matrix->size, rhs, 1, rhs, 1);
+		/* beta = d_new/d_old */
 		beta = dot_new / dot_old;
 		/*Scale beta*d*/
 		cblas_dscal(matrix->size, beta, d, 1);
diff --git a/src/cg_solver.h b/src/CPU/cg_solver.h
old mode 100644
new mode 100755
similarity index 84%
rename from src/cg_solver.h
rename to src/CPU/cg_solver.h
index eb2bb18..3e5dc04
--- a/src/cg_solver.h
+++ b/src/CPU/cg_solver.h
@@ -1,5 +1,5 @@
 /* Header file for Conjugate Gradient method */
 
-#include "utils.h"
+#include "../misc/utils.h"
 
 int conjugate_gradient_solver(Matrix *matrix, double *x_vec, double *b_vec, double *res_vec);
diff --git a/src/pcg_solver.c b/src/CPU/pcg_solver.c
old mode 100644
new mode 100755
similarity index 97%
rename from src/pcg_solver.c
rename to src/CPU/pcg_solver.c
index 5824fc4..ca87f9f
--- a/src/pcg_solver.c
+++ b/src/CPU/pcg_solver.c
@@ -9,6 +9,7 @@
 #include <stdlib.h>
 #include "pcg_solver.h"
 #include "mkl.h"
+#include "../misc/logger.h"
 
 #define FREE_STACK mkl_free(Ax);\
 	mkl_free(q);\
@@ -52,7 +53,7 @@ double* perform_preconditioning(Matrix *matrix, double *vec, pc_cfg *pc_config){
 			res[i] = vec[i] * pc_config->inverted_diagonal[i];
 		}
 	} else {
-		printf("No such preconditioner: %s, exiting", pc_config->preconditioner);
+		logger(LEVEL_ERROR, "No such preconditioner: %s, exiting", pc_config->preconditioner);
 		exit(1);
 	}
 	return res;
diff --git a/src/pcg_solver.h b/src/CPU/pcg_solver.h
old mode 100644
new mode 100755
similarity index 94%
rename from src/pcg_solver.h
rename to src/CPU/pcg_solver.h
index e19f6e6..e08e65c
--- a/src/pcg_solver.h
+++ b/src/CPU/pcg_solver.h
@@ -1,6 +1,6 @@
 /* Header file for Preconditioned Conjugate Gradient method */
 
-#include "utils.h"
+#include "../misc/utils.h"
 
 typedef struct {
 	char *preconditioner;
diff --git a/src/GPU/cg_solver_gpu.cu b/src/GPU/cg_solver_gpu.cu
new file mode 100755
index 0000000..c76da11
--- /dev/null
+++ b/src/GPU/cg_solver_gpu.cu
@@ -0,0 +1,179 @@
+/*Contains implementation for gpu_cg_solver functions.*/
+
+#include <stdio.h>
+#include <stdint.h>
+#include "ckernels.h"
+#include "mkl.h"
+
+extern "C"
+{
+#include "../misc/utils.h"
+#include "gpu_utils.h"
+#include "cg_solver_gpu.h"
+#include <cuda_runtime.h>
+#include <cusparse_v2.h>
+#include <cublas_v2.h>
+#include "../misc/logger.h"
+#include "../misc/math_functions.h"
+}
+
+#define threadsPerBlock 128
+#define CHECK_FOR_STATUS(status) logger(LEVEL_DEBUG, "cublas status = %s", cublasGetErrorString(status))
+
+#define COPY_TO_DEVICE(dest, source, size) cudaMemcpy(dest, source, size, cudaMemcpyHostToDevice)
+#define COPY_TO_HOST(dest, source, size) cudaMemcpy(dest, source, size, cudaMemcpyDeviceToHost)
+
+#define FREE_DEVICE_STACK \
+	cudaFree(d_r);\
+	cudaFree(d_helper);\
+	cudaFree(d_x);\
+	cudaFree(d_rhs);\
+	cudaFree(d_d);\
+	cudaFree(d_Ax);\
+	cudaFree(d_q);\
+	cudaFree(d_val);\
+	cudaFree(d_I);\
+	cudaFree(d_J);\
+	cudaFree(d_beta);\
+	cudaFree(d_alfa);\
+	cudaFree(d_alpha_zero);\
+	cudaFree(d_dot_new);\
+	cudaFree(d_norm);\
+	cudaFree(d_dot_zero);\
+	cudaFree(d_dot_old);\
+	cudaFree(d_dTq);
+
+
+int gpu_conjugate_gradient_solver(MatrixInt *matrix, double *x_vec, double *rhs, double *res_vec, GPU_data *gpu_data){
+	/*Single GPU CG solver using (mostly) cublas and custom kernels*/
+    int *d_I = NULL, *d_J = NULL;
+    int k, max_iter;
+	double *d_beta, *d_alpha_zero, *d_alfa;
+	double *d_Ax, *d_x, *d_d, *d_q, *d_rhs, *d_r, *d_helper, *d_norm, *d_dot_new, *d_dot_zero, *d_dot_old, *d_dTq, *d_val;
+    const double tol = 1e-2f;
+
+	double h_dot = 0;
+	double h_dot_zero = 0;
+	double alpha = 1.0;
+	double alpham1 = -1.0;
+	double beta = 0.0;
+
+    d_alpha_zero = &alpham1;
+
+    k = 0;
+	max_iter = 200;
+
+	size_t size = matrix->size * sizeof(double);
+	size_t d_size = sizeof(double);
+
+	cusparseHandle_t cusparseHandle = 0;
+	cusparseCreate(&cusparseHandle);
+
+	cusparseMatDescr_t descr = 0;
+	cusparseCreateMatDescr(&descr);
+
+	cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
+	cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);
+
+	logger(LEVEL_DEBUG, "%s", "Mallocing CUDA device memory");
+	cudaMalloc((double **)&d_r, size);
+	cudaMalloc((double **)&d_helper, size);
+	cudaMalloc((double **)&d_x, size);
+	cudaMalloc((double **)&d_rhs,	size);
+	cudaMalloc((double **)&d_d, size);
+	cudaMalloc((double **)&d_Ax, size);
+	cudaMalloc((double **)&d_q, size);
+    cudaMalloc((double **)&d_alpha_zero, size);
+
+	cudaMalloc((double **)&d_val, matrix->non_zero * sizeof(double));
+	cudaMalloc((int **)&d_J, matrix->non_zero * sizeof(int));
+	cudaMalloc((int **)&d_I, (matrix->size + 1) * sizeof(int));
+
+    cudaMalloc((double **)&d_dTq, d_size);
+    cudaMalloc((double **)&d_beta, d_size);
+	cudaMalloc((double **)&d_alfa, d_size);
+	cudaMalloc((double **)&d_dot_new, d_size);
+	cudaMalloc((double **)&d_dot_zero, d_size);
+	cudaMalloc((double **)&d_norm, d_size);
+
+    logger(LEVEL_DEBUG, "%s", "Copying to device.");
+	COPY_TO_DEVICE(d_val, matrix->val, matrix->non_zero * sizeof(double));
+	COPY_TO_DEVICE(d_J, matrix->J_row, matrix->non_zero * sizeof(int));
+	COPY_TO_DEVICE(d_I, matrix->I_column, (matrix->size + 1) * sizeof(int));
+	COPY_TO_DEVICE(d_x, x_vec, matrix->size * sizeof(double));
+	COPY_TO_DEVICE(d_rhs, rhs, matrix->size * sizeof(double));
+
+	int blocksPerGrid = ((matrix->size + threadsPerBlock - 1) / threadsPerBlock );
+	while (blocksPerGrid % threadsPerBlock != 0){
+		blocksPerGrid++;
+	}
+
+    /*Calculate Ax matrix*/
+	cusparseDcsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+                   matrix->size, matrix->size, matrix->non_zero,  &alpha, descr, d_val, d_I, d_J, d_x, &beta, d_Ax);
+
+    /*Calculate rhs=rhs-Ax matrix*/
+    cudaDaxpy<<<blocksPerGrid, threadsPerBlock>>>(matrix->size, d_alpha_zero, d_Ax, d_rhs);
+
+    /*CG: Copy updated rhs (residuum) to d vector*/
+    copy_device_vectors<<<blocksPerGrid, threadsPerBlock>>>(matrix->size, d_d, d_rhs);
+
+    /*CG: calculate dot r'*r, assign it to dot_new */
+	cudaDElementWiseMult<<<blocksPerGrid, threadsPerBlock>>>(matrix->size, d_rhs, d_rhs, d_helper);
+	cudaDvect_sum<<<blocksPerGrid, threadsPerBlock>>>(d_helper, d_dot_new, blocksPerGrid);
+
+    /*assign dot_new to dot_zero*/
+    copy_device_variables<<<1, gpu_data->devices[0].warp_size>>>(d_dot_zero, d_dot_new);
+
+    COPY_TO_HOST(&h_dot, d_dot_new, sizeof(double));
+    /*assign dot_new to dot_zero*/
+    h_dot_zero = h_dot;
+    printf("tol * tol * h_dot_zero = %f\n", tol * tol * h_dot_zero);
+    printf("h_dot = %f\n", h_dot);
+	while (h_dot >  tol * tol * h_dot_zero && k < max_iter) {
+		/*Calculate q=A*d vector*/
+		cusparseDcsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, matrix->size, matrix->size, matrix->non_zero,
+						  &alpha, descr, d_val, d_I, d_J, d_d, &beta, d_q);
+
+		/*Calculate dTq = d'*q*/
+        cudaDElementWiseMult<<<blocksPerGrid, threadsPerBlock>>>(matrix->size, d_d, d_q, d_helper);
+		/*Calculate alpha = dot_new/dTq*/
+        cudaDvect_sum<<<blocksPerGrid, threadsPerBlock>>>(d_helper, d_dTq, blocksPerGrid);
+        cudaDdiv_scalar<<<1, gpu_data->devices[0].warp_size>>>(d_alfa, d_dot_new, d_dTq);
+
+        double tmp;
+        double *p_tmp;
+        p_tmp = &tmp;
+		COPY_TO_HOST(p_tmp, d_dot_new, sizeof(double));
+		logger(LEVEL_DEBUG, "d_dot_new val = %f", tmp);
+		COPY_TO_HOST(p_tmp, d_dTq, sizeof(double));
+		logger(LEVEL_DEBUG, "d_dTq val = %f", tmp);
+		COPY_TO_HOST(p_tmp, d_alfa, sizeof(double));
+		logger(LEVEL_DEBUG, "d_alfa val = %f", tmp);
+
+		/*Calculate x=x+alpha*d*/
+		cudaDaxpy<<<blocksPerGrid, threadsPerBlock>>>(matrix->size, d_alfa, d_d, d_x);
+
+		/*Calculate r=r-alpha*q*/
+		cudaDaxpy<<<blocksPerGrid, threadsPerBlock>>>(matrix->size, d_alpha_zero, d_q, d_rhs);
+		/*Assign dot_old = dot_new*/
+        copy_device_variables<<<1, gpu_data->devices[0].warp_size>>>(d_dot_old, d_dot_new);
+
+		/*Calculate dot_new = r'*r*/
+        cudaDElementWiseMult<<<blocksPerGrid, threadsPerBlock>>>(matrix->size, d_rhs, d_rhs, d_helper);
+        cudaDvect_sum<<<blocksPerGrid, threadsPerBlock>>>(d_helper, d_dot_new, blocksPerGrid);
+
+        /*Calculate beta = d_new/d_old */
+        cudaDdiv_scalar<<<1, gpu_data->devices[0].warp_size>>>(d_beta, d_dot_new, d_dot_old);
+
+		/*CG:Calculate d=r+beta*d*/
+		cudaDaxpyz<<<blocksPerGrid, threadsPerBlock>>>(matrix->size, d_beta, d_d, d_d, d_r);
+        COPY_TO_HOST(&h_dot, d_dot_new, sizeof(double));
+        cudaThreadSynchronize();
+		k++;
+	}
+	cusparseDestroy(cusparseHandle);
+	cudaDeviceReset();
+	FREE_DEVICE_STACK
+	return k;
+}
diff --git a/src/GPU/cg_solver_gpu.h b/src/GPU/cg_solver_gpu.h
new file mode 100755
index 0000000..c672817
--- /dev/null
+++ b/src/GPU/cg_solver_gpu.h
@@ -0,0 +1,10 @@
+/*Contains prototypes for gpu_cg_solver functions.*/
+
+#ifndef CG_SOLVER_GPU_H
+#define CG_SOLVER_GPU_H
+#include "../misc/utils.h"
+#include "gpu_utils.h"
+
+int gpu_conjugate_gradient_solver(MatrixInt *matrix, double *x_vec, double *rhs, double *res_vec, GPU_data *gpu_data);
+
+#endif
\ No newline at end of file
diff --git a/src/GPU/ckernels.cu b/src/GPU/ckernels.cu
new file mode 100755
index 0000000..34b25d9
--- /dev/null
+++ b/src/GPU/ckernels.cu
@@ -0,0 +1,124 @@
+/*Contains implementation of custom kernels for CUDA devices.*/
+
+#include "ckernels.h"
+#include <cublas_v2.h>
+#include <cuda_runtime.h>
+
+#define threadsPerBlock 128
+
+const char* cublasGetErrorString(cublasStatus_t status)
+{
+	switch(status)
+	{
+		case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
+		case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
+		case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
+		case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
+		case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
+		case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
+		case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
+		case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
+	}
+	return "unknown error";
+}
+
+__global__ void cudaDdiv_scalar(double *res, double *divided, double *divider){
+	/*Division of scalar elements on a single CUDA thread*/
+	if (threadIdx.x == 0) {
+		res[0] = divided[0] / divider[0];
+	}
+}
+
+__global__ void cudaDaxpy(int num_elements, double *alpha, double *x, double *y){
+	/*Perform computations of AXPY operations: y[i] = y[i] + alpha * x[i]*/
+	unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+	if (i < num_elements) {
+		y[i] = y[i] + alpha[0] * x[i];
+	}
+}
+
+__global__ void cudaDaxpyz(int num_elements, double *alpha, double *y, double *x, double *z){
+	/*Compute y[i] = alpha * x[i] + z[i] */
+	unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+	if (i < num_elements) {
+		y[i] = alpha[0] * x[i] + z[i];
+	}
+}
+
+__global__ void copy_device_variables(double *destination, double *source){
+
+    if (threadIdx.x==0){
+        *destination = *source;
+    }
+}
+
+__global__ void copy_device_vectors(int numElements, double *destination, double *source){
+
+	unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+	if (i < numElements) {
+		destination[i] = source[i];
+	}
+}
+
+__global__ void cudaDElementWiseMult(int numElements, double *x, double *y, double *res){
+
+	__shared__ double temp[threadsPerBlock];
+
+	int i = threadIdx.x + blockIdx.x * blockDim.x;
+	int tid = threadIdx.x;
+	temp[tid] = 0;
+	res[i] = 0;
+	int blockID = blockIdx.x;
+	__syncthreads();
+
+	if (i < numElements){
+		temp[tid]= x[i] * y[i];
+	}
+	__syncthreads();
+
+	for (int s=blockDim.x/2; s>32; s>>=1){
+		if (tid<s){
+			temp[tid] += temp[tid+s];
+		}
+		__syncthreads();
+	}
+	if (tid<32){
+		temp[tid] += temp[tid+32];__syncthreads();
+		temp[tid] += temp[tid+16];__syncthreads();
+		temp[tid] += temp[tid+8];__syncthreads();
+		temp[tid] += temp[tid+4];__syncthreads();
+		temp[tid] += temp[tid+2];__syncthreads();
+		temp[tid] += temp[tid+1];__syncthreads();
+	}
+
+	__syncthreads();
+	if(tid == 0){
+		res[blockID] = temp[0];
+	}
+}
+
+__global__ void cudaDvect_sum(double *input, double *res, int blocksPerGrid){
+
+	__shared__ double temp2[threadsPerBlock];
+	__shared__ double temp3[2];
+	int tid = threadIdx.x;
+	temp3[0] = 0;
+	*res = 0;
+	temp2[tid] = 0;
+	__syncthreads();
+
+	for(int j = 0; j<(int)(blocksPerGrid/threadsPerBlock); j++){
+		temp2[tid] += input[tid*(int)floorf(blocksPerGrid/threadsPerBlock)+j];
+	}
+	__syncthreads();
+
+	if (tid == 0){
+		for (int z = 0; z< threadsPerBlock/2 ; z++){
+			temp3[0] += temp2[z] + temp2[z+threadsPerBlock/2];
+		}
+		res[0] = temp3[0];
+	}
+}
\ No newline at end of file
diff --git a/src/GPU/ckernels.h b/src/GPU/ckernels.h
new file mode 100755
index 0000000..9a8cd3d
--- /dev/null
+++ b/src/GPU/ckernels.h
@@ -0,0 +1,13 @@
+/*Contains prototypes of custom kernels for CUDA devices.*/
+
+#include <cublas_v2.h>
+#include <cuda_runtime.h>
+
+const char* cublasGetErrorString(cublasStatus_t status);
+__global__ void cudaDdiv_scalar(double *res, double *divided, double *divider);
+__global__ void cudaDaxpy(int num_elements, double *alpha, double *x, double *y);
+__global__ void cudaDaxpyz(int num_elements, double *alpha, double *x, double *y, double *z);
+__global__ void copy_device_variables(double *destination, double *source);
+__global__ void copy_device_vectors(int numElements, double *destination, double *source);
+__global__ void cudaDElementWiseMult(int numElements, double *x, double *y, double *res);
+__global__ void cudaDvect_sum(double *input, double *res, int blocksPerGrid);
diff --git a/src/gpu_utils.cu b/src/GPU/gpu_utils.cu
old mode 100644
new mode 100755
similarity index 90%
rename from src/gpu_utils.cu
rename to src/GPU/gpu_utils.cu
index 1b0961d..9b632c4
--- a/src/gpu_utils.cu
+++ b/src/GPU/gpu_utils.cu
@@ -5,6 +5,7 @@
 extern "C" {
 #include <cuda_runtime.h>
 #include "gpu_utils.h"
+#include "../misc/logger.h"
 }
 
 GPU_data *get_gpu_devices_data(){
@@ -15,14 +16,13 @@ GPU_data *get_gpu_devices_data(){
 	cudaError_t device_error;
 	device_error = cudaGetDeviceCount(&gpu_data->devices_number);
 	if (device_error != cudaSuccess)
-		printf("Error - could not read properly number of device, err=[%s] \n", cudaGetErrorString(device_error));
+		logger(LEVEL_WARNING, "Could not read properly number of devices, err=%s", cudaGetErrorString(device_error))
 
 	if (gpu_data->devices_number != 0){
 		gpu_data->devices = (GPU_device *)malloc(gpu_data->devices_number * sizeof(GPU_device));
 		for (int i = 0; i < gpu_data->devices_number; i ++){
 			cudaDeviceProp devProp;
 			cudaGetDeviceProperties(&devProp, i);
-
 			gpu_data->devices[i].name = devProp.name;
 			gpu_data->devices[i].warp_size = devProp.warpSize;
 			gpu_data->devices[i].max_threads_per_block = devProp.maxThreadsPerBlock;
diff --git a/src/gpu_utils.h b/src/GPU/gpu_utils.h
old mode 100644
new mode 100755
similarity index 100%
rename from src/gpu_utils.h
rename to src/GPU/gpu_utils.h
diff --git a/src/cg_colver_gpu.cu b/src/cg_colver_gpu.cu
deleted file mode 100644
index 1ef32ae..0000000
--- a/src/cg_colver_gpu.cu
+++ /dev/null
@@ -1,170 +0,0 @@
-/*Contains implementation for gpu_cg_solver functions.*/
-
-#include <stdio.h>
-#include "ckernels.h"
-
-
-extern "C"
-{
-#include "utils.h"
-#include "gpu_utils.h"
-#include "cg_colver_gpu.h"
-#include <cuda_runtime.h>
-#include <cusparse_v2.h>
-#include <cublas_v2.h>
-}
-
-#define threadsPerBlock 256
-#define CHECK_FOR_STATUS(status) printf("cublas status = %s\n", cublasGetErrorString(status))
-
-#define FREE_DEVICE_STACK \
-	cudaFree(d_r);\
-	cudaFree(d_helper);\
-	cudaFree(d_x);\
-	cudaFree(d_rhs);\
-	cudaFree(d_d);\
-	cudaFree(d_Ax);\
-	cudaFree(d_q);\
-	cudaFree(d_val);\
-	cudaFree(d_I);\
-	cudaFree(d_J);\
-	cudaFree(d_beta);\
-	cudaFree(d_alfa);\
-	cudaFree(d_alpha_zero);\
-	cudaFree(d_dot_new);\
-	cudaFree(d_norm);\
-	cudaFree(d_dot_zero);\
-	cudaFree(d_dot_old);\
-	cudaFree(d_dTq);
-
-
-int gpu_conjugate_gradient_solver(Matrix *matrix, double *x_vec, double *rhs, double *res_vec, GPU_data *gpu_data){
-	/*Single GPU CG solver using cublas*/
-	double *h_dot, *h_dot_zero;
-	int *d_I = NULL, *d_J = NULL;
-	const double tol = 1e-2f;
-	double *d_alfa, *d_beta, *d_alpha_zero;
-	double *d_Ax, *d_x, *d_d, *d_q, *d_rhs, *d_r, *d_helper, *d_norm, *d_dot_new, *d_dot_zero, *d_dot_old, *d_dTq, *d_val;
-	int k, max_iter;
-
-	k = 0;
-	h_dot = 0;
-	h_dot_zero = 0;
-	max_iter = 200;
-
-	size_t size = matrix->size * sizeof(double);
-	size_t d_size = sizeof(double);
-
-	cusparseHandle_t cusparseHandle = 0;
-	cusparseCreate(&cusparseHandle);
-
-	cusparseMatDescr_t descr = 0;
-	cusparseCreateMatDescr(&descr);
-
-	cublasHandle_t cublasHandle = 0;
-	cublasCreate(&cublasHandle);
-
-	cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
-	cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);
-
-	cublasStatus_t cublasStatus;
-
-	printf("Mallocing CUDA divice memory\n");
-	cudaMalloc((void **)&d_r, size);
-	cudaMalloc((void **)&d_helper, size);
-	cudaMalloc((void **)&d_x, size);
-	cudaMalloc((void **)&d_rhs,	size);
-	cudaMalloc((void **)&d_d, size);
-	cudaMalloc((void **)&d_Ax, size);
-	cudaMalloc((void **)&d_q, size);
-	cudaMalloc((void **)&d_val, matrix->non_zero * sizeof(double));
-	cudaMalloc((void **)&d_J, matrix->non_zero * sizeof(int));
-	cudaMalloc((void **)&d_I, (matrix->size + 1) * sizeof(int));
-
-	cudaMalloc((void **)&d_beta, d_size);
-	cudaMalloc((void **)&d_alfa, d_size);
-	cudaMalloc((void **)&d_alpha_zero, d_size);
-	cudaMalloc((void **)&d_dot_new, d_size);
-	cudaMalloc((void **)&d_dot_zero, d_size);
-	cudaMalloc((void **)&d_norm, d_size);
-
-	cudaMemset(d_beta, 0, d_size);
-	cudaMemset(d_alfa, 0, d_size);
-	cudaMemset(d_alpha_zero, 0, d_size);
-	cudaMemset(d_dot_new, 0, d_size);
-	cudaMemset(d_dot_zero, 0, d_size);
-	cudaMemset(d_norm, 0, d_size);
-
-	printf("Copying to device\n");
-	cudaMemcpy(d_val, matrix->val, matrix->non_zero * sizeof(double), cudaMemcpyHostToDevice);
-	cudaMemcpy(d_J, matrix->J_row, matrix->non_zero * sizeof(int), cudaMemcpyHostToDevice);
-	cudaMemcpy(d_I, matrix->I_column, (matrix->size + 1) * sizeof(int), cudaMemcpyHostToDevice);
-	cudaMemcpy(d_x, x_vec, size, cudaMemcpyHostToDevice);
-
-	int blocksPerGrid = ((matrix->size + threadsPerBlock - 1) / threadsPerBlock );
-	while (blocksPerGrid % threadsPerBlock != 0){
-		blocksPerGrid++;
-	}
-	double alpha = 1.0;
-	double beta = 0.0;
-
-	const double one = 1.0;
-	const double minus_one = -1.0;
-	/*Calculate Ax matrix*/
-
-	cusparseDcsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, matrix->size, matrix->size, matrix->non_zero,
-				   &alpha, descr, d_val, d_J, d_I, d_x, &beta, d_Ax);
-	/*Calculate rhs=rhs-Ax matrix*/
-	cublasStatus = cublasDaxpy(cublasHandle, matrix->size, &minus_one, d_Ax, 1, d_rhs, 1);
-	CHECK_FOR_STATUS(cublasStatus);
-
-	/*CG: Copy updated rhs (residuum) to d vector*/
-	cublasStatus = cublasDcopy(cublasHandle, matrix->size, d_d, 1, d_rhs, 1);
-	CHECK_FOR_STATUS(cublasStatus);
-
-	/*CG: calculate dot r'*r, assign it to d_dot_new */
-	cublasStatus = cublasDdot(cublasHandle, matrix->size, d_rhs, 1, d_rhs, 1, d_dot_new);
-	CHECK_FOR_STATUS(cublasStatus);
-
-	/*assign dot_new to dot_zero*/
-	d_dot_zero = d_dot_new;
-	cudaMemcpy(h_dot, d_dot_new,  sizeof(double), cudaMemcpyDeviceToHost);
-	cudaMemcpy(h_dot_zero, d_dot_zero,  sizeof(double), cudaMemcpyDeviceToHost);
-	while ((*h_dot >  tol * tol * *h_dot_zero) && (k < max_iter)) {
-		/*Calculate q=A*d vector*/
-		cusparseDcsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, matrix->size, matrix->size, matrix->non_zero,
-						  &alpha, descr, d_val, d_J, d_I, d_x, &beta, d_Ax);
-		/*Calculate alpha:*/
-		cublasStatus = cublasDdot(cublasHandle, matrix->size, d_d, 1, d_q, 1, d_dTq);
-		CHECK_FOR_STATUS(cublasStatus);
-
-		sDdiv<<<1, gpu_data->devices[0].warp_size>>>(d_alfa, d_dot_new, d_dTq);
-		/*Calculate x=x+alpha*d*/
-		cublasStatus = cublasDaxpy(cublasHandle, matrix->size, d_alfa, d_x, 1, d_d, 1);
-		CHECK_FOR_STATUS(cublasStatus);
-
-		/*Calculate r=r-alpha*q*/
-		axpy<<<blocksPerGrid, threadsPerBlock>>>(matrix->size, -1, d_q, d_rhs);
-		/*Assign dot_old = dot_new*/
-		cublasStatus = cublasDcopy(cublasHandle, 1, d_dot_old, 1, d_dot_new, 1);
-		CHECK_FOR_STATUS(cublasStatus);
-
-		/*CG:Assign dot_new = r'*r*/
-		cublasStatus = cublasDdot(cublasHandle, matrix->size, d_rhs, 1, d_rhs, 1, d_dot_new);
-		CHECK_FOR_STATUS(cublasStatus);
-
-		sDdiv<<<1, gpu_data->devices[0].warp_size>>>(d_beta, d_dot_new, d_dot_old);
-		/*Scale beta*d*/
-		cublasStatus = cublasDscal(cublasHandle, matrix->size, d_beta, d_d, 1);
-		CHECK_FOR_STATUS(cublasStatus);
-
-		/*CG:Calculate d=r+beta*d*/
-		cublasStatus = cublasDaxpy(cublasHandle, matrix->size, &one, d_rhs, 1, d_d, 1);
-		CHECK_FOR_STATUS(cublasStatus);
-		k++;
-	}
-	cusparseDestroy(cusparseHandle);
-	cudaDeviceReset();
-	FREE_DEVICE_STACK
-	return k;
-}
diff --git a/src/cg_colver_gpu.h b/src/cg_colver_gpu.h
deleted file mode 100644
index 9ba1e10..0000000
--- a/src/cg_colver_gpu.h
+++ /dev/null
@@ -1,10 +0,0 @@
-/*Contains prototypes for gpu_cg_solver functions.*/
-
-#ifndef CG_SOLVER_GPU_H
-#define CG_SOLVER_GPU_H
-#include "utils.h"
-#include "gpu_utils.h"
-
-int gpu_conjugate_gradient_solver(Matrix *matrix, double *x_vec, double *rhs, double *res_vec, GPU_data *gpu_data);
-
-#endif
\ No newline at end of file
diff --git a/src/ckernels.cu b/src/ckernels.cu
deleted file mode 100644
index 2fb3b0d..0000000
--- a/src/ckernels.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*Contains implementation of custom kernels for CUDA devices.*/
-
-#include "ckernels.h"
-#include <cublas_v2.h>
-
-const char* cublasGetErrorString(cublasStatus_t status)
-{
-	switch(status)
-	{
-		case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
-		case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
-		case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
-		case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
-		case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
-		case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
-		case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
-		case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
-	}
-	return "unknown error";
-}
-
-__global__ void sDdiv(double *res, double *divided, double *divider) {
-	/*Division of scalar elements on a single CUDA thread*/
-	if (threadIdx.x == 0) {
-		res[0] = divided[0] / divider[0];
-	}
-}
-
-__global__ void axpy(int num_elements, double alpha, double *x, double *y) {
-	/*Perform computations of AXPY operations: y[i] = y[i] + alpha * x[i]*/
-	unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
-
-	if (i < num_elements) {
-		y[i] = y[i] + alpha * x[i];
-	}
-}
-
diff --git a/src/ckernels.h b/src/ckernels.h
deleted file mode 100644
index d4dbb2e..0000000
--- a/src/ckernels.h
+++ /dev/null
@@ -1,7 +0,0 @@
-/*Contains prototypes of custom kernels for CUDA devices.*/
-
-#include <cublas_v2.h>
-
-const char* cublasGetErrorString(cublasStatus_t status);
-__global__ void sDdiv(double *res, double *divided, double *divider);
-__global__ void axpy(int num_elements, double alpha, double *x, double *y);
diff --git a/src/conjugate_gradients.c b/src/conjugate_gradients.c
new file mode 100755
index 0000000..763a690
--- /dev/null
+++ b/src/conjugate_gradients.c
@@ -0,0 +1,73 @@
+/* File contains implementation of Conjugate Gradient method using Intel MKL library and nVidia CUDA.
+ *
+ * Author: Pawel Stoworowicz
+ *
+ * */
+
+#define FREE_STACK_COMMON \
+    free(matrix);\
+    free(input_cfg);\
+
+#define FREE_STACK_ALL \
+    FREE_STACK_COMMON\
+    free(res_vec);\
+    free(b_vec);\
+    free(x_vec);
+
+//mkl_free(res_vec);\
+	mkl_free(b_vec);\
+	mkl_free(x_vec);
+
+
+#include <stdlib.h>
+#include <omp.h>
+#include "stdio.h"
+#include "misc/utils.h"
+#include "mkl.h"
+#include "misc/logger.h"
+
+
+int main(int argc, char **argv) {
+    /*Main function - thanks captain obvious!*/
+    logger_clean();
+    logger(LEVEL_INFO, "%s", "Conjugate Gradient solver using MKL.");
+    InputConfig *input_cfg;
+    input_cfg = arg_parser(argc, argv);
+    logger(LEVEL_INFO, "Num of threads: %d", input_cfg->num_of_threads);
+    omp_set_num_threads(input_cfg->num_of_threads);
+    MatrixInt *matrix;
+    matrix = getMatrixCRSInt(input_cfg);
+    if (matrix == NULL) {
+        logger(LEVEL_ERROR, "%s", "ERROR - could not read matrix, exiting.");
+        FREE_STACK_COMMON;
+        exit(1);
+    }
+    logger(LEVEL_DEBUG, "%s", "Matrix parameters:");
+    logger(LEVEL_DEBUG, "Size = %d, non-zero elements = %d", (int) matrix->size, (int) matrix->non_zero);
+
+    double *x_vec, *b_vec, *res_vec;
+    double t_start, t_stop;
+    /*
+    x_vec = (double *)mkl_malloc(matrix->size * sizeof(double), 64);
+    b_vec = (double *)mkl_malloc(matrix->size * sizeof(double), 64);
+    res_vec = (double *)mkl_malloc(matrix->size * sizeof(double), 64);
+    */
+
+    x_vec = (double *) malloc(matrix->size * sizeof(double));
+    b_vec = (double *) malloc(matrix->size * sizeof(double));
+    res_vec = (double *) malloc(matrix->size * sizeof(double));
+
+    int total_iter = 0;
+    for (int i = 0; i < matrix->size; i++) {
+        x_vec[i] = 1.0;
+        b_vec[i] = 0.0;
+        res_vec[i] = 0.0;
+    }
+    t_start = get_time();
+    total_iter = launch_solver(matrix, x_vec, b_vec, res_vec, input_cfg);
+    t_stop = get_time();
+    logger(LEVEL_INFO, "Conjugate gradient method finished within %.3f [secs] in total %d iterations.", t_stop - t_start, total_iter);
+
+    FREE_STACK_ALL;
+    exit(0);
+}
diff --git a/src/misc/logger.h b/src/misc/logger.h
new file mode 100755
index 0000000..52fc570
--- /dev/null
+++ b/src/misc/logger.h
@@ -0,0 +1,79 @@
+/* Simple logger code. Nothing fancy here.
+ *
+ * Author: Pawel Stoworowicz
+ *
+ * */
+
+#ifndef LOGGER_H
+#define LOGGER_H
+
+#ifdef DEBUG
+#define DDEBUG 1
+#else
+#define DDEBUG 0
+#endif
+
+#define LEVEL_DEBUG -1
+#define LEVEL_INFO 0
+#define LEVEL_WARNING 1
+#define LEVEL_ERROR 2
+
+#ifndef DEBUG_FLAG
+#define DEBUG_FLAG 0
+#endif
+
+#include <time.h>
+#include <stdlib.h>
+#include "utils.h"
+#include <string.h>
+
+#define _LOG_FIL_NAME "execution.log"
+
+#define logger_clean()\
+    FILE *logger_fp;\
+    logger_fp = fopen(_LOG_FIL_NAME, "w+");\
+    fclose(logger_fp);\
+
+#define log_to_file(buffer, args...){\
+    FILE *logger_fp;\
+    logger_fp = fopen(_LOG_FIL_NAME, "a");\
+    fprintf(logger_fp, buffer, ##args);\
+    fclose(logger_fp);\
+}
+
+/*GCC prints warning regarding args... notation - works fine for CLang!*/
+#define GCC_COMPILER (defined(__GNUC__) && !defined(__clang__))
+#if GCC_COMPILER
+#pragma GCC diagnostic ignored "-Wformat-extra-args"
+#endif
+
+#define _LOG_DATA_STDERR(buffer, fmt, args...){\
+    strcat(buffer, fmt); strcat(buffer, "\n"); fprintf(stderr, buffer, ##args); log_to_file(buffer, ##args);\
+    }
+
+#define _LOG_DATA_STDOUT(buffer, fmt, args...){\
+    strcat(buffer, fmt); strcat(buffer, "\n"); fprintf(stdout, buffer, ##args); log_to_file(buffer, ##args);\
+    }
+
+
+#define logger(log_level, fmt, args...){\
+    char buffer[1024*sizeof(char)];\
+    struct tm* tm_info;\
+    time_t timer;\
+    time(&timer);\
+    tm_info = localtime(&timer);\
+    strftime(buffer, 1024*sizeof(char), "%T", tm_info);\
+    switch(log_level){\
+        case LEVEL_INFO: strcat(buffer, " [INFO] "); _LOG_DATA_STDOUT(buffer, fmt, ##args) break;\
+        case LEVEL_WARNING: strcat(buffer, " [WARNING] "); _LOG_DATA_STDOUT(buffer, fmt, ##args) break;\
+        case LEVEL_ERROR: strcat(buffer, " ***[ERROR]*** "); _LOG_DATA_STDERR(buffer, fmt, ##args) break;\
+        case LEVEL_DEBUG: if(DDEBUG){strcat(buffer, " [DEBUG] "); _LOG_DATA_STDOUT(buffer, fmt, ##args);\
+        break;}\
+            else{break;}\
+        default: strcat(buffer, " [ERROR] Unknown level: "); break;\
+    }/*switch*/\
+}/*logger*/
+#if GCC_COMPILER
+#pragma GCC diagnostic pop
+#endif /*pragma*/
+#endif /*LOGGER_H*/
diff --git a/src/misc/math_functions.c b/src/misc/math_functions.c
new file mode 100755
index 0000000..a410239
--- /dev/null
+++ b/src/misc/math_functions.c
@@ -0,0 +1,14 @@
+/*Contains implementations of custom math functions*/
+
+#include <math.h>
+#include <omp.h>
+#include "math_functions.h"
+
+double euclidean_norm(int size, double *vector){
+    double res = 0.0;
+    #pragma omp parallel for
+    for (int i = 0; i < size; i++){
+        res += pow(vector[i], 2);
+    }
+    return sqrt(res);
+}
diff --git a/src/misc/math_functions.h b/src/misc/math_functions.h
new file mode 100755
index 0000000..873cf55
--- /dev/null
+++ b/src/misc/math_functions.h
@@ -0,0 +1,3 @@
+/*Contains prototypes for custom math functions*/
+
+double euclidean_norm(int size, double *vector);
diff --git a/src/utils.c b/src/misc/utils.c
old mode 100644
new mode 100755
similarity index 57%
rename from src/utils.c
rename to src/misc/utils.c
index a467e70..44b1c7f
--- a/src/utils.c
+++ b/src/misc/utils.c
@@ -8,13 +8,14 @@
 
 #include <string.h>
 #include <stdio.h>
-#include "cg_solver.h"
-#include "pcg_solver.h"
+#include "../CPU/cg_solver.h"
+#include "../CPU/pcg_solver.h"
 #include "utils.h"
 #include "mkl.h"
-#include "gpu_utils.h"
-#include "cg_colver_gpu.h"
+#include "../GPU/gpu_utils.h"
+#include "../GPU/cg_solver_gpu.h"
 #include <cuda_runtime.h>
+#include "logger.h"
 
 double get_time(){
 	/* Return ``double`` time stamp for execution time measurements. Implementation depends on OS. */
@@ -43,7 +44,7 @@ Matrix* getMatrixCRS(InputConfig *input_cfg){
 	FILE *pf;
 	pf = fopen(fil_name, "r");
 	if(pf == NULL){
-		printf("Can't open the file: %s", fil_name);
+		logger(LEVEL_ERROR, "Can't open the file: %s", fil_name);
 	}
 
 	double v;
@@ -57,7 +58,7 @@ Matrix* getMatrixCRS(InputConfig *input_cfg){
 	matrix->size = matsize;
 	matrix->non_zero = non_zero;
 	matrix->I_column = (MKL_INT *)mkl_malloc((matsize + 1) * sizeof(MKL_INT), 64);
-	matrix->J_row = (MKL_INT *)mkl_malloc(non_zero * sizeof(MKL_INT), 64 );
+	matrix->J_row = (MKL_INT *)mkl_malloc(non_zero * sizeof(MKL_INT), 64);
 	matrix->val = (double *)mkl_malloc(non_zero * sizeof(double), 64);
 
 	int n = 0;
@@ -80,6 +81,52 @@ Matrix* getMatrixCRS(InputConfig *input_cfg){
 	return matrix;
 }
 
+MatrixInt* getMatrixCRSInt(InputConfig *input_cfg){
+    /* Load Matrix stored in CSR format from file.*/
+    char *fil_name;
+    fil_name = input_cfg->filename;
+    if (fil_name == NULL)
+        return NULL;
+    FILE *pf;
+    pf = fopen(fil_name, "r");
+    if(pf == NULL){
+        logger(LEVEL_ERROR, "Can't open the file: %s", fil_name);
+    }
+
+    double v;
+    int c, non_zero, inx;
+    int matsize, max_rows;
+
+    // Load first line of file with information about size and non zero element in matrix
+    fscanf(pf,"%d %d %d", &matsize, &non_zero, &max_rows);
+
+    MatrixInt *matrix = (MatrixInt *)malloc(sizeof(MatrixInt));
+    matrix->size = matsize;
+    matrix->non_zero = non_zero;
+    matrix->I_column = (int *)malloc((matsize + 1) * sizeof(int));
+    matrix->J_row = (int *)malloc(non_zero * sizeof(int));
+    matrix->val = (double *)malloc(non_zero * sizeof(double));
+
+    int n = 0;
+    while( ! feof(pf)){
+        if(n < matsize + 1){
+            fscanf(pf, "%le %d %d", &v, &c, &inx);
+            matrix->val[n] = v;
+            matrix->J_row[n] = c;
+            matrix->I_column[n] = inx;
+        }
+        else{
+            fscanf(pf, "%le %d", &v, &c);
+            matrix->val[n] = v;
+            matrix->J_row[n] = c;
+        }
+        n++;
+    }
+    fclose(pf);
+    pf = NULL;
+    return matrix;
+}
+
 InputConfig* arg_parser(int argc, char **argv){
 	/*Parse argument line*/
 	InputConfig *input_cfg = (InputConfig *)malloc(sizeof(InputConfig));
@@ -109,24 +156,24 @@ InputConfig* arg_parser(int argc, char **argv){
 	return input_cfg;
 }
 
-int launch_solver(Matrix *matrix, double *x_vec, double *b_vec, double *res_vec, InputConfig *input_cfg){
+int launch_solver(MatrixInt *matrix, double *x_vec, double *b_vec, double *res_vec, InputConfig *input_cfg){
 	/*Launch proper solver based on input config data.*/
 	GPU_data *gpu_data;
 	gpu_data = get_gpu_devices_data();
 
 	int total_iter = 0;
 	if (gpu_data->devices_number > 0 && input_cfg->gpu == 1){
-		printf("Launching GPU CG.\n");
-		printf("Number of CUDA devices: %d\n", gpu_data->devices_number);
+		logger(LEVEL_INFO, "%s", "Launching GPU CG.");
+		logger(LEVEL_INFO, "Number of CUDA devices: %d", gpu_data->devices_number);
 		cudaDeviceReset();
 		total_iter = gpu_conjugate_gradient_solver(matrix, x_vec, b_vec, res_vec, gpu_data);
 	} else {
 		if (input_cfg->preconditioner == NULL){
-			printf("Launching CPU CG.\n");
-			total_iter = conjugate_gradient_solver(matrix, x_vec, b_vec, res_vec);
+			logger(LEVEL_INFO, "%s", "Launching CPU CG.");
+			//total_iter = conjugate_gradient_solver(matrix, x_vec, b_vec, res_vec);
 		} else {
-			printf("Launching CPU PCG with %s preconditioner.\n", input_cfg->preconditioner);
-			total_iter = pcg_solver(matrix, x_vec, b_vec, res_vec, input_cfg->preconditioner);
+			logger(LEVEL_INFO, "Launching CPU PCG with %s preconditioner.", input_cfg->preconditioner);
+			//total_iter = pcg_solver(matrix, x_vec, b_vec, res_vec, input_cfg->preconditioner);
 		}
 	}
 	free(gpu_data);
diff --git a/src/utils.h b/src/misc/utils.h
old mode 100644
new mode 100755
similarity index 61%
rename from src/utils.h
rename to src/misc/utils.h
index 2f1a169..ae7fb6d
--- a/src/utils.h
+++ b/src/misc/utils.h
@@ -14,6 +14,14 @@ typedef struct {
 	double *val;
 }Matrix;
 
+typedef struct {
+    int size;
+    int non_zero;
+    int *I_column;
+    int *J_row;
+    double *val;
+}MatrixInt;
+
 typedef struct{
 	char *filename;
 	int num_of_threads;
@@ -23,7 +31,10 @@ typedef struct{
 
 Matrix* getMatrixCRS(InputConfig *input_cfg);
 
+MatrixInt* getMatrixCRSInt(InputConfig *input_cfg);
+
 InputConfig* arg_parser(int argc, char **argv);
 
-int launch_solver(Matrix *matrix, double *x_vec, double *b_vec, double *res_vec, InputConfig *input_cfg);
+int launch_solver(MatrixInt *matrix, double *x_vec, double *b_vec, double *res_vec, InputConfig *input_cfg);
+
 #endif
\ No newline at end of file
diff --git a/src/solver.c b/src/solver.c
deleted file mode 100644
index 395c0db..0000000
--- a/src/solver.c
+++ /dev/null
@@ -1,60 +0,0 @@
-/* File contains implementation of Conjugate Gradient method using Intel MKL library.
- *
- * Author: Pawel Stoworowicz
- * Contact: stoworow@gmail.com
- *
- * */
-
-#define FREE_STACK_COMMON \
-	free(matrix);\
-	free(input_cfg);\
-
-#define FREE_STACK_ALL \
-	FREE_STACK_COMMON\
-	mkl_free(res_vec);\
-	mkl_free(b_vec);\
-	mkl_free(x_vec)
-
-#include <stdlib.h>
-#include <omp.h>
-#include "stdio.h"
-#include "utils.h"
-#include "mkl.h"
-
-
-int main(int argc, char** argv){
-	/*Main function - thanks captain obvious!*/
-	printf("Conjugate Gradient solver using MKL.\n");
-	InputConfig *input_cfg;
-	input_cfg = arg_parser(argc, argv);
-	printf("Num of threads: %d\n", input_cfg->num_of_threads);
-	omp_set_num_threads(input_cfg->num_of_threads);
-	Matrix *matrix;
-	matrix = getMatrixCRS(input_cfg);
-	if (matrix == NULL){
-		printf("ERROR - could not read matrix, exiting.\n");
-		FREE_STACK_COMMON;
-		exit(1);
-	}
-	printf("Matrix parameters:\n");
-	printf("Size = %d, non-zero elements = %d\n", (int)matrix->size, (int)matrix->non_zero);
-
-	double *x_vec, *b_vec, *res_vec;
-	double t_start, t_stop;
-	x_vec = (double *)mkl_malloc(matrix->size * sizeof(double), 64);
-	b_vec = (double *)mkl_malloc(matrix->size * sizeof(double), 64);
-	res_vec = (double *)mkl_malloc(matrix->size * sizeof(double), 64);
-	int total_iter = 0;
-	for (int i = 0; i < matrix->size; i++) {
-		x_vec[i] = 1;
-		b_vec[i] = 0;
-		res_vec[i] = 0;
-	}
-	t_start = get_time();
-	total_iter = launch_solver(matrix, x_vec, b_vec, res_vec, input_cfg);
-	t_stop = get_time();
-	printf("Conjugate gradient method finished within %.3f [secs] in total %d iterations.\n", t_stop - t_start, total_iter);
-
-	FREE_STACK_ALL;
-	exit(0);
-}