#include <stdio.h>
#include <stdlib.h>
#include <time.h>

// Build-time configuration.
// BLOCKSIZE is used both as the threads-per-block launch argument in main()
// and in the kernel's row-index computation.
#define BLOCKSIZE 	8 	// Threads per block
// Input matrix file: read as three integers (n, m, nz) followed, for each of
// the n rows, by the row length and then (column index, value) pairs.
#define FILENAMEMAT	"NewA.txt"	// Input file holding the sparse matrix
#define FILENAMEVECIN	"Newx.txt"	// Input file holding the vector x
#define FILENAMEVECOUT	"Ax.txt"	// Output file for the CPU-computed product
// NOTE: differs from FILENAMEVECOUT only by letter case; on a case-insensitive
// filesystem both names refer to the same file and the later write wins.
#define FILENAMEVECOUTGPU	"AX.txt"	// Output file for the GPU-computed product
#define NIter		100	// Number of times each product is repeated

//Kernel definition
/*
 * Sparse matrix-vector product Ax = A*x, one thread per matrix row.
 *
 * The matrix is given in compressed-row form:
 *   StartPoints_d  row pointers; the stored entries of row i occupy positions
 *                  StartPoints_d[i] .. StartPoints_d[i+1]-1
 *   ColIndices_d   column index of every stored entry
 *   MatVals_d      value of every stored entry
 *   x_d            input vector, indexed by the values in ColIndices_d
 *   Ax_d           output vector; element i receives the dot product of row i
 *                  with x_d (0 for a row with no stored entries)
 *
 * Launch layout: 1-D grid of 1-D blocks of any size. The kernel receives no
 * row count and therefore has no bounds guard: every launched thread reads
 * StartPoints_d[i] and StartPoints_d[i+1] and writes Ax_d[i], so the caller
 * must size those arrays for gridDim.x*blockDim.x rows. No shared memory.
 * The input arrays are declared __restrict__ and must not overlap Ax_d.
 */
__global__ void GPUSPMV1(const int * __restrict__ StartPoints_d,
                         const int * __restrict__ ColIndices_d,
                         const float * __restrict__ MatVals_d,
                         const float * __restrict__ x_d,
                         float * __restrict__ Ax_d){
 // Use the real launch block size rather than a compile-time constant so the
 // row index stays correct for any execution configuration.
 const int row = blockIdx.x*blockDim.x + threadIdx.x;

 // Read the row bounds once instead of re-reading them every iteration.
 const int first = StartPoints_d[row];
 const int last  = StartPoints_d[row+1];

 float sum = 0.0f;
 for(int k = first; k < last; k++){
  sum += MatVals_d[k]*x_d[ColIndices_d[k]];
 }
 Ax_d[row] = sum;
}
 
/* Abort with file/line and the runtime's message when a CUDA call fails. */
#define CUDA_CHECK(call)                                                  \
 do {                                                                     \
  cudaError_t cudaCheckErr_ = (call);                                     \
  if (cudaCheckErr_ != cudaSuccess) {                                     \
   fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__,       \
           cudaGetErrorString(cudaCheckErr_));                            \
   exit(EXIT_FAILURE);                                                    \
  }                                                                       \
 } while (0)

/* Prints "<what>: <detail>" to stderr and terminates the program. */
static void die(const char *what, const char *detail)
{
 fprintf(stderr, "%s: %s\n", what, detail);
 exit(EXIT_FAILURE);
}

/* Opens a file or terminates with the system error message. */
static FILE *openFile(const char *path, const char *mode)
{
 FILE *fp = fopen(path, mode);
 if (fp == NULL) {
  perror(path);
  exit(EXIT_FAILURE);
 }
 return fp;
}

/* Host allocation of count elements (at least one) that never returns NULL. */
static void *hostAlloc(size_t count, size_t elemSize)
{
 void *p = malloc((count > 0 ? count : 1)*elemSize);
 if (p == NULL) die("malloc", "out of host memory");
 return p;
}

/* Device allocation of count elements (at least one); aborts on failure. */
static void *deviceAlloc(size_t count, size_t elemSize)
{
 void *p = NULL;
 CUDA_CHECK(cudaMalloc(&p, (count > 0 ? count : 1)*elemSize));
 return p;
}

/*
 * Reads a sparse matrix (FILENAMEMAT) and a vector (FILENAMEVECIN), forms the
 * product NIter times on the GPU and NIter times on the CPU, writes the GPU
 * result to FILENAMEVECOUTGPU and the CPU result to FILENAMEVECOUT, and prints
 * the average CPU time per product. Returns 0 on success; any I/O, allocation
 * or CUDA failure terminates the program with a message on stderr.
 */
int main(void)
{
 int 	n, m, nz;           		//rows, columns, total number of stored non-zeros
 int 	*StartPoints, *ColIndices; 	//row pointers and column index of each entry
 int 	*StartPoints_d, *ColIndices_d; 	//device copies
 float 	*MatVals, *x, *Ax;                 	//matrix entries, input vector, product
 float 	*MatVals_d, *x_d, *Ax_d;                 	//device copies
 float 	sum;			//accumulator for matrix-vector product
 struct timespec	t0, t1;			//timings for CPU
 FILE *fp;
 int i, j, run;

 //Reading the matrix header
 fp = openFile(FILENAMEMAT, "r");
 if (fscanf(fp, "%i %i %i", &n, &m, &nz) != 3 || n < 0 || m < 0 || nz < 0)
  die(FILENAMEMAT, "bad header (expected three non-negative integers)");

 //GPUSPMV1 takes no row count and so cannot guard against surplus threads.
 //Instead of truncating the grid (which would leave the last n%BLOCKSIZE rows
 //uncomputed), round the row count up to a whole number of blocks and give
 //the extra rows zero length.
 const int    blocks = n/BLOCKSIZE + (n%BLOCKSIZE != 0);
 const size_t nPad   = (size_t)blocks*BLOCKSIZE;

 //Allocating Host memory for matrix
 StartPoints = (int *)   hostAlloc(nPad + 1, sizeof(int));
 ColIndices  = (int *)   hostAlloc((size_t)nz, sizeof(int));
 MatVals     = (float *) hostAlloc((size_t)nz, sizeof(float));

 //Reading matrix from file, validating everything later used as an index
 StartPoints[0] = 0;
 for (i = 0; i < n; i++) {
  int rowLen;
  if (fscanf(fp, "%i", &rowLen) != 1)
   die(FILENAMEMAT, "missing row length");
  if (rowLen < 0 || rowLen > nz - StartPoints[i])
   die(FILENAMEMAT, "row lengths exceed the declared number of non-zeros");
  StartPoints[i+1] = StartPoints[i] + rowLen;
  for (j = StartPoints[i]; j < StartPoints[i+1]; j++) {
   if (fscanf(fp, "%i", ColIndices + j) != 1 || fscanf(fp, "%f", MatVals + j) != 1)
    die(FILENAMEMAT, "truncated matrix entry");
   if (ColIndices[j] < 0 || ColIndices[j] >= m)
    die(FILENAMEMAT, "column index out of range");
  }
 }
 fclose(fp);
 //Padding rows are empty: start == end == number of entries actually read
 for (size_t k = (size_t)n; k < nPad; k++) {
  StartPoints[k+1] = StartPoints[n];
 }

 //Allocating space for and copying the matrix to the GPU
 StartPoints_d = (int *)   deviceAlloc(nPad + 1, sizeof(int));
 ColIndices_d  = (int *)   deviceAlloc((size_t)nz, sizeof(int));
 MatVals_d     = (float *) deviceAlloc((size_t)nz, sizeof(float));
 CUDA_CHECK(cudaMemcpy(StartPoints_d, StartPoints, sizeof(int)*(nPad + 1), cudaMemcpyHostToDevice));
 CUDA_CHECK(cudaMemcpy(ColIndices_d, ColIndices, sizeof(int)*nz, cudaMemcpyHostToDevice));
 CUDA_CHECK(cudaMemcpy(MatVals_d, MatVals, sizeof(float)*nz, cudaMemcpyHostToDevice));

 //Allocating host memory for and reading the input vector.
 //x is indexed by column index, so it has one entry per column (m), not per row.
 x = (float *) hostAlloc((size_t)m, sizeof(float));
 fp = openFile(FILENAMEVECIN, "r");
 for (i = 0; i < m; i++) {
  if (fscanf(fp, "%f", x + i) != 1)
   die(FILENAMEVECIN, "fewer entries than matrix columns");
 }
 fclose(fp);

 //Allocating space for and copying the input vector to the GPU
 x_d = (float *) deviceAlloc((size_t)m, sizeof(float));
 CUDA_CHECK(cudaMemcpy(x_d, x, sizeof(float)*m, cudaMemcpyHostToDevice));

 //Product vector: n entries on the host, padded to the launched thread count
 //on the device because every launched thread writes one element.
 Ax   = (float *) hostAlloc((size_t)n, sizeof(float));
 Ax_d = (float *) deviceAlloc(nPad, sizeof(float));

 //Computing the sparse product on the GPU (a zero-sized grid is not a valid launch)
 if (blocks > 0) {
  for (run = 0; run < NIter; run++) {
   GPUSPMV1<<<blocks,BLOCKSIZE>>>(StartPoints_d, ColIndices_d, MatVals_d, x_d, Ax_d);
   CUDA_CHECK(cudaGetLastError());	//launch-configuration errors
  }
  CUDA_CHECK(cudaDeviceSynchronize());	//errors raised while the kernels ran
 }

 //Retrieving the device product vector (only the n real rows)
 CUDA_CHECK(cudaMemcpy(Ax, Ax_d, sizeof(float)*n, cudaMemcpyDeviceToHost));

 //Dumping the GPU answer to a file
 fp = openFile(FILENAMEVECOUTGPU, "w");
 for (i = 0; i < n; i++) {
  fprintf(fp, "%f \n", Ax[i]);
 }
 fclose(fp);

 //Form sparse product on CPU
 clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &t0);

 for (run = 0; run < NIter; run++) {
  for (i = 0; i < n; i++) {
   sum = 0.0f;
   for (j = StartPoints[i]; j < StartPoints[i+1]; j++) {
    sum += MatVals[j]*x[ColIndices[j]];
   }
   Ax[i] = sum;
  }
 }

 clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &t1);
 //Elapsed time needs both timespec fields; tv_nsec alone wraps every second.
 const double elapsed = (double)(t1.tv_sec - t0.tv_sec)
                      + (1.0e-9)*(double)(t1.tv_nsec - t0.tv_nsec);
 printf("time is  are %lg \n", elapsed/NIter);

 //Dumping the CPU answer to a file
 fp = openFile(FILENAMEVECOUT, "w");
 for (i = 0; i < n; i++) {
  fprintf(fp, "%f \n", Ax[i]);
 }
 fclose(fp);

 //Clearing memory
 free(StartPoints);	CUDA_CHECK(cudaFree(StartPoints_d));
 free(ColIndices);	CUDA_CHECK(cudaFree(ColIndices_d));
 free(MatVals);		CUDA_CHECK(cudaFree(MatVals_d));
 free(x);		CUDA_CHECK(cudaFree(x_d));
 free(Ax);		CUDA_CHECK(cudaFree(Ax_d));

 return 0;
}