tali

2D Reduce Kernel does not work !!!!!

Discussion created by tali on Jan 9, 2010
Latest reply on Jan 29, 2010 by tali
Strange error in 2D reduce kernel !!!!!

Hi 

I am implementing a sparse vector multiply routine to do  a simple y=Ax.

Due to size limitations of 8192 in 1D streams, i have to extend the code to 2D streams. 

This is where i get the problem.

I get my final y-vector by reducing the data in a tempStream<>.

 

The problem is that the result in my reduce data is sometimes corrupted for certain sizes of matrix.

However I have tried to manually reduce the stream on the CPU in a simple for-loop and then the result if OK. 

I cannot belive that the Brook+ 2D kernel has this strange problem. 

I have attached the code, any help from any one will be of great help as i am stucked with it for days.............

 

 

//--------------------------------------------------------------------- The KERNELS //--------------------------------------------------------------------- kernel void spmv_gather_2D(int C[][], float x[][], int Max_Stream_Width, out float result<>) { int2 i = instance().xy; // result<> will be a 2D stream of size > Max_Stream_Width // C is also a 2D stream int k = C[i.y][i.x]; // index into C // this will return some index greater than Max_Stream_Width // so int2 k2d; // to index into the 2d X-stream k2d.x = k % Max_Stream_Width; k2d.y = k / Max_Stream_Width; // finally, the gather result result = x[k2d.y][k2d.x]; } kernel void spmv_mul_2D(float a<>, float x<>, out float c<>) { c = a * x; } reduce void spmv_reduce_2D(float temp<>, reduce float y<>) { y += temp; } //--------------------------------------------------------------------- main.cpp //--------------------------------------------------------------------- #include <stdio.h> #include <stdlib.h> #include <time.h> #include <math.h> #include "spmv.h" //include the brook generated header //#include "brookgenfiles/spmv.h" //include the brook generated header //------------------------------------------------------------------------------------------- // prototypes //------------------------------------------------------------------------------------------- void printSpM(int N, int *R, int *C, float *A); void sprand(int nrows,int ncols,int nnz, int R[],int C[],float A[], int*); void SpMV(int nrows, int ncols, int *R, int *C, float *A, float *x, float *y); void saveToFile(int nrows, int*R, int*C, float*A, float*y); void prepareTestData(int nnz, int *R, int * C, float* A); void printMatrix(float* A, int* C, int*R, int N, int M); void outputGPUData(float* A, int* C, int*R, float *x, float* newA, int* newC, float *newX, int N, int nMaxWidth ); void prepareGPUData(float *A, int *C, int* R, float *newA, int *newC, int nnz, int nrows, int iMaxWidth ); // stream creation int* create2DFloatStream(::brook::Stream<float>** stream, int MAX_STREAM_SIZE, float** data, int dataLen); int* create2DFloatStream(::brook::Stream<float>** stream, int MAX_STREAM_SIZE, int streamLen); int* create2DIntStream(::brook::Stream<int>** stream, int MAX_STREAM_SIZE, int** data, int dataLen); int* create2DIntStream(::brook::Stream<int>** stream, int MAX_STREAM_SIZE, int streamLen); //-------------------------------------------------------------------------------------------- // Main //-------------------------------------------------------------------------------------------- int main(){ //real data // int ncols = 10000; // int nrows = 10000; // int nnz = (nrows * ncols)*((25.0f)/100.0f) ; //25% sparsity 25 000 000 // int MAX_STREAM_SIZE = 8192; // set it to 8192 = 8*1024 = 8 Kb // Error Data 1 int ncols = 100; int nrows = 100; int nnz = 7500; //75% sparsity int MAX_STREAM_SIZE = ncols/2; // set it to 8192 = 8*1024 = 8 Kb // Test Data 1 // int ncols = 5; // int nrows = 5; // int nnz = 11; //25% sparsity // int MAX_STREAM_SIZE = 4; // set it to 8192 = 8*1024 = 8 Kb //Sparse Matrix Vectors- Single Precision int N, *R, *C; float *A, *x, *y; //timers clock_t start, finish; //allocate memory N = nrows*ncols; R = (int*) malloc((nrows+1) * sizeof (int)); //Equal to no. of rows +1 C = (int*) malloc(nnz * sizeof (int)); //Equal to no. of non-zero elements A = (float*) malloc(nnz * sizeof (float)); //Equal to no. of non-zero elements x = (float*) malloc(ncols * sizeof (float)); //multiplier vector x y = (float*) malloc(nrows * sizeof (float)); //resultant vector //initialise vector x to all ones for(int i=0; i<ncols; i++){ x[i] = (float)(i+1); } //---------------------------------------------------------------------------------------- // Create a random sparse matrix //---------------------------------------------------------------------------------------- // int MAX_ROW_WIDTH = 3; //max no. of non-zeros in a single row sprand(nrows,ncols, nnz, R, C, A, &MAX_ROW_WIDTH); // prepareTestData(nnz, R, C, A); // printMatrix(A, C, R, nrows, ncols); //---------------------------------------------------------------------------------------- // CPU EXECUTION //---------------------------------------------------------------------------------------- //measure time int trials = 100; start =clock(); for(int i=0; i< trials; i++){ //do: y = Ax SpMV(nrows, ncols, R, C, A, x, y); } finish =clock(); double timeA= (double)(finish-start)*1000.0 / CLOCKS_PER_SEC; trials = 200; start =clock(); for(int i=0; i< trials; i++){ //do: y = Ax SpMV(nrows, ncols, R, C, A, x, y); } finish =clock(); double timeB= (double)(finish-start)*1000.0 / CLOCKS_PER_SEC; double runtime = (timeB-timeA)/100; printf("CPU TIME: %.2f milliseconds \n",runtime); //----------------------------------------------------------------------------------------- // Change Max. Stream Width allowed from 8192 to some multiple // of MAX_ROW_WIDTH of the matrix //----------------------------------------------------------------------------------------- int multiple = MAX_STREAM_SIZE / MAX_ROW_WIDTH; if(multiple == 0) multiple =1; // never have it zero MAX_STREAM_SIZE = multiple * MAX_ROW_WIDTH; printf("\nMAX_ROW_WIDTH : %d \n",MAX_ROW_WIDTH); printf("max. 1D stream size: %d \n", MAX_STREAM_SIZE); //----------------------------------------------------------------------------------- // Prepare the Padded GPU Data //----------------------------------------------------------------------------------- // prepare the new Compressed Row data for the streams unsigned int iStreamLen = (MAX_ROW_WIDTH * nrows); unsigned int iXLen = (ncols); unsigned int iResultLen = (nrows); // array data to be copied over to the streams float *newA = (float*) malloc( iStreamLen * sizeof (float)); int *newC = (int*) malloc( iStreamLen * sizeof (int)); // prepare GPU data by padding in zeros for both A, C arrays. prepareGPUData(A, C, R, newA, newC, nnz, nrows, MAX_ROW_WIDTH ); // create a copy of the original x[] vector - // This vector will be passed down to the 2D stream X // and copied data will be lost and replaced. // We need to save x[] for testign later. //float *X_old = (float*) malloc( ncols * sizeof (float)); //for(int i=0; i<ncols; i++){ // X_old[i] = x[i]; //} //----------------------------------------------------------------------------------- // 2D Stream Declarartions //----------------------------------------------------------------------------------- // Stream A //int* dim; //::brook::Stream<float> *AStream; //dim = create2DFloatStream(&AStream, MAX_STREAM_SIZE, &newA /*data*/, iStreamLen/*data length*/); //printf("\nstream A< %d, %d > \n",dim[0], dim[1]); // Stream C //::brook::Stream<int> *CStream; //dim = create2DIntStream(&CStream, MAX_STREAM_SIZE, &newC /*data*/, iStreamLen/*data length*/); //printf("stream C< %d, %d > \n",dim[0], dim[1]); // Stream Temp //::brook::Stream<float> *tempStream; //dim = create2DFloatStream(&tempStream, MAX_STREAM_SIZE, iStreamLen/*stream length*/); //printf("stream Temp< %d, %d > \n",dim[0], dim[1]); //int temp_W = dim[0]; //int temp_H = dim[1]; // Stream X // ::brook::Stream<float> *xStream; // dim = create2DFloatStream(&xStream, MAX_STREAM_SIZE, iStreamLen/*stream length*/); // printf("stream X< %d, %d > \n",dim[0], dim[1]); // float *newX = (float*) malloc( dim[0]*dim[1] * sizeof (float)); // int xLen = dim[0]*dim[1]; //---------------------------------- //Streams A, C , X , Temp //---------------------------------- unsigned int height = iStreamLen /MAX_STREAM_SIZE; if(iStreamLen%MAX_STREAM_SIZE > 0) height++; unsigned int width = MAX_STREAM_SIZE; // 2D-stream declaraion unsigned int dim[2] = {width, height}; ::brook::Stream<float> AStream(2/*rank*/, dim /*dimension*/); ::brook::Stream<int> CStream(2/*rank*/, dim /*dimension*/); ::brook::Stream<float> tempStream(2/*rank*/, dim /*dimension*/); ::brook::Stream<float> xStream(2/*rank*/, dim /*dimension*/); //save dimensions of temp stream unsigned int temp_W = width; unsigned int temp_H = height; // print data printf("\n Stream A<%d, %d> ", dim[0], dim[1]); printf("\n Stream C<%d, %d> ", dim[0], dim[1]); printf("\n Stream Temp<%d, %d> ", dim[0], dim[1]); printf("\n Stream X<%d, %d> ", dim[0], dim[1]); float* A_data = (float*) malloc(width* height* sizeof (float) ); int* C_data = (int*) malloc(width* height* sizeof (int) ); for(int i=0; i<(width* height) ; i++) { if(i< iStreamLen){ A_data[i] = newA[i]; C_data[i] = newC[i]; } else{ A_data[i] = 0; C_data[i] = 0; } } // free the old array data free(newA); free(newC); //---------------------------------- //Streams xOLD - old data //---------------------------------- height = ncols /MAX_STREAM_SIZE; if(ncols%MAX_STREAM_SIZE > 0) height++; width = MAX_STREAM_SIZE; // 2D-stream declaraion unsigned int dimX[2] = {width, height}; ::brook::Stream<float> xOldStream(2/*rank*/, dimX /*dimension*/); printf("\n Stream xOLD<%d, %d> ", dimX[0], dimX[1]); float* x_data = (float*) malloc(width* height* sizeof (float) ); for(int i=0; i<(width* height) ; i++) { if(i< ncols){ x_data[i] = x[i]; } else{ x_data[i] = 0; } } // Stream XOLD - contains the original x-vector as in the CPU code // ::brook::Stream<float> *xOldStream; // dim = create2DFloatStream(&xOldStream, MAX_STREAM_SIZE, &X_old /*data*/, ncols/*data length*/); // printf("stream X_old< %d, %d > \n",dim[0], dim[1]); //---------------------------------- //Stream Y //---------------------------------- int Y_w = temp_W / MAX_ROW_WIDTH; // calc. width of the reduce stream. int Y_h = temp_H; // same as the temp stream's which will be reduced unsigned int dimY[2]={Y_w, Y_h}; ::brook::Stream<float> yStream(2, dimY); //_Y = new ::brook::Stream<float>(2, dimY); printf("\n Stream Y< %d, %d > \n",dimY[0], dimY[1]); // Initialise the Y-Stream to all zeros float *newY = (float*) malloc( Y_w * Y_h * sizeof(float) ); for(int i =0; i<(Y_w*Y_h) ; i++ ){ newY[i] =0; } //---------------------------------------------------------- // GPU EXECUTION //---------------------------------------------------------- // read data into GPU streams AStream.read(A_data); CStream.read(C_data); xOldStream.read(x_data); yStream.read(newY); printf("\nCalling the kernels....",dimY[0], dimY[1]); trials = 100; start =clock(); for(int i=0; i< trials; i++){ // KERNEL: Gather //spmv_gather_2D(*(CStream), *(xOldStream), MAX_STREAM_SIZE ,*(xStream) ); spmv_gather_2D(CStream, xOldStream, MAX_STREAM_SIZE ,xStream ); // KERNEL: Multiplication //spmv_mul_2D( *(AStream), *(xStream), *(tempStream) ); spmv_mul_2D( AStream, xStream, tempStream ); // KERNEL: REDUCE //spmv_reduce_2D( *(tempStream), yStream ); spmv_reduce_2D( tempStream, yStream ); // download data yStream.write(newY); } finish =clock(); timeA= (double)(finish-start)*1000.0 / CLOCKS_PER_SEC; trials = 200; start =clock(); for(int i=0; i< trials; i++){ // KERNEL: Gather //spmv_gather_2D(*(CStream), *(xOldStream), MAX_STREAM_SIZE ,*(xStream) ); spmv_gather_2D(CStream, xOldStream, MAX_STREAM_SIZE ,xStream ); // KERNEL: Multiplication //spmv_mul_2D( *(AStream), *(xStream), *(tempStream) ); spmv_mul_2D( AStream, xStream, tempStream ); // KERNEL: REDUCE //spmv_reduce_2D( *(tempStream), yStream ); spmv_reduce_2D( tempStream, yStream ); // download data yStream.write(newY); } finish =clock(); timeB= (double)(finish-start)*1000.0 / CLOCKS_PER_SEC; runtime = (timeB-timeA)/100; printf("\nGPU run-time: %.2f milliseconds \n",runtime); //---------------------------------------------------------- // Error Checking //---------------------------------------------------------- // ERROR in Reduce Kernel float error = 0; for(int i=0; i<(nrows); i++){ float e = (y[i] - newY[i]); if(e<0) // make it positive e = e* (-1); error += (e); } printf("\nTotal error in Reduce-kernel = %.f \n", error ); // Compare data in stream A with original A // for(int i=0; i< nrows; i++){ // for(int j=R[i], offset=0; j<R[i+1]; j++, offset++){ // int indNewA = i* MAX_ROW_WIDTH + offset; // printf("\n A[%d, %d]=%.f , newA[%d, %d]=%.f \n", i, C[j], A[j], i, offset, newA[indNewA]); // } // } // Do Manual Reduce float *temp = (float*) malloc( temp_W * temp_H * sizeof (float)); tempStream.write(temp); // Manual Reduce for(int i=0; i< nrows; i++){ newY[i] = 0; for(int k=0; k<MAX_ROW_WIDTH; k++){ int indNewA = i* MAX_ROW_WIDTH + k; newY[i] += temp[indNewA]; } } // Error : Manual Reduce error = 0; for(int i=0; i<(nrows); i++){ float e = (y[i] - newY[i]); if(e<0) // make it positive e = e* (-1); error += (e); } printf("\nError in Manual Reduce= %.f \n", error ); /* // Error in Gather Kernel xStream.write(newX); float errGather =0; //error in gather step for(int i=0; i<xLen; i++){ //printf("X[ %d ]=%.2f \n",i,newX[i]-x[newC[i]]); errGather += (newX[i]-x[newC[i]]); } printf("\nError in gather step=%.2f \n",errGather); // Error in Multiplication Kernel error = 0; for(int i=0; i< nrows; i++){ for(int j=R[i], offset=0; j<R[i+1]; j++, offset++){ int indNewA = i* MAX_ROW_WIDTH + offset; //printf("\n A[%d, %d]=%.f , newA[%d, %d]=%.f \n", i, C[j], A[j], i, offset, newA[indNewA]); //printf("\n temp[%d, %d]=%.f , newA*newX[%d, %d]=%.f \n", i, C[j], A[j]*x[C[j]], i, offset, newA[indNewA]*newX[indNewA]); //printf("\n temp[%d, %d]=%.f , temp_stream[%d, %d]=%.f \n", i, C[j], A[j]*x[C[j]], i, offset, temp[indNewA]); //printf("\n temp_error[%d, %d]=%.f \n", i, C[j], ( (A[j]*x[C[j]]) - temp[indNewA]) ); error += ( (A[j]*x[C[j]]) - temp[indNewA] ); } } free(temp); printf("\nTotal error in the Mul-kernel = %.f \n", error ); */ if(yStream.error()) printf("Y.errorLog()= %s \n", yStream.errorLog() ); //---------------------------------------------------------- // deallocate memory //---------------------------------------------------------- free(A_data); free(C_data); free(newY); free(y); free(x); free(R); free(C); free(A); return 0; } //////////////////////////////////////////////////////////////////////////////// //! //! \brief Create a 2D float-stream if the required stream-size exceeds 8192 //! but still less than 8192*8192 = 67108864 //! But we will pass the max size to vary between 0 and 8192. //////////////////////////////////////////////////////////////////////////////// int* create2DFloatStream(::brook::Stream<float>** stream, // stream handle to return int MAX_STREAM_SIZE, // max stream size allowed int streamLen) // minimum length of desired stream { // calculate the width, height of the new stream. unsigned int height = streamLen/MAX_STREAM_SIZE; if(streamLen%MAX_STREAM_SIZE > 0) height ++; unsigned int width = MAX_STREAM_SIZE; // unsigned int width = ceil(sqrt(streamLen)); // unsigned int height = width; unsigned int dim[2] = {width, height}; // create the new brook stream now (*stream) = new ::brook::Stream<float>(2/*rank*/, dim/*dimension*/ ); // return the dimensions of the created stream int *dimension = new int[2]; dimension[0] = dim[0]; dimension[1] = dim[1]; return dimension; } //----------------------------------------------------------------------------------------------- int* create2DIntStream(::brook::Stream<int>** stream, // stream handle to return int MAX_STREAM_SIZE, // max stream size allowed int streamLen) // minimum length of desired stream { // calculate the width, height of the new stream. // unsigned int width = ceil(sqrt(streamLen)); // unsigned int height = width; unsigned int height = streamLen/MAX_STREAM_SIZE; if(streamLen%MAX_STREAM_SIZE > 0) height ++; unsigned int width = MAX_STREAM_SIZE; unsigned int dim[2] = {width, height}; // create the new brook stream now (*stream) = new ::brook::Stream<int>(2/*rank*/, dim/*dimension*/ ); // return the dimensions of the created stream int *dimension = new int[2]; dimension[0] = dim[0]; dimension[1] = dim[1]; return dimension; } //////////////////////////////////////////////////////////////////////////////// //! //! \brief Create a 2D float-stream if the required stream-size exceeds 8192 //! but still less than 8192*8192 = 67108864 //! But we will pass the max size to vary between 0 and 8192. //! The actual array data to be copied might not be a multiple of 8192. //! Hence we have to pad Zeros in the last row of the 2D stream. //! This function allows that automatically. //////////////////////////////////////////////////////////////////////////////// int* create2DFloatStream(::brook::Stream<float>** stream, // stream handle to return int MAX_STREAM_SIZE, // max stream size allowed float** data, // data to copy int dataLen) // data length { // make the width, height of the 2D stream to be equal so that the stream is square. // unsigned int width = ceil(sqrt(dataLen)); // unsigned int height = width; // calculate the width, height of the new stream. unsigned int height = dataLen/MAX_STREAM_SIZE; if(dataLen%MAX_STREAM_SIZE > 0) height ++; unsigned int width = MAX_STREAM_SIZE; unsigned int dim[2] = {width, height}; // create the new stream data float* newData = (float*) malloc( width*height*sizeof(float) ); // copy data into new-data array, pad with zero in the last row float *ptr = *(data); for(int i=0; i<(width*height); i++){ if( i < dataLen ){ newData[i] = ptr[i]; } else newData[i] = 0; // pad Zero } free(ptr); // delete old data *(data) = newData; // set new data as the reference // create the new brook stream now (*stream) = new ::brook::Stream<float>(2/*rank*/, dim/*dimension*/ ); // read the data into the stream (*stream)->read(newData); // return the dimensions of the created stream int *dimension = new int[2]; dimension[0] = dim[0]; dimension[1] = dim[1]; return dimension; } //////////////////////////////////////////////////////////////////////////////// //! //! \brief Create a 2D int-stream if the required stream-size exceeds 8192 //! but still less than 8192*8192 = 67108864 //! But we will pass the max size to vary between 0 and 8192. //! The actual array data to be copied might not be a multiple of 8192. //! Hence we have to pad Zeros in the last row of the 2D stream. //! This function allows that automatically. //////////////////////////////////////////////////////////////////////////////// int* create2DIntStream(::brook::Stream<int>** stream, // stream handle to return int MAX_STREAM_SIZE, // max stream size allowed int** data, // data to copy int dataLen) // data length { // calculate the width, height of the new stream. unsigned int height = dataLen/MAX_STREAM_SIZE; if(dataLen%MAX_STREAM_SIZE > 0) height ++; unsigned int width = MAX_STREAM_SIZE; unsigned int dim[2] = {width, height}; // create the new stream data int* newData = (int*) malloc( width*height*sizeof(int) ); // copy data into new-data array, pad with zero in the last row int * ptr = *(data); for(int i=0; i<(width*height); i++){ if( i < dataLen ){ newData[i] = ptr[i]; } else newData[i] = 0; // pad Zero } free(ptr); // delete old data *(data) = newData; // set new data as the reference // create the new brook stream now (*stream) = new ::brook::Stream<int>(2/*rank*/, dim/*dimension*/ ); // read the data into the stream (*stream)->read(newData); // return the dimensions of the created stream int *dimension = new int[2]; dimension[0] = dim[0]; dimension[1] = dim[1]; return dimension; } //////////////////////////////////////////////////////////////////////////////// //! //! \brief Prepare data to be copied to the GPU streams. //! \param A array containing all non zeros of matrix A, as in CSR format //! \param C array containing the column indexes in compressed row format //! \param R array containing the row indexes in compressed row format //! \param newA new array A, with size adjusted for streams //! \param newC new array C, with size adjusted for streams //! \param nNonZeros total no. of non-zeros in A //! \param N no. of rows in A //! \param nMaxWidth max no. of non-zeros present in any single row of A //////////////////////////////////////////////////////////////////////////////// void prepareGPUData(float* A, int* C, int* R, float* newA, int* newC, int nNonZeros, int N, int nMaxWidth ){ for( int i=0; i< N; i++){ int offset =0; for(int j=R[i]; j<R[i+1]; j++){ newA[nMaxWidth*i+offset] = A[j]; newC[nMaxWidth*i+offset] = C[j]; offset++; } // pad the rest of the area with zeros while(offset < nMaxWidth){ newA[nMaxWidth*i+offset] = 0.0f; newC[nMaxWidth*i+offset] = 0; offset++; } //newR[i]= i*nMaxWidth; } } //////////////////////////////////////////////////////////////////////////////// //! //! \brief Implement sparse y = Ax //! //////////////////////////////////////////////////////////////////////////////// void SpMV(int nrows, int ncols, int *R, int *C, float *A, float *x, float *y){ /* Slower version for(int i = 0; i<nrows; i++){ for(int j = R[i]; j<R[i+1]; j++){ y[i]+= ( A[j] * x[ C[j] ]); } } */ //improved 1 float temp = 0; for(int i = 0; i<nrows; i++){ temp = 0; for(int j = R[i]; j<R[i+1]; j++){ temp += ( A[j] * x[ C[j] ]); } y[i] = temp; } } //print the sparse matrix void printSpM(int N, int *R, int *C, float *A){ for(int i = 0; i<N; i++) for(int j = R[i]; j<R[i+1]; j++){ printf("Element (%d,%d) has the value %f\n",i,C[j],A[j]); } } //Create a random sparse matrix void sprand(int nrows,int ncols,int nnz, int R[],int C[],float A[], int* maxWidth) { int i,j,k,n; double r; n = nrows*ncols; k = 0; int counter = 1; // to count the max no. of non-zeros in a single row. int iMaxWidth = 0; int iRowWidth = 0; for(i = 0; i<nrows; i++) { R[i] = k; iRowWidth = 0; for(j = 0; j<ncols; j++) { r = rand() / (double) RAND_MAX; if(r*(n-(i*ncols+j)) < (nnz-k)) { C[k] = j; A[k] = counter++; /* You can also put a random number as data */ k = k+1; // save the max width of non-zeros in a single row. iRowWidth++; if(iRowWidth > iMaxWidth) iMaxWidth = iRowWidth; } } } R[nrows] = k; *(maxWidth) = iMaxWidth; } void saveToFile(int nrows, int*R, int*C, float*A, float*y){ //Save the sparse matrix to a file to be read by Matlab FILE *file; file = fopen("c:\\matrix.txt","w"); //w = write, a+ = append for(int i=0; i<nrows; i++){ for(int j=R[i]; j<R[i+1]; j++){ //%_f for double precision //%f for single-precision fprintf(file,"%d %d %f\n", i+1, C[j]+1, A[j]);//format:{row# col# value} } } fclose(file); /*done!*/ FILE *filey; //save the vector y to file filey = fopen("c:\\y.txt","w"); //w = write, a+ = append for(int i=0; i<nrows; i++){ fprintf(filey,"%f\n", y[i]);//format:{row# col# value} } fclose(filey); /*done!*/ } void outputGPUData(float* A, int* C, int*R, float *x, float* newA, int* newC, float *newX, int N, int nMaxWidth ){ for( int i=0; i< N; i++){ int offset =0; for(int j=R[i]; j<R[i+1]; j++){ printf("newA[%d]=%f , A[%d]=%f \n",(nMaxWidth*i+offset), newA[nMaxWidth*i+offset],j, A[j] ); printf("newC[%d]=%d , C[%d]=%d \n",(nMaxWidth*i+offset), newC[nMaxWidth*i+offset],j, C[j] ); printf("newX[%d]=%f , x[%d]=%f \n",(nMaxWidth*i+offset), newX[(nMaxWidth*i+offset)],C[j], x[C[j]] ); offset++; } // pad the rest of the area with zeros while(offset < nMaxWidth){ printf("newA[%d]=%f \n",(nMaxWidth*i+offset), newA[nMaxWidth*i+offset] ); printf("newC[%d]=%d \n",(nMaxWidth*i+offset), newC[nMaxWidth*i+offset] ); printf("newX[%d]=%f \n",(nMaxWidth*i+offset), newX[(nMaxWidth*i+offset)] ); offset++; } printf("\n"); } } void printMatrix(float* A, int* C, int*R, int N, int M){ for( int i=0; i< N; i++){ for(int j=0; j<M; j++){ bool flag = true; for(int k = R[i]; k<R[i+1]; k++){ if(j == C[k]){ printf("%.0f ", A[k] ); flag =false; break; } } if(flag) printf("%d ", 0 ); } printf("\n"); } } void prepareTestData(int nnz, int *R, int * C, float* A){ for(int i=0; i<nnz; i++){ A[i] = (i+1); } C[0]=1; C[1]=3; C[2]=0; C[3]=1; C[4]=3; C[5]=1; C[6]=2; C[7]=3; C[8]=2; C[9]=3; C[10]=4; R[0]=0; R[1]=2; R[2]=5; R[3]=8; R[4]=10; R[5]=11; }

Outcomes