
Problem with kernel Attribute or domainSize

Discussion created by slice4e on Aug 29, 2009
Latest reply on Aug 31, 2009 by Ceq

I have written an implementation of a reduction in Brook+ 1.4 that compares two matrices A and B and outputs whether they are the same or not.

My code works as follows:

1) One thread scans down one column of A and B and compares the elements.

2) Using an Attribute, I define a thread group of 64 threads. In the second step, after those 64 threads have scanned their columns, they communicate through the local data store (LDS) and reduce their results to a single output.

 

3) I have enhanced this implementation by creating more threads, so that each thread scans only part of the way down the column. To do this, I use kernel.domainSize( ... ). I also use float4, so each thread essentially scans 4 columns. A plain C++ sketch of the intended computation follows below.

 

My Problem:

My code works perfectly fine on the first iteration. However, if I run it in a loop for several iterations, it fails to produce the correct results. After some debugging, it seems to me that the second time around my threads are not created properly (the thread IDs they print make no sense on the second iteration). I think this has something to do with Attribute or Domain, and I have not been able to figure it out for 2 days now. It also seems that my output stream has to be the same size as my domain for my code to work; in other words, I think that domain does not work. The setup I mean is sketched below.

 

I have attached my code. Any help will be appreciated.

 

 


 

 

Attribute[GroupSize(64, 1, 1)]
kernel void compare_mats_float4(int group_size, int elem_per_column,
                                float4 a[][], float4 b[][], out float4 c[][])
{
    shared float4 lds[64]; // defines the amount of data per thread as 1 float4 (64/64)
    int index_y = instance().y;
    int index_x = instance().x;
    int i;
    float4 error = float4(0.0f, 0.0f, 0.0f, 0.0f);

    // Each thread scans elem_per_column elements, down 4 columns (because float4)
    for (i = 0; i < elem_per_column; i = i + 1) {
        // accumulate the error for the 4 columns
        error = error + (a[index_y * elem_per_column + i][index_x]
                       - b[index_y * elem_per_column + i][index_x]);
    }

    // Each thread writes to LDS if there was an error in its columns.
    lds[1 * instanceInGroup().x + 0] = error;
    syncGroup();

    // Since I know the size of my thread group is 64, I am going to completely
    // unroll this. Perform the reduction using shared memory.
    if (instanceInGroup().x < 32) {
        lds[1 * instanceInGroup().x + 0] = lds[1 * instanceInGroup().x + 0] + lds[1 * instanceInGroup().x + 32];
    }
    syncGroup();
    if (instanceInGroup().x < 16) {
        lds[1 * instanceInGroup().x + 0] = lds[1 * instanceInGroup().x + 0] + lds[1 * instanceInGroup().x + 16];
    }
    syncGroup();
    if (instanceInGroup().x < 8) {
        lds[1 * instanceInGroup().x + 0] = lds[1 * instanceInGroup().x + 0] + lds[1 * instanceInGroup().x + 8];
    }
    syncGroup();
    if (instanceInGroup().x < 4) {
        lds[1 * instanceInGroup().x + 0] = lds[1 * instanceInGroup().x + 0] + lds[1 * instanceInGroup().x + 4];
    }
    syncGroup();
    if (instanceInGroup().x < 2) {
        lds[1 * instanceInGroup().x + 0] = lds[1 * instanceInGroup().x + 0] + lds[1 * instanceInGroup().x + 2];
    }
    syncGroup();

    // The last thread writes the result of the thread group to global memory.
    if (instanceInGroup().x == 0) {
        c[index_y][index_x / group_size] = lds[1 * instanceInGroup().x + 0] + lds[1 * instanceInGroup().x + 1];
    }
    //c[index_y][index_x] = float4((float)index_x, (float)index_y, 0.0f, 0.0f);
}

// Project headers
#include "brookgenfiles/lds.h"
#include "brook/brook.h"
#include "math.h"
#include "Timer.h"
#include <stdio.h>   // printf
#include <malloc.h>  // _aligned_malloc / _aligned_free

#define WIDTH           128  // 4096/4 - account for the float4
#define HEIGHT          512
#define COMPONENTS      4
#define GROUP_SIZE      64
#define ELEM_PER_COLUMN 256  // Number of (float4) elements that each thread will scan down the column

int main()
{
    CPerfCounter* timer_CPU = new CPerfCounter();
    CPerfCounter* timer_GPU = new CPerfCounter();
    CPerfCounter* timer_mem = new CPerfCounter();
    double time_GPU = 0.0;
    double time_CPU = 0.0;
    double time_mem = 0.0;
    float sum_GPU = 0.0f;
    float sum_CPU = 0.0f;
    int iterations = 2;
    unsigned int width = WIDTH;
    unsigned int height = HEIGHT;
    unsigned int group_size = GROUP_SIZE;
    unsigned int width_reduced = width;                     // width of the reduced matrix, at least 64
    unsigned int height_reduced = HEIGHT / ELEM_PER_COLUMN; // height of the reduced matrix
    unsigned int elem_per_column = ELEM_PER_COLUMN;
    unsigned int actual_width_reduced = (WIDTH / GROUP_SIZE) * COMPONENTS; // reduced width in terms of individual elements
    unsigned int i = 0, j = 0;
    unsigned int streamSize[]  = {width, height};
    unsigned int streamSize2[] = {width_reduced, height_reduced};

    // Stream allocation
    Stream<float4> streamA(2, streamSize);
    Stream<float4> streamB(2, streamSize);
    Stream<float4> streamO(2, streamSize2); // same size as my kernel domain

    // Declaration of input & output buffers
    float *A = NULL;
    float *B = NULL;
    float *O = NULL;

    // Pinned memory allocation for the input and output buffers
    A = (float*)_aligned_malloc(height * width * sizeof(float) * COMPONENTS, 256);
    B = (float*)_aligned_malloc(height * width * sizeof(float) * COMPONENTS, 256);
    O = (float*)_aligned_malloc(width_reduced * height_reduced * sizeof(float) * COMPONENTS, 256);

    // Assign values to the input buffers
    for (i = 0; i < (height * width * COMPONENTS); ++i) {
        A[i] = (float)0;
        B[i] = (float)0;
    }

    // Introduce some differences between A and B, to verify that we catch them.
    B[257] = (float)1;
    B[32]  = (float)1;
    B[570] = (float)1;

    // Do several iterations
    for (unsigned int x = 0; x < iterations; x++) {
        timer_CPU->Reset();
        timer_GPU->Reset();
        timer_mem->Reset();
        sum_GPU = 0.0f;
        sum_CPU = 0.0f;

        timer_GPU->Start();

        // Pinned memory read
        streamA.read(A, "nocopy");
        streamB.read(B, "nocopy");

        // Each thread will scan down the column (4 columns since float4).
        // Each thread will scan down ELEM_PER_COLUMN elements.
        compare_mats_float4.domainOffset(uint4(0, 0, 0, 0));
        compare_mats_float4.domainSize(uint4(width, height_reduced, 1, 1));
        compare_mats_float4(group_size, elem_per_column, streamA, streamB, streamO);

        if (streamO.error()) {
            printf("Error : %s", streamO.errorLog());
        }

        streamO.write(O, "nocopy");

        // Finish the reduction on the CPU
        for (i = 0; i < height_reduced; ++i) {
            for (j = 0; j < actual_width_reduced; ++j) {
            //for (j = 0; j < width_reduced * 4; ++j) {
                sum_GPU += O[i * width_reduced * 4 + j];
                if ((j > 1) && (j % 64 == 0)) printf("\n");
                printf("%0.0f ", O[i * width_reduced * 4 + j]);
            }
            printf("\n-------------------------\n");
        }
        printf("\n-------------------------\n");

        timer_GPU->Stop();

        // MEMORY TRANSFER ONLY
        timer_mem->Start();
        streamA.read(A, "nocopy");
        streamB.read(B, "nocopy");
        streamA.finish();
        streamB.finish();
        streamO.write(O, "nocopy");
        streamO.finish();
        timer_mem->Stop();

        // Do the reduction on the CPU
        timer_CPU->Start();
        for (i = 0; i < (height * width * COMPONENTS); ++i) {
            sum_CPU += A[i] - B[i];
        }
        timer_CPU->Stop();

        time_GPU += timer_GPU->GetElapsedTime();
        time_CPU += timer_CPU->GetElapsedTime();
        time_mem += timer_mem->GetElapsedTime();

        printf("sum_GPU: %0.2f\n", sum_GPU);
        printf("sum_CPU: %0.2f\n", sum_CPU);
        printf("\n");
    } // END iterations

    printf("avg_time GPU: %lf avg_mem_time: %lf\n", time_GPU / iterations, time_mem / iterations);
    printf("avg_time CPU: %lf\n", time_CPU / iterations);
    printf("\n");

    // Cleaning up
    _aligned_free(A);
    _aligned_free(B);
    _aligned_free(O);
}
