cancel
Showing results for 
Search instead for 
Did you mean: 

Archives Discussions

pinzo
Journeyman III

kernel submit time is too long

ok,

thanks

#define CALC_THREAD 256 #define PM_X_THREAD 4 #define H_MIN 0 //cm #define H_MAX 1024 //cm #define NUM_BINS 16 #define BINS_RIDOTTI NUM_BINS/4 #define BIN_WIDTH (H_MAX-H_MIN)/NUM_BINS #define NUM_HITS_THR 7 #define PRIMO_PUNTO_X -30 #define PRIMO_PUNTO_Y -30 #define PASSI_X 5 #define PASSI_Y 5 #define PASSO (2*(-PRIMO_PUNTO_X)/PASSI_X)+1 #define LIMITE 170 #pragma OPENCL EXTENSION cl_amd_printf : enable #pragma OPENCL EXTENSION cl_amd_fp64 : enable __kernel void domh(__global float *hits_d,__global int3 *candy_d,__global int *nn_evt_d) //hits_d era un float2 { int nnevt = nn_evt_d[get_group_id(0)]; __local int histos[256]; __local int hh_step2[256]; __local int hh_step3[64]; __local int hh_step4[8]; __local int pre_histox[BINS_RIDOTTI][256]; __local float2 hits_s[32]; //era 2 anche in domh() int indice=0; if(get_local_id(0) < 32) { hits_s[get_local_id(0)].x = -28.8 + 0.9*((int) hits_d[(get_group_id(0)<<5)+get_local_id(0)] % 65); hits_s[get_local_id(0)].y = -28.0592 + 1.55885*((int)hits_d[(get_group_id(0)<<5)+get_local_id(0)] % 2) +3.1177*(floor(native_divide(hits_d[(get_group_id(0)<<5)+get_local_id(0)],65))); //printf("hits_s[%ld].x=%f\n",get_local_id(0),hits_s[get_local_id(0)].x); //printf("hits_s[%ld].y=%f\n",get_local_id(0),hits_s[get_local_id(0)].y); } //inizializza shared memory histos[2 * get_local_id(0)] = 0; histos[(2 * get_local_id(0))+1] = 0; barrier(CLK_LOCAL_MEM_FENCE); //corpo centrale float primo_punto_x = PRIMO_PUNTO_X; float primo_punto_y = PRIMO_PUNTO_Y; float passi_x = PASSI_X; float passi_y = PASSI_Y; int passo = PASSO; int Idx_centerpmx1 = get_local_id(0); int Idx_centerpmx2 = get_local_id(0)+256; int Idx_centerpmx3 = get_local_id(0)+512; int Idx_centerpmx4 = get_local_id(0)+768; float xcenterx1 = primo_punto_x + passi_x * (Idx_centerpmx1 % passo); float ycenterx1 = primo_punto_y + passi_y * floor(Idx_centerpmx1/passo); float xcenterx2 = primo_punto_x + passi_x * (Idx_centerpmx2 % passo); float ycenterx2 = primo_punto_y + passi_y * floor(Idx_centerpmx2/passo); float xcenterx3 = primo_punto_x + passi_x * (Idx_centerpmx3 % passo); float ycenterx3 = primo_punto_y + passi_y * floor(Idx_centerpmx3/passo); float xcenterx4 = primo_punto_x + passi_x * (Idx_centerpmx4 % passo); float ycenterx4 = primo_punto_y + passi_y * floor(Idx_centerpmx4/passo); //printf("x1 = %f y1 = %f x2 = %f y2 = %f x3 = %f y3 = %f\n",xcenterx1, ycenterx1,xcenterx2,ycenterx2,xcenterx3, ycenterx3 ); #pragma unroll for (int i=0;i<BINS_RIDOTTI;i++) { pre_histox[get_local_id(0)]=0; } int maxhx1 = 0; int maxnx1 = 0; if(get_local_id(0)<LIMITE) { // primo istogramma #pragma unroll for (int idx=0;idx<nnevt;idx++) { float xhit = hits_s[idx].x; float yhit = hits_s[idx].y; float disx = (xhit-xcenterx1)*(xhit-xcenterx1)+(yhit-ycenterx1)*(yhit-ycenterx1); int ndisx = floor(native_divide(disx,BIN_WIDTH)); indice = floor(native_divide(ndisx,4)); pre_histox[indice][get_local_id(0)] += (ndisx%4 == 0 && ndisx<NUM_BINS) ? ((1<<24) & 0XFF000000) : 0; pre_histox[indice][get_local_id(0)] += (ndisx%4 == 1 && ndisx<NUM_BINS) ? ((1<<16) & 0X00FF0000) : 0; pre_histox[indice][get_local_id(0)] += (ndisx%4 == 2 && ndisx<NUM_BINS) ? ((1<<8) & 0X0000FF00) : 0; pre_histox[indice][get_local_id(0)] += (ndisx%4 == 3 && ndisx<NUM_BINS) ? (1 & 0X000000FF) : 0; }//creo il primo istogramma #pragma unroll for (int i=0;i<BINS_RIDOTTI;i++) { maxhx1 = (((pre_histox[get_local_id(0)] & 0XFF000000)>>24)>maxhx1) ? (pre_histox[get_local_id(0)] & 0XFF000000)>>24 : maxhx1; maxnx1 = (((pre_histox[get_local_id(0)] & 0XFF000000)>>24)>maxhx1) ? 4*i : maxnx1; maxhx1 = (((pre_histox[get_local_id(0)] & 0X00FF0000)>>16)>maxhx1) ? (pre_histox[get_local_id(0)] & 0X00FF0000)>>16 : maxhx1; maxnx1 = (((pre_histox[get_local_id(0)] & 0X00FF0000)>>16)>maxhx1) ? 4*i+1 : maxnx1; maxhx1 = (((pre_histox[get_local_id(0)] & 0X0000FF00)>>8)>maxhx1) ? (pre_histox[get_local_id(0)] & 0X0000FF00)>>8 : maxhx1; maxnx1 = (((pre_histox[get_local_id(0)] & 0X0000FF00)>>8)>maxhx1) ? 4*i+2 : maxnx1; maxhx1 = ((pre_histox[get_local_id(0)] & 0X000000FF)>maxhx1) ? pre_histox[get_local_id(0)] & 0X000000FF : maxhx1; maxnx1 = ((pre_histox[get_local_id(0)] & 0X000000FF)>maxhx1) ? 4*i+3 : maxnx1; } //ho trovato la migliore distanza del primo istogramma } histos[get_local_id(0)] =((maxnx1<<24) & 0XFF000000) | ((maxhx1<<16) & 0X00FF0000); //ho messo la migliore distanza di ogni PMT con il numero di conteggi nel vettore histos[], due per ogni indice di histos, i bytes maggiori contengono i primi 512 PMT i bytes minori contengono i successivi 512 PMT barrier(CLK_LOCAL_MEM_FENCE); // in questo step uso 256 work-item che confrontano i 1024 histogrammi 4 l'uno int hh_cont_max = 0; int hh_addr_max = 0; int hh_cont =0; hh_cont = (histos[get_local_id(0)] & 0x00FF0000)>>16; hh_cont_max = hh_cont; hh_addr_max = get_local_id(0); hh_step2[get_local_id(0)] = hh_addr_max; barrier(CLK_LOCAL_MEM_FENCE); if(get_local_id(0)<64) { hh_addr_max = 0; hh_cont_max = 0; int idx_step3_1 = hh_step2[get_local_id(0)]; int idx_step3_2 = hh_step2[get_local_id(0) + 64]; int idx_step3_3 = hh_step2[get_local_id(0) + 128]; int idx_step3_4 = hh_step2[get_local_id(0) + 192]; int hh_cont3 = (histos[idx_step3_1] & 0x00FF0000)>>16; if (hh_cont3 > hh_cont_max) { hh_cont_max = hh_cont3; hh_addr_max = idx_step3_1; } hh_cont3 = (histos[idx_step3_2] & 0x00FF0000)>>16; if (hh_cont3 > hh_cont_max) { hh_cont_max = hh_cont3; hh_addr_max = idx_step3_2; } hh_cont3 = (histos[idx_step3_3] & 0x00FF0000)>>16; if (hh_cont3 > hh_cont_max) { hh_cont_max = hh_cont3; hh_addr_max = idx_step3_3; } hh_cont3 = (histos[idx_step3_4] & 0x00FF0000)>>16; if (hh_cont3 > hh_cont_max) { hh_cont_max = hh_cont3; hh_addr_max = idx_step3_4; } hh_step3[get_local_id(0)] = hh_addr_max; } barrier(CLK_LOCAL_MEM_FENCE); if(get_local_id(0)>=64 && get_local_id(0)<72) { hh_addr_max = 0; hh_cont_max = 0; for(int l=0; l<2; l++) //potrei srotolarlo in tutti ed 8 invece che in 4, pero con consumo maggiore di registri { // int idx_step4_1 = hh_step3[4*(get_local_id(0)-64) + 32*l]; // int idx_step4_2 = hh_step3[4*(get_local_id(0)-64) + 32*l+1]; // int idx_step4_3 = hh_step3[4*(get_local_id(0)-64) + 32*l+2]; // int idx_step4_4 = hh_step3[4*(get_local_id(0)-64) + 32*l+3]; int idx_step4_1 = hh_step3[(get_local_id(0)-64) + 32*l]; int idx_step4_2 = hh_step3[(get_local_id(0)-64) + 32*l+8]; int idx_step4_3 = hh_step3[(get_local_id(0)-64) + 32*l+16]; int idx_step4_4 = hh_step3[(get_local_id(0)-64) + 32*l+24]; int hh_cont4 = (histos[idx_step4_1] & 0x00FF0000)>>16; if (hh_cont4 > hh_cont_max) { hh_cont_max = hh_cont4; hh_addr_max = idx_step4_1; } hh_cont4 = (histos[idx_step4_2] & 0x00FF0000)>>16; if (hh_cont4 > hh_cont_max) { hh_cont_max = hh_cont4; hh_addr_max = idx_step4_2; } hh_cont4 = (histos[idx_step4_3] & 0x00FF0000)>>16; if (hh_cont4 > hh_cont_max) { hh_cont_max = hh_cont4; hh_addr_max = idx_step4_3; } hh_cont4 = (histos[idx_step4_4] & 0x00FF0000)>>16; if (hh_cont4 > hh_cont_max) { hh_cont_max = hh_cont4; hh_addr_max = idx_step4_4; } } hh_step4[get_local_id(0)-64] = hh_addr_max; } barrier(CLK_LOCAL_MEM_FENCE); if(get_local_id(0) == 72) { hh_addr_max = 0; hh_cont_max = 0; #pragma unroll for (int i=0;i<8;i+=4) { int idx_step5_1 = hh_step4; int idx_step5_2 = hh_step4[i+1]; int idx_step5_3 = hh_step4[i+2]; int idx_step5_4 = hh_step4[i+3]; int hh_cont5 = (histos[idx_step5_1] & 0x00FF0000)>>16; if (hh_cont5 > hh_cont_max) { hh_cont_max = hh_cont5; hh_addr_max = idx_step5_1; } hh_cont5 = (histos[idx_step5_2] & 0x00FF0000)>>16; if (hh_cont5 > hh_cont_max) { hh_cont_max = hh_cont5; hh_addr_max = idx_step5_2; } hh_cont5 = (histos[idx_step5_3] & 0x00FF0000)>>16; if (hh_cont5 > hh_cont_max) { hh_cont_max = hh_cont5; hh_addr_max = idx_step5_3; } hh_cont5 = (histos[idx_step5_4] & 0x00FF0000)>>16; if (hh_cont5 > hh_cont_max) { hh_cont_max = hh_cont5; hh_addr_max = idx_step5_4; } } candy_d[get_group_id(0)].x = (histos[hh_addr_max] & 0x00FF0000)>>16; candy_d[get_group_id(0)].y= hh_addr_max; candy_d[get_group_id(0)].z = (histos[hh_addr_max] & 0xFF000000)>>24; //printf("candy_d[%d].y(center)=%d\n",get_group_id(0),candy_d[get_group_id(0)].y); //printf("candy_d[%d].x(max)=%d\n",get_group_id(0),candy_d[get_group_id(0)].x); //printf("candy_d[%d].z(radius)=%d\n",get_group_id(0),candy_d[get_group_id(0)].z); } barrier(CLK_LOCAL_MEM_FENCE); return; }

0 Likes
pinzo
Journeyman III

kernel submit time is too long

with the new APP SDK version even the EnqueueReadBuffer and EnqueueWriteBuffer time have increased of about 10 times. why?

0 Likes
maximmoroz
Journeyman III

kernel submit time is too long

The problem is that building your kernel crashes with Catalyst 11.6, which I am unable to uninstall.

Still, some points and suggestions:

1) make parameters hits_d and nn_evt_d 'const' as far as you don't modify the buffers (and add 'restrict' to enable caching):

__kernel void domh(const __global float * restrict hits_d,__global int3 *candy_d, const __global int * restrict nn_evt_d)

2) Possible out-of-range buffer access:

 histos[2 * get_local_id(0)] = 0;
 histos[(2 * get_local_id(0))+1] = 0;

3) If you don't modify private variable make it const. For example:

const int passo = PASSO;

This will enable the compiler to treat 'passo' as constant and optimize subsequent / and % operations.

4) Use mul24 and mad24 whenever possible.

float xcenterx4 = primo_punto_x + passi_x * (Idx_centerpmx4 % passo);

to be replaced (if possible) with:

float xcenterx4 = mad24(passi_x, Idx_centerpmx4 % passo, primo_punto_x);

5) While native_divide is fast it is not as fast as multiplication:

native_divide(disx,BIN_WIDTH)

to be replcaed with:

disx * (1.0 / BIN_WIDTH)

The compilator will do (1.0 / BIN_WIDTH) itself.

The same with native_divide(ndisx,4)

6) The whole cycle prefixed with comment "// primo istogramma". Each work-item updates just "NUM_BINS/4" values in pre_histox array. I am sure it is much faster to update private array of size "NUM_BINS/4", initialized with 0, and then update corresponding indices in the local array.

Hey, you might even write these values to local array after you processed them in the next cycle, which ends with "//ho trovato la migliore distanza del primo istogramma".

7) Then comes some reduction code which I don't understand, sorry. But it looks very suspicous as there is a lot of code which works for several or even just single work item.

😎 Still I suggest you to replace all small 'if's with predicates:

For example, the following code:

   if (hh_cont4 > hh_cont_max)
   {
    hh_cont_max = hh_cont4;
    hh_addr_max = idx_step4_2;
   }

better replace it with:

const bool b4 = hh_cont4 > hh_cont_max;
hh_cont_max = b4 ? hh_cont4 : hh_cont_max;
hh_addr_max = b4 ? idx_step4_2 : hh_addr_max;

Thus you will have less operations and, what is more important, clauses.

9) Write to candy_d: why are you doing it 3 times?

candy_d[get_group_id(0)] = (int3){(histos[hh_addr_max] & 0x00FF0000)>>16, hh_addr_max, (histos[hh_addr_max] & 0xFF000000)>>24};

10) Final barrier(CLK_LOCAL_MEM_FENCE) is useless. It just slows your kernel

And I have a question: What is global_work_size you run your kernel for? And the device?

I also suggest you to read AMD APP OpenCL Programming Guide. It is clear and very useful document.

0 Likes
pinzo
Journeyman III

kernel submit time is too long

thank you I'll try all your suggestions.

the global_work_size is 256 * #events (in this case 256*1000=256000)

and my device is a hd5970

however I don't understand why new APP SDK and/or new driver change in bad the performance of this kernel of a 5 factor and the memory write and read by a 10 factor, do you understand?

0 Likes
maximmoroz
Journeyman III

kernel submit time is too long

The global work size is fine.

No, I have no idea how did it happen.

0 Likes
himanshu_gautam
Grandmaster

kernel submit time is too long

Hi Pinzo,

Can you post a test case.

I would suggest you to use your own timers rather than profiling events as commands in a commandqueue are submitted in bunches to reduce submission time. All commands submitted together have same values for all event profiling timestamps, which can lead to misinterpretation of results.

So I suggest you to execute the kernel inside a loop and find average time.

Use profiler to find per command times.

0 Likes
pinzo
Journeyman III

kernel submit time is too long

what do you mean with test case?

thanks

0 Likes