12 Replies Latest reply on Sep 10, 2010 2:58 PM by Raistmer

    Unexpectedly huge kernel size increase

    Raistmer
      kernel listed

      Hello
      I am trying to speed up a big kernel by avoiding some global memory accesses.
      For that I added a few more registers (a small amount) and a few more branches (~2).
      Also, 1 new function was added to the kernel.

      But this slowed the kernel down a lot instead of speeding it up.
      SKA shows a very large kernel size increase for the "optimised" variant.

      Why is the increase so big here? Any thoughts on how it can be avoided?

      EDIT: The assembly doesn't all fit into the code window. Please try to cut-n-paste the kernel into SKA and compile with #if 0 first and then with #if 1 (to enable sumtop2_2 usage)
      Code:

      float4 sumtop2(__global float4 *tab, __global float4* dest, int di, int fft_len4, int tmp0) { //R: this function working with 4 different frequency bins at once as all other pulse finding functions do float4 sum, tmax; int i; tmax = 0.0f; __global float4 *one = tab; __global float4 *two = tab + tmp0 * fft_len4; for (i = 0; i < di; i++) { float4 i1 = one[i * (fft_len4)]; float4 i2 = two[i * (fft_len4)]; sum = i1 + i2; dest[i * (fft_len4)] = sum; tmax = max(tmax, sum); } return tmax; } float4 sumtop2_2(__global float4 *tab, __global float4* dest, int di, int fft_len4, int tmp0,float4* tmax2) { //R: this function working with 4 different frequency bins at once as all other pulse finding functions do //R: also it adds/searches 2 levels of fold (stride add) at once float4 sum, tmax,sum2; int i; tmax = (float4)0.0f; *tmax2=(float4)0.0f; __global float4 *one = tab; __global float4 *two = tab + tmp0 * fft_len4; int stop=di-1; for (i = 0; i < stop; i++,stop--) { float4 i1 = one[i * (fft_len4)]; float4 i2 = two[i * (fft_len4)]; float4 j1= one[stop * (fft_len4)]; float4 j2= two[stop * (fft_len4)]; sum = i1 + i2; tmax = max(tmax, sum); sum2=j1+j2; tmax = max(tmax, sum2); sum+=sum2; *tmax2=max(*tmax2,sum); dest[i * (fft_len4)] = sum;//R: second stride add stored } return tmax; } float4 sumtop3(__global float4 *tab, __global float4* dest, int di, int fft_len4, int tmp0, int tmp1) { float4 sum, tmax; int i; tmax = 0.0f; __global float4 *one = tab; __global float4 *two = tab + tmp0 * fft_len4; __global float4 *three = tab + tmp1 * (fft_len4); for (i = 0; i < di; i++) { float4 i1 = one[i * (fft_len4)]; float4 i2 = two[i * (fft_len4)]; float4 i3 = three[i * (fft_len4)]; sum = i1 + i2; sum += i3; dest[i * (fft_len4)] = sum; tmax = max(tmax, sum); } return tmax; } float4 sumtop4(__global float4 *tab, __global float4* dest, int di, int fft_len4, int tmp0, int tmp1, int tmp2) { float4 sum, tmax; int i; __global float4 *one = tab; __global float4 *two = tab + tmp0 * (fft_len4); 
__global float4 *three = tab + tmp1 * (fft_len4); __global float4 *four = tab + tmp2 * (fft_len4); tmax = 0.0f; for (i = 0; i < di; i++) { float4 i1 = one[i * fft_len4]; float4 i2 = two[i * fft_len4]; float4 i3 = three[i * fft_len4]; float4 i4 = four[i * fft_len4]; sum = i1 + i2; sum += i3; sum += i4; dest[i * fft_len4] = sum; tmax = max(tmax, sum); } return tmax; } float4 sumtop5(__global float4 *tab, __global float4* dest, int di, int fft_len4, int tmp0, int tmp1, int tmp2, int tmp3) { float4 sum, tmax; int i; __global float4 *one = tab; __global float4 *two = tab + tmp0 * (fft_len4); __global float4 *three = tab + tmp1 * (fft_len4); __global float4 *four = tab + tmp2 * (fft_len4); __global float4 *five = tab + tmp3 * (fft_len4); tmax = 0.0f; for (i = 0; i < di; i++) { float4 i1 = one[i * fft_len4]; float4 i2 = two[i * fft_len4]; float4 i3 = three[i * fft_len4]; float4 i4 = four[i * fft_len4]; float4 i5 = five[i * fft_len4]; sum = i1 + i2; sum += i3; sum += i4; sum += i5; dest[i * fft_len4] = sum; tmax = max(tmax, sum); } return tmax; } typedef struct { //int NumDataPoints; // find_triplets //float *power_ft; //float4* results_ft; //result_flag* result_flags_ft; // find_pulse //float* PulsePot_fp; // Input data //float* PulsePot8_fp; // Input data moved 8 bytes forward for coleased reads //float* tmp_pot_fp; // Temporary array //float* best_pot_fp; // Copy folded pots with best score //float* report_pot_fp; // Copy folded pots for reporting //float4* results_fp; // Additional data for reporting //result_find_pulse_flag* result_flags_fp; //const float* t_funct_cache_fp; // cached results of cudaAcc_t_funct float rcfg_dis_thresh; int PulseMax; } PulseFind_t; #define FOLDS_COUNT 4 #define FOLDS_START 2 float4 t_funct(int di, int num_adds, int j, int PulseMax, __global float* t_funct_cache) { return (float4)t_funct_cache[j * PulseMax * FOLDS_COUNT + (num_adds - FOLDS_START) * PulseMax + di]; } __kernel void PC_find_pulse_kernel_cl(float best_pulse_score, int 
PulsePotLen, int AdvanceBy, int fft_len, int ndivs,int pass, __global PulseFind_t* settings,__global float* t_funct_cache_fp, __global float4* PoT,__global float4* tmp_PoT,__global uint4* result_flag,int offset) { if(result_flag[0].x == 1) return;//R: if CPU processing already needed - abort kernel ASAP const int PoTLen = 1024*1024/fft_len; int ul_PoT = get_global_id(0)+offset; int y = get_global_id(1); int TOffset1 = y * AdvanceBy; int TOffset2 = y * AdvanceBy; float rcfg_dis_thresh=settings->rcfg_dis_thresh; int PulseMax=settings->PulseMax; int fft_len4=fft_len>>2; //if (ul_PoT < 1) return; // Original find_pulse, omits first element //if (ul_PoT >= fft_len) return; if(TOffset1 + PulsePotLen > PoTLen) { TOffset1 = PoTLen - PulsePotLen; } __global float4* fp_PulsePot = PoT + ul_PoT + TOffset1 * (fft_len4); __global float4* tmp_pot = tmp_PoT + ul_PoT + TOffset2 * (fft_len4); //float* best_pot = cudaAcc_PulseFind_settings.best_pot_fp + ul_PoT + TOffset2 * fft_len; //float* report_pot = cudaAcc_PulseFind_settings.report_pot_fp + ul_PoT + TOffset2 * fft_len; int di; float4 avg=0;//,maxp=0;//, snr=0, fthresh=0; float4 tmp_max; #if !USE_AVG_CACHE int i; // Calculate average power for (i = 0; i < PulsePotLen; i++) { avg += fp_PulsePot[i * (fft_len4)]; } avg /= PulsePotLen; #else avg=PulsePoT_average[ul_PoT+y*fft_len4]; #endif // Periods from PulsePotLen/3 to PulsePotLen/4, and power of 2 fractions of. 
// then (len/4 to len/5) and finally (len/5 to len/6) // int num_adds = pass; //for(int num_adds = 3; num_adds <= 5; num_adds++) { int firstP, lastP; switch(num_adds) { case 3: lastP = (PulsePotLen*2)/3; firstP = (PulsePotLen*1)/2; break; case 4: lastP = (PulsePotLen*3)/4; firstP = (PulsePotLen*3)/5; break; case 5: lastP = (PulsePotLen*4)/5; firstP = (PulsePotLen*4)/6; break; } int num_adds_minus1 = num_adds - 1; int p = lastP; for (p = lastP ; p > firstP ; p--) { float4 cur_thresh, dis_thresh; int /*tabofst, */mper, perdiv; int tmp0, tmp1, tmp2, tmp3; //tabofst = ndivs*3+2-num_adds; mper = p * (12/num_adds_minus1); perdiv = num_adds_minus1; tmp0 = (int)((mper + 6)/12); // round(period) tmp1 = (int)((mper * 2 + 6)/12); // round(period*2) di = (int)p/perdiv; // (int)period //dis_thresh = cudaAcc_t_funct(di, num_adds)*avg; dis_thresh = t_funct(di, num_adds, 0, PulseMax, t_funct_cache_fp) * avg; switch(num_adds) { case 3: tmp_max = sumtop3(fp_PulsePot, tmp_pot, di, fft_len4, tmp0, tmp1); break; case 4: tmp2 = (int)((mper * 3 + 6)/12); // round(period*3) tmp_max = sumtop4(fp_PulsePot, tmp_pot, di, fft_len4, tmp0, tmp1, tmp2); break; case 5: tmp2 = (int)((mper * 3 + 6)/12); // round(period*3) tmp3 = (int)((mper * 4 + 6)/12); // round(period*4) tmp_max = sumtop5(fp_PulsePot, tmp_pot, di, fft_len4, tmp0, tmp1, tmp2, tmp3); break; } if (tmp_max.x>dis_thresh.x || tmp_max.y>dis_thresh.y || tmp_max.z>dis_thresh.z || tmp_max.w>dis_thresh.w) { // unscale for reporting tmp_max /= num_adds; cur_thresh = (dis_thresh / num_adds - avg) * rcfg_dis_thresh + avg; float4 _snr = (tmp_max-avg)*(float4)sqrt((float)num_adds)/avg; float4 _thresh = (cur_thresh-avg)*(float4)sqrt((float)num_adds)/avg; if (_snr.x / _thresh.x > best_pulse_score || _snr.y / _thresh.y > best_pulse_score || _snr.z / _thresh.z > best_pulse_score || _snr.w / _thresh.w > best_pulse_score ) { result_flag[0].x = 1;return; } if( (tmp_max.x>cur_thresh.x) || (tmp_max.y>cur_thresh.y) || (tmp_max.z>cur_thresh.z) || 
(tmp_max.w>cur_thresh.w) ) { result_flag[0].x = 1;return; } } int num_adds_2 = 2* num_adds; // int j = 1; float4 tmp_max2=(float4)0.f; for (int j = 1; j < ndivs ; j++) { //perdiv *=2; tmp0 = di & 1; di >>= 1; tmp0 += di; dis_thresh = t_funct(di, num_adds, j,PulseMax,t_funct_cache_fp) * avg; #if 0 if( (j+1)<ndivs && tmp_max2.x==0.f){//R: use advanced sumtop then tmp_max= sumtop2_2(tmp_pot, tmp_pot, di, fft_len4, tmp0,&tmp_max2); } if(tmp_max2.x>0.f){ tmp_max=tmp_max2; tmp_max2=(float4)0.f; }else #endif { tmp_max = sumtop2(tmp_pot, tmp_pot, di, fft_len4, tmp0); } if (tmp_max.x>dis_thresh.x || tmp_max.y>dis_thresh.y || tmp_max.z>dis_thresh.z || tmp_max.w>dis_thresh.w) { // unscale for reporting tmp_max /= num_adds_2; cur_thresh = (dis_thresh / num_adds_2 - avg) * rcfg_dis_thresh + avg; float4 _snr = (tmp_max-avg)*(float4)sqrt((float)num_adds_2)/avg; float4 _thresh = (cur_thresh-avg)*(float4)sqrt((float)num_adds_2)/avg; if (_snr.x / _thresh.x > best_pulse_score || _snr.y / _thresh.y > best_pulse_score || _snr.z / _thresh.z > best_pulse_score || _snr.w / _thresh.w > best_pulse_score) { result_flag[0].x = 1;return; } if ((tmp_max.x>cur_thresh.x) || (tmp_max.y>cur_thresh.y) || (tmp_max.z>cur_thresh.z) || (tmp_max.w>cur_thresh.w) ) { result_flag[0].x = 1;return; } } num_adds_2 *=2; } // for (j = 1; j < ndivs } // for (p = lastP } // for(num_adds = } This kernel assembly for #if 0 case: ; -------- Disassembly -------------------- 00 ALU: ADDR(1792) CNT(58) KCACHE0(CB0:0-15) KCACHE1(CB1:0-15) 0 y: AND_INT T3.y, R0.x, (0x000000FF, 3.573311084e-43f).x z: MOV R11.z, 0.0f w: MOV R11.w, 0.0f t: MOV R22.y, 0.0f 1 y: LSHR T2.y, R0.x, (0x00000008, 1.121038771e-44f).x t: RCP_UINT T0.w, KC0[2].x 2 x: MOV R6.x, KC1[1].x y: MOV R0.y, KC1[1].y z: MOV R0.z, KC1[1].z t: MULLO_UINT T0.z, KC0[2].x, PS1 3 x: SUB_INT R0.x, 0.0f, PS2 w: MOV R2.w, KC1[1].w t: MULHI_UINT T0.y, KC0[2].x, T0.w 4 w: CNDE_INT R0.w, PS3, PV3.x, T0.z t: RCP_UINT T1.z, KC0[2].y 5 t: MULHI_UINT ____, PV4.w, T0.w 6 x: 
ADD_INT ____, T0.w, PS5 y: SUB_INT ____, T0.w, PS5 t: MULLO_UINT T1.y, KC0[2].y, T1.z 7 x: SUB_INT ____, 0.0f, PS6 w: CNDE_INT ____, T0.y, PV6.x, PV6.y t: MULHI_UINT T0.x, KC0[2].y, T1.z 8 y: MOV R21.y, 1 z: CNDE_INT T0.z, PS7, PV7.x, T1.y t: MULHI_UINT T2.z, PV7.w, T2.y 9 y: ADD_INT T1.y, -1, PS8 z: MOV R10.z, 1 w: ADD_INT T0.w, PS8, 1 t: MULLO_UINT ____, PS8, KC0[2].x 10 x: SUB_INT T1.x, T2.y, PS9 w: SETGE_UINT T1.w, T2.y, PS9 t: MULHI_UINT ____, T0.z, T1.z 11 x: SUB_INT ____, T1.z, PS10 y: SETGE_UINT ____, PV10.x, KC0[2].x z: SUB_INT T1.z, PV10.x, KC0[2].x w: ADD_INT ____, T1.z, PS10 t: MOV R10.w, 1 12 x: AND_INT ____, T1.w, PV11.y z: CNDE_INT T0.z, T0.x, PV11.w, PV11.x 13 z: CNDE_INT ____, PV12.x, T2.z, T0.w w: CNDE_INT T0.w, PV12.x, T1.x, T1.z 14 x: CNDE_INT ____, T1.w, T1.y, PV13.z z: ADD_INT ____, KC0[2].x, PV13.w 15 y: CNDE_INT ____, T1.w, PV14.z, T0.w w: CNDE_INT T0.w, KC0[2].x, -1, PV14.x 16 x: CNDE_INT T1.x, KC0[2].x, -1, PV15.y t: MULHI_UINT ____, T0.z, PV15.w 17 t: MULLO_UINT ____, PS16, KC0[2].y 18 z: SETGE_UINT R0.z, T0.w, PS17 w: SUB_INT R0.w, T0.w, PS17 t: MULLO_INT ____, T1.x, KC0[1].x 19 x: SETGE_UINT ____, PV18.w, KC0[2].y y: SUB_INT R0.y, PV18.w, KC0[2].y z: ADD_INT ____, T3.y, PS18 20 x: ADD_INT R0.x, PV19.z, KC0[6].x w: AND_INT R1.w, R0.z, PV19.x 01 MEM_SCRATCH_WRITE_ACK: VEC_PTR[60].x___, R0, ARRAY_SIZE(66) ELEM_SIZE(3) 02 ALU: ADDR(1850) CNT(12) KCACHE0(CB0:0-15) KCACHE1(CB1:0-15) 21 w: MOV R1.w, R1.w 22 z: CNDE_INT T1.z, PV21.w, R0.w, R0.y 23 z: MOV T1.z, PV22.z 24 y: ADD_INT ____, KC0[2].y, PV23.z 25 w: CNDE_INT ____, R0.z, PV24.y, T1.z 26 z: CNDE_INT ____, KC0[2].y, -1, PV25.w 27 z: MOV ____, PV26.z 28 t: MULLO_INT ____, PV27.z, KC0[1].y 29 w: MOV ____, PS28 30 y: ADD_INT R5.y, PV29.w, KC0[6].y 31 x: LSHR R0.x, KC1[10].x, (0x00000002, 2.802596929e-45f).x 03 WAIT_ACK: Outstanding_acks <= 0 04 TEX: ADDR(9888) CNT(1) 32 RD_SCATTER R0.x___, DWORD_PTR[0+R0.x], ELEM_SIZE(3) UNCACHED FORMAT(32_FLOAT) 05 ALU_PUSH_BEFORE: ADDR(1862) CNT(2) 33 x: 
SETE_INT R0.x, R0.x, 1 34 x: PREDE_INT ____, R0.x, 0.0f UPDATE_EXEC_MASK UPDATE_PRED 06 JUMP POP_CNT(1) ADDR(1763) 07 ALU: ADDR(1864) CNT(4) KCACHE0(CB1:0-15) 35 x: SETGT_INT R2.x, 0.0f, KC0[3].x y: ADD_INT R0.y, KC0[6].x, (0x00000004, 5.605193857e-45f).x t: MULLO_INT R0.x, R5.y, KC0[2].x 08 MEM_SCRATCH_WRITE_ACK: VEC_PTR[61].x___, R0, ARRAY_SIZE(66) ELEM_SIZE(3) 09 ALU: ADDR(1868) CNT(35) KCACHE0(CB1:0-15) 36 x: MOV R0.x, R0.x t: MOV R2.x, R2.x 37 x: SETGT_INT R3.x, KC0[1].x, 0.0f y: ADD_INT T1.y, KC0[1].x, PV36.x w: ADD_INT ____, KC0[3].x, PS36 38 x: LSHR R0.x, R0.y, (0x00000002, 2.802596929e-45f).x z: XOR_INT T0.z, R2.x, PV37.w t: LSHR R1.x, KC0[6].x, (0x00000002, 2.802596929e-45f).x 39 t: RCP_UINT T0.y, PV38.z 40 t: MULLO_UINT T0.x, T0.z, PS39 41 z: SUB_INT ____, 0.0f, PS40 t: MULHI_UINT T0.w, T0.z, T0.y 42 y: CNDE_INT ____, PS41, PV41.z, T0.x 43 t: MULHI_UINT ____, PV42.y, T0.y 44 z: ADD_INT ____, T0.y, PS43 w: SUB_INT ____, T0.y, PS43 45 y: CNDE_INT ____, T0.w, PV44.z, PV44.w 46 t: MULHI_UINT T0.x, PV45.y, (0x00100000, 1.469367939e-39f).x 47 x: ADD_INT T1.x, PS46, 1 w: ADD_INT T0.w, -1, PS46 t: MULLO_UINT ____, PS46, T0.z 48 y: SETGE_UINT T0.y, (0x00100000, 1.469367939e-39f).x, PS47 z: SUB_INT ____, (0x00100000, 1.469367939e-39f).x, PS47 49 w: SETGE_UINT ____, PV48.z, T0.z 50 z: AND_INT ____, T0.y, PV49.w 51 y: CNDE_INT ____, PV50.z, T0.x, T1.x 52 z: CNDE_INT ____, T0.y, T0.w, PV51.y 53 y: CNDE_INT ____, T0.z, -1, PV52.z 54 w: ADD_INT ____, R2.x, PV53.y 55 z: XOR_INT ____, R2.x, PV54.w 56 z: SUB_INT R0.z, PV55.z, KC0[1].x w: SETGT_INT R0.w, T1.y, PV55.z 10 WAIT_ACK: Outstanding_acks <= 0 11 TEX: ADDR(9890) CNT(1) 57 RD_SCRATCH R2.x___, VEC_PTR[61], ARRAY_SIZE(66) ELEM_SIZE(3) UNCACHED 12 ALU: ADDR(1903) CNT(3) 58 z: MOV R0.z, R0.z w: MOV R0.w, R0.w 59 x: CNDE_INT R2.x, PV58.w, R2.x, PV58.z 13 MEM_SCRATCH_WRITE_ACK: VEC_PTR[59].x___, R2, ARRAY_SIZE(66) ELEM_SIZE(3) 14 WAIT_ACK: Outstanding_acks <= 0 15 TEX: ADDR(9892) CNT(2) 60 RD_SCATTER R7.x___, 
DWORD_PTR[0+R0.x], ELEM_SIZE(3) UNCACHED FORMAT(32_FLOAT) 61 RD_SCATTER R0.x___, DWORD_PTR[0+R1.x], ELEM_SIZE(3) UNCACHED FORMAT(32_FLOAT) 16 MEM_SCRATCH_WRITE_ACK: VEC_PTR[43].x___, R0, ARRAY_SIZE(66) ELEM_SIZE(3) 17 ALU_PUSH_BEFORE: ADDR(1906) CNT(1) 62 x: PREDNE_INT ____, R3.x, 0.0f UPDATE_EXEC_MASK UPDATE_PRED 18 JUMP ADDR(30) 19 ALU: ADDR(1907) CNT(2) KCACHE0(CB1:0-15) 63 x: ASHR R2.x, KC0[3].x, (0x00000002, 2.802596929e-45f).x 20 WAIT_ACK: Outstanding_acks <= 0 21 TEX: ADDR(9896) CNT(1) 64 RD_SCRATCH R1.x___, VEC_PTR[60], ARRAY_SIZE(66) ELEM_SIZE(3) UNCACHED 22 TEX: ADDR(9898) CNT(1) 65 RD_SCRATCH R0.x___, VEC_PTR[59], ARRAY_SIZE(66) ELEM_SIZE(3) UNCACHED 23 ALU: ADDR(1909) CNT(18) KCACHE0(CB1:0-15) 66 x: LSHL R5.x, R2.x, (0x00000004, 5.605193857e-45f).x VEC_120 w: ADD_INT ____, R1.x, KC0[11].x t: MULLO_INT ____, R0.x, R2.x 67 x: MOV R2.x, 0.0f y: MOV R0.y, 0.0f z: ADD_INT ____, PS66, PV66.w w: MOV R0.w, 0.0f t: MOV R0.z, 0.0f 68 y: LSHL ____, PV67.z, (0x00000004, 5.605193857e-45f).x 69 x: ADD_INT R0.x, KC0[8].x, PV68.y 70 x: MOV R1.x, PV69.x 71 x: MOV R2.x, R2.x y: MOV R2.y, R0.y z: MOV R2.z, R0.z w: MOV R2.w, R0.w 72 x: MOV R0.x, R6.x 24 LOOP_DX10 i0 FAIL_JUMP_ADDR(30) 25 ALU: ADDR(1927) CNT(5) 73 x: ADD_INT R4.x, -1, R0.x t: LSHR R0.x, R1.x, (0x00000002, 2.802596929e-45f).x 74 x: SETNE_INT R3.x, PV73.x, 0.0f t: ADD_INT R1.x, R5.x, R1.x 26 TEX: ADDR(9900) CNT(1) 75 RD_SCATTER R0, DWORD_PTR[0+R0.x], ELEM_SIZE(3) UNCACHED 27 ALU_BREAK: ADDR(1932) CNT(5) 76 x: ADD R0.x, R2.x, R0.x y: ADD R4.y, R2.y, R0.y z: ADD R0.z, R2.z, R0.z w: ADD R0.w, R2.w, R0.w 77 x: PREDNE_INT ____, R3.x, 0.0f UPDATE_EXEC_MASK UPDATE_PRED 28 ALU: ADDR(1937) CNT(5) 78 x: MOV R2.x, R0.x y: MOV R2.y, R4.y z: MOV R2.z, R0.z w: MOV R2.w, R0.w 79 x: MOV R0.x, R4.x 29 ENDLOOP i0 PASS_JUMP_ADDR(25) 30 ELSE POP_CNT(1) ADDR(32) 31 ALU_POP_AFTER: ADDR(1942) CNT(5) 80 x: MOV R0.x, 0.0f 81 x: MOV R0.x, PV80.x y: MOV R4.y, R22.y z: MOV R0.z, R11.z w: MOV R0.w, R11.w 32 ALU: ADDR(1947) CNT(88) 
KCACHE0(CB1:0-15) 82 x: AND_INT T3.x, R0.x, (0x80000000, -0.0f).x y: AND_INT T2.y, R0.x, (0x7F800000, 1.#INFf).y z: MOV T2.z, R0.w w: AND_INT T3.w, R0.x, (0x807FFFFF, -1.175494211e-38f).z t: I_TO_F R0.w, KC0[1].x 83 x: MOV T0.x, PS82 y: AND_INT T0.y, PV82.z, (0x7F800000, 1.#INFf).x z: AND_INT T0.z, PV82.z, (0x807FFFFF, -1.175494211e-38f).y w: AND_INT T0.w, PV82.z, (0x80000000, -0.0f).z t: MOV R0.y, PS82 84 x: AND_INT ____, PV83.x, (0x807FFFFF, -1.175494211e-38f).x y: SETE_INT T1.y, PV83.y, (0x7F800000, 1.#INFf).y z: SETE_INT T1.z, PV83.y, 0.0f w: AND_INT R1.w, PV83.x, (0x7F800000, 1.#INFf).y t: AND_INT T1.w, PV83.x, (0x80000000, -0.0f).z 85 x: SETE_INT ____, PV84.w, 0.0f y: OR_INT ____, PV84.x, (0x3F800000, 1.0f).x z: OR_INT ____, T0.z, (0x3F800000, 1.0f).x w: SETE_INT T2.w, PV84.w, (0x7F800000, 1.#INFf).y t: SUB_INT T1.x, T0.y, PV84.w 86 x: OR_INT ____, T1.y, PV85.w y: CNDE_INT ____, T1.z, PV85.z, T0.w z: OR_INT ____, T1.z, PV85.x w: CNDE_INT ____, PV85.x, PV85.y, T1.w VEC_021 t: AND_INT T2.x, R0.y, (0x7F800000, 1.#INFf).x 87 x: CNDE_INT ____, T2.w, PV86.w, T0.x y: OR_INT T3.y, PV86.x, PV86.z z: CNDE_INT T2.z, T1.y, PV86.y, T2.z w: SETE_INT T2.w, T2.y, (0x7F800000, 1.#INFf).x VEC_120 t: SETE_INT T1.y, PS86, (0x7F800000, 1.#INFf).x 88 x: SETE_INT ____, T2.y, 0.0f y: CNDE_INT T0.y, PV87.y, T1.x, 0.0f z: OR_INT ____, T3.w, (0x3F800000, 1.0f).x w: SETE_INT ____, T2.x, 0.0f t: RCP_e R4.w, PV87.x 89 x: CNDE_INT ____, PV88.x, PV88.z, T3.x y: ASHR T1.y, PV88.y, (0x00000017, 3.222986468e-44f).x z: MUL_e ____, T2.z, PS88 w: OR_INT ____, T1.y, T2.w t: OR_INT ____, PV88.w, PV88.x 90 x: OR_INT R1.x, PV89.w, PS89 y: AND_INT ____, (0x7FFFFFFF, 1.#QNANf).x, PV89.z z: AND_INT T2.z, (0x80000000, -0.0f).y, PV89.z w: CNDE_INT ____, T2.w, PV89.x, R0.x t: ADD_INT T3.x, T0.y, PV89.z 91 x: SUB_INT ____, T2.y, R1.w y: MUL_e T2.y, R4.w, PV90.w z: OR_INT T1.z, PV90.z, (0x7F800000, 1.#INFf).x w: ASHR ____, PV90.y, (0x00000017, 3.222986468e-44f).y t: MOV T2.w, R0.z 92 x: AND_INT ____, 
(0x7FFFFFFF, 1.#QNANf).x, PV91.y y: ADD_INT ____, PV91.w, T1.y z: CNDE_INT ____, R1.x, PV91.x, 0.0f w: AND_INT R2.w, (0x80000000, -0.0f).y, PV91.y t: MOV T0.z, R0.w 93 x: SETGE_INT ____, PV92.y, (0x000000FF, 3.573311084e-43f).x y: ASHR ____, PV92.z, (0x00000017, 3.222986468e-44f).y z: ASHR ____, PV92.x, (0x00000017, 3.222986468e-44f).y w: SETGE_INT ____, 0.0f, PV92.y t: ADD_INT R2.x, PV92.z, T2.y 94 x: ADD_INT ____, PV93.z, PV93.y y: OR_INT R1.y, R2.w, (0x7F800000, 1.#INFf).x z: CNDE_INT ____, PV93.w, T3.x, T2.z w: CNDE_INT ____, T3.y, PV93.x, 0.0f t: AND_INT R2.y, T2.w, (0x7F800000, 1.#INFf).x 95 x: CNDE_INT R0.x, PV94.w, PV94.z, T1.z y: SETGE_INT R0.y, PV94.x, (0x000000FF, 3.573311084e-43f).x z: SETGE_INT R2.z, 0.0f, PV94.x w: AND_INT R6.w, T0.z, (0x7F800000, 1.#INFf).y t: AND_INT R3.w, T2.w, (0x807FFFFF, -1.175494211e-38f).z 33 MEM_SCRATCH_WRITE: VEC_PTR[46].x___, R0, ARRAY_SIZE(66) ELEM_SIZE(3) 34 ALU: ADDR(2035) CNT(15) 96 y: MOV R0.y, R0.y z: MOV R2.z, R2.z w: MOV R6.w, R6.w 97 x: SETE_INT R2.x, R2.y, (0x7F800000, 1.#INFf).x y: CNDE_INT ____, PV96.z, R2.x, R2.w z: CNDE_INT ____, R1.x, PV96.y, 0.0f w: SETE_INT R2.w, PV96.w, (0x7F800000, 1.#INFf).x t: SETE_INT R1.x, R2.y, 0.0f 98 x: CNDE_INT R0.x, PV97.z, PV97.y, R1.y y: SETE_INT R0.y, R6.w, 0.0f z: AND_INT R2.z, R0.z, (0x80000000, -0.0f).x w: MOV R9.w, R0.x t: OR_INT R1.y, R3.w, (0x3F800000, 1.0f).y 35 MEM_SCRATCH_WRITE: VEC_PTR[54].x___, R0, ARRAY_SIZE(66) ELEM_SIZE(3) 36 ALU: ADDR(2050) CNT(58) 99 y: MOV R1.y, R1.y z: MOV R2.z, R2.z t: MOV R0.y, R0.y 100 x: CNDE_INT ____, R1.x, PV99.y, PV99.z y: OR_INT ____, PS99, R1.x z: OR_INT ____, R2.w, R2.x VEC_021 w: SUB_INT T3.w, R2.y, R1.w t: MOV ____, R4.y 101 x: OR_INT R3.x, PV100.z, PV100.y y: AND_INT R2.y, PS100, (0x7F800000, 1.#INFf).x z: MOV ____, R0.w w: CNDE_INT ____, R2.x, PV100.x, R0.z t: AND_INT R0.w, PS100, (0x807FFFFF, -1.175494211e-38f).y 102 x: AND_INT R1.x, PV101.z, (0x7F800000, 1.#INFf).x y: MUL_e R0.y, R4.w, PV101.w z: CNDE_INT R0.z, PV101.x, T3.w, 
0.0f w: SETE_INT R2.w, PV101.y, (0x7F800000, 1.#INFf).x t: SETE_INT R1.y, PV101.y, 0.0f 103 x: MOV R1.x, R1.x y: MOV R0.y, R0.y z: MOV R0.z, R0.z 104 x: AND_INT R0.x, (0x7FFFFFFF, 1.#QNANf).x, PV103.y y: ASHR R0.y, PV103.z, (0x00000017, 3.222986468e-44f).y z: ADD_INT R0.z, PV103.z, PV103.y w: AND_INT R3.w, (0x80000000, -0.0f).z, PV103.y t: SETE_INT R2.x, PV103.x, (0x7F800000, 1.#INFf).w 105 x: AND_INT ____, R4.y, (0x80000000, -0.0f).x y: OR_INT T2.y, R3.w, (0x7F800000, 1.#INFf).y z: ASHR ____, R0.x, (0x00000017, 3.222986468e-44f).z w: SETE_INT ____, R1.x, 0.0f VEC_120 t: OR_INT ____, R0.w, (0x3F800000, 1.0f).w 106 x: OR_INT ____, PV105.w, R1.y y: CNDE_INT ____, R1.y, PS105, PV105.x VEC_120 z: OR_INT ____, R2.x, R2.w VEC_021 w: ADD_INT ____, PV105.z, R0.y VEC_102 t: SUB_INT T3.w, R2.y, R1.w 107 x: SETGE_INT ____, PV106.w, (0x000000FF, 3.573311084e-43f).x y: OR_INT R2.y, PV106.z, PV106.x z: SETGE_INT ____, 0.0f, PV106.w w: CNDE_INT ____, R2.w, PV106.y, R4.y 108 x: MUL_e ____, R4.w, PV107.w y: CNDE_INT ____, PV107.z, R0.z, R3.w z: CNDE_INT ____, PV107.y, T3.w, 0.0f w: CNDE_INT ____, R3.x, PV107.x, 0.0f 109 x: CNDE_INT R0.x, PV108.w, PV108.y, T2.y y: AND_INT R0.y, (0x7FFFFFFF, 1.#QNANf).x, PV108.x z: ASHR R0.z, PV108.z, (0x00000017, 3.222986468e-44f).y w: AND_INT R0.w, (0x80000000, -0.0f).z, PV108.x t: ADD_INT R1.y, PV108.z, PV108.x 37 MEM_SCRATCH_WRITE: VEC_PTR[45].x___, R0, ARRAY_SIZE(66) ELEM_SIZE(3) 38 ALU: ADDR(2108) CNT(14) 110 x: MOV R0.x, R0.x y: MOV R0.y, R0.y w: MOV R0.w, R0.w 111 x: ASHR ____, PV110.y, (0x00000017, 3.222986468e-44f).x y: OR_INT T1.y, PV110.w, (0x7F800000, 1.#INFf).y z: MOV R9.z, PV110.x 112 w: ADD_INT ____, PV111.x, R0.z 113 y: SETGE_INT ____, PV112.w, (0x000000FF, 3.573311084e-43f).x z: SETGE_INT ____, 0.0f, PV112.w 114 z: CNDE_INT ____, PV113.z, R1.y, R0.w w: CNDE_INT ____, R2.y, PV113.y, 0.0f 115 x: CNDE_INT R0.x, PV114.w, PV114.z, T1.y 39 MEM_SCRATCH_WRITE: VEC_PTR[44].x___, R0, ARRAY_SIZE(66) ELEM_SIZE(3) 40 ALU_PUSH_BEFORE: ADDR(2122) 
CNT(5) KCACHE0(CB1:0-15) 116 x: MOV R0.x, R0.x 117 y: MOV R20.y, PV116.x 118 x: SETE_INT R0.x, KC0[5].x, (0x00000003, 4.203895393e-45f).x 119 x: PREDNE_INT ____, R0.x, 0.0f UPDATE_EXEC_MASK UPDATE_PRED 41 JUMP ADDR(44) 42 ALU: ADDR(2127) CNT(14) KCACHE0(CB1:0-15) 120 z: LSHR ____, KC0[1].x, (0x0000001F, 4.344025239e-44f).x w: LSHL T0.w, KC0[1].x, 1 121 y: ADD_INT ____, KC0[1].x, PV120.z t: MULHI_INT ____, PV120.w, (0xAAAAAAAB, -3.031649096e-13f).x 122 x: ASHR R1.x, PV121.y, 1 y: ADD_INT ____, T0.w, PS121 123 x: LSHR ____, PV122.y, (0x0000001F, 4.344025239e-44f).x w: ASHR ____, PV122.y, 1 124 x: ADD_INT R0.x, PV123.x, PV123.w 125 x: MOV R4.x, PV124.x 126 x: MOV R0.x, R1.x 43 MEM_SCRATCH_WRITE_ACK: VEC_PTR[57].x___, R0, ARRAY_SIZE(66) ELEM_SIZE(3) 44 ELSE POP_CNT(1) ADDR(55) 45 ALU_PUSH_BEFORE: ADDR(2141) CNT(3) KCACHE0(CB1:0-15) 127 x: SETE_INT R0.x, KC0[5].x, (0x00000004, 5.605193857e-45f).x 128 x: PREDNE_INT ____, R0.x, 0.0f UPDATE_EXEC_MASK UPDATE_PRED 46 JUMP ADDR(48) 47 ALU: ADDR(2144) CNT(13) KCACHE0(CB1:0-15) 129 t: MULLO_INT T0.w, KC0[1].x, (0x00000003, 4.203895393e-45f).x 130 w: ASHR ____, PS129, (0x0000001F, 4.344025239e-44f).x t: MULHI_INT ____, PS129, (0x66666667, 2.720083202e23f).y 131 x: ASHR ____, PS130, 1 y: LSHR ____, PS130, (0x0000001F, 4.344025239e-44f).x z: LSHR ____, PV130.w, (0x0000001E, 4.203895393e-44f).y 132 x: ADD_INT R6.x, PV131.y, PV131.x y: ADD_INT ____, T0.w, PV131.z 133 x: ASHR R4.x, PV132.y, (0x00000002, 2.802596929e-45f).x 48 ELSE POP_CNT(1) ADDR(52) 49 ALU_PUSH_BEFORE: ADDR(2157) CNT(3) KCACHE0(CB1:0-15) 134 x: SETE_INT R4.x, KC0[5].x, (0x00000005, 7.006492322e-45f).x 135 x: PREDNE_INT ____, R4.x, 0.0f UPDATE_EXEC_MASK UPDATE_PRED 50 JUMP POP_CNT(2) ADDR(52) 51 ALU_POP2_AFTER: ADDR(2160) CNT(14) KCACHE0(CB1:0-15) 136 x: LSHL T0.x, KC0[1].x, (0x00000002, 2.802596929e-45f).x 137 t: MULHI_INT ____, PV136.x, (0xAAAAAAAB, -3.031649096e-13f).x 138 z: ADD_INT ____, T0.x, PS137 t: MULHI_INT ____, T0.x, (0x66666667, 2.720083202e23f).x 139 x: 
ASHR ____, PV138.z, (0x00000002, 2.802596929e-45f).x y: LSHR ____, PV138.z, (0x0000001F, 4.344025239e-44f).y z: LSHR ____, PS138, (0x0000001F, 4.344025239e-44f).y w: ASHR ____, PS138, 1 140 x: ADD_INT R6.x, PV139.y, PV139.x t: ADD_INT R4.x, PV139.z, PV139.w 52 ALU: ADDR(2174) CNT(1) 141 x: MOV R0.x, R6.x 53 MEM_SCRATCH_WRITE_ACK: VEC_PTR[57].x___, R0, ARRAY_SIZE(66) ELEM_SIZE(3) 54 POP (1) ADDR(55) 55 WAIT_ACK: Outstanding_acks <= 0 56 TEX: ADDR(9902) CNT(1) 142 RD_SCRATCH R2.x___, VEC_PTR[60], ARRAY_SIZE(66) ELEM_SIZE(3) UNCACHED 57 TEX: ADDR(9904) CNT(1) 143 RD_SCRATCH R1.x___, VEC_PTR[59], ARRAY_SIZE(66) ELEM_SIZE(3) UNCACHED 58 ALU: ADDR(2175) CNT(7) KCACHE0(CB1:0-15) 144 x: ASHR R0.x, KC0[3].x, (0x00000002, 2.802596929e-45f).x 145 x: ADD_INT R2.x, R2.x, KC0[11].x y: ADD_INT R1.y, (0xF

        • Unexpectedly huge kernel size increase
          Raistmer
          It's even stranger than I thought.

          If both returns are commented out (as in attached fragment of initial kernel) assembly consists of >2000 clauses with >6000 instructions.
          But when first "return" uncommented SKA reports only ~600 clauses with ~2000 instructions.

          What is the compiler doing here at all???? Does it generate a completely different full execution path for each and every branch?

          It was stated in the manual that when all threads decide the same way on a branch, the branch instruction costs very little, and the cost increases only when they diverge within a wavefront. In my case there should be only a very small number of divergent branches, but the kernel size still increases SO hugely....

          Will this inefficiency be addressed in the next release?
          And does the code SKA generates resemble the actually used code at all?

          EDIT: and btw, when both returns are uncommented SKA shows ~870 clauses.
          Is there any rule here???

          if (_snr.x / _thresh.x > best_pulse_score || _snr.y / _thresh.y > best_pulse_score || _snr.z / _thresh.z > best_pulse_score || _snr.w / _thresh.w > best_pulse_score) { result_flag[0].x = 1;//return; } if ((tmp_max.x>cur_thresh.x) || (tmp_max.y>cur_thresh.y) || (tmp_max.z>cur_thresh.z) || (tmp_max.w>cur_thresh.w) ) { result_flag[0].x = 1;//return; }

            • Unexpectedly huge kernel size increase
              Raistmer
              LoL, and if that part is changed in the listed way - SKA can't compile the kernel at all.

              OpenCL Compile Error: clBuildProgram failed (CL_BUILD_PROGRAM_FAILURE).

              Error: Creating kernel PC_find_pulse_kernel_cl failed!

              All this looks like the AMD OpenCL compiler is just unable to deal with more or less complicated kernels.

              Should this bug be reported via mail or forum message is enough?

              if (_snr.x / _thresh.x > best_pulse_score || _snr.y / _thresh.y > best_pulse_score || _snr.z / _thresh.z > best_pulse_score || _snr.w / _thresh.w > best_pulse_score || (tmp_max.x>cur_thresh.x) || (tmp_max.y>cur_thresh.y) || (tmp_max.z>cur_thresh.z) || (tmp_max.w>cur_thresh.w) ) { result_flag[0].x = 1;return; }

                • Unexpectedly huge kernel size increase
                  himanshu.gautam

                  raistmer,

                  When we use #if 1 we are calling the function sumtop2 inside a for loop. But GPUs still do not use stacks for function calling. Therefore the sumtop2 function is inlined every time it is called, which drastically increases the size of your code.

                   

                  Regarding the response of SKA:

                  It is a known issue and i confirm it as a bug.

                  For more info see the link :http://forums.amd.com/forum/messageview.cfm?catid=328&threadid=138904&forumid=9

                   

                    • Unexpectedly huge kernel size increase
                      HarryH

                       

                      Originally posted by: himanshu.gautam raistmer,

                       

                      When we use #if 1 we are calling the function sumtop2 inside a for loop.But Gpus still do not use stacks for function calling.Therefore Sumtop2 function is inlined everytime it is called,which drastically increases the size of your code.



                       

                      This implies that the loop is unrolled by the compiler which can only be done when the

                      loop start and end are known and fixed at compile time. I wrote two simple test cases

                      that demonstrate this in the first case the loop is completely unrolled, inlining the

                      function body multiple times, leading to larger

                      code without any loop constructs. In the second case the function body is inlined only

                      once leading to smaller code. It seems a real function calling mechanism is very

                      necessary otherwise you might just as well not use functions at all, as they are

                      merely macros.

                      int xxx(int i) { return i*2; } //*********** Loop is unrolled ***************** __kernel void test(__global int *x) { for (int i = 0; i <5; i++) { x[i] = xxx(i); } } //*********** Loop is NOT unrolled ************* __kernel void test(__global int *x, int n) { for (int i = 0; i < n; i++) { x[i] = xxx(i); } } Don't worry about the fact that x is overwritten by multiple kernel instances, not the point of this demo

                        • Unexpectedly huge kernel size increase
                          nou

                          Unrolling a loop is a regular optimization technique. It seems that the compiler is unrolling too much here.

                            • Unexpectedly huge kernel size increase
                              HarryH

                               

                              Originally posted by: nou unroling loop is regular optimization technique. this seems that compiler unroling too much.

                               

                              Yes, normally unrolling loops with a body that is not too large is good for performance,

                              but when you start inlining potentially large function bodies instead of function calls this

                              leads to code explosions. Implementing regular function calls would be the best option

                              IMO if possible. Otherwise something like this might help:

                              #pragma please_dont_unroll_this_next_loop

                    • Unexpectedly huge kernel size increase
                      Raistmer
                      1) Sorry, but I didn't understand the explanation very well. Please look at the original kernel once more.
                      You will see that in the #if 0 case the function sumtop2 is called. In the #if 1 case the function sumtop2_2 is called.
                      They are quite similar, so I still don't understand why there is such a big change in code size.
                      2) How can either of these functions be unrolled completely if the loop count is passed to the kernel as a parameter? For my app it can be anything from 3 to 10, but different each time. How can the compiler do any unrolling for an unknown number of loop iterations???
                      (ndivs parameter passed as kernel parameter)
                        • Unexpectedly huge kernel size increase
                          Raistmer
                          To further simplify the kernel I split it into 3 different ones.
                          That is 1 parameter less + 2 switches inside kernel body are removed.
                          But app performance decreases hugely when I call these 3 kernels one by one instead of calling the original one with pass parameter values 3, 4, 5.

                          code attached, sumtop functions are listed in first post.

                          And some data from SKA (with simplified kernel it's able to show some statistics):

                          PC_find_pulse_kernel_cl:
                          Name,Scratch Reg,GPR,Min,Max,Avg,Est Cycles,Est Cycles(Bi),ALU:Fetch(Bi),BottleNeck(Bi),%s\Clock(Bi),Throughput(Bi)
                          Radeon HD 4870,0,55,2.70,134.50,18.60,18.60,18.60,18.60,ALU Ops,0.86,645 M Threads\Sec

                          PC_find_pulse3_kernel_cl:
                          Name,Scratch Reg,GPR,Min,Max,Avg,Est Cycles,Est Cycles(Bi),ALU:Fetch(Bi),BottleNeck(Bi),%s\Clock(Bi),Throughput(Bi)
                          Radeon HD 4870,0,61,2.70,341.00,36.60,36.60,36.60,2.09,ALU Ops,0.44,328 M Threads\Sec

                          PC_find_pulse4_kernel_cl:
                          Name,Scratch Reg,GPR,Min,Max,Avg,Est Cycles,Est Cycles(Bi),ALU:Fetch(Bi),BottleNeck(Bi),%s\Clock(Bi),Throughput(Bi)
                          Radeon HD 4870,0,62,3.00,134.90,19.11,19.11,19.11,19.11,ALU Ops,0.84,628 M Threads\Sec

                          PC_find_pulse5_kernel_cl:
                          Name,Scratch Reg,GPR,Min,Max,Avg,Est Cycles,Est Cycles(Bi),ALU:Fetch(Bi),BottleNeck(Bi),%s\Clock(Bi),Throughput(Bi)
                          Radeon HD 4870,1,62,2.90,237.20,30.34,30.34,30.34,2.50,ALU Ops,0.53,396 M Threads\Sec

                          The original kernel handles all those 3 cases, so how can it be faster than some of the separate cases??
                          And for last (pulse5) kernel compiler uses 1 scratch register while it uses no scratch registers for bigger and more complex original kernel.

                          IMO all this illustrates very poor decisions made by compiler to produce assembly code.

                          Any thoughts how to help it to make more adequate code ?
                          What guidelines can be applied here?

/*
 * SETI@home-style pulse-finding kernels (forum listing, reformatted).
 * Each work-item handles 4 frequency bins at once via float4 lanes.
 * Candidate detection only sets result_flag[0].x = 1 so the host can
 * re-run the full search on the CPU; no results are reported from GPU.
 * NOTE(review): PulseFind_t, t_funct() and the sumtopN() helpers are
 * defined earlier in this file (see sumtop2/sumtop2_2/sumtop3 above).
 */

/* Generic pulse kernel: `pass` selects the fold count num_adds (3..5).
 * Scans candidate periods p in [firstP..lastP], folds the PoT by
 * num_adds via sumtopN, then repeatedly halves the period (sumtop2)
 * for ndivs-1 further fold levels, testing each fold against the
 * dis_thresh/cur_thresh detection thresholds. */
__kernel void PC_find_pulse_kernel_cl(float best_pulse_score, int PulsePotLen, int AdvanceBy,
    int fft_len, int ndivs,int pass, __global PulseFind_t* settings,__global float* t_funct_cache_fp,
    __global float4* PoT,__global float4* tmp_PoT,__global uint4* result_flag,int offset,
    __global float4* PulsePoT_average)
{
    if(result_flag[0].x == 1) return;//R: if CPU processing already needed - abort kernel ASAP
    const int PoTLen = 1024*1024/fft_len;
    int ul_PoT = get_global_id(0)+offset;
    int y = get_global_id(1);
    int TOffset1 = y * AdvanceBy;
    int TOffset2 = y * AdvanceBy;
    float rcfg_dis_thresh=settings->rcfg_dis_thresh;
    int PulseMax=settings->PulseMax;
    int fft_len4=fft_len>>2; /* fft_len in float4 units */
    //if (ul_PoT < 1) return; // Original find_pulse, omits first element
    //if (ul_PoT >= fft_len) return;
    /* Clamp the last (partial) window so it ends exactly at PoTLen. */
    if(TOffset1 + PulsePotLen > PoTLen)
    {
        TOffset1 = PoTLen - PulsePotLen;
    }
    __global float4* fp_PulsePot = PoT + ul_PoT + TOffset1 * (fft_len4);
    __global float4* tmp_pot = tmp_PoT + ul_PoT + TOffset2 * (fft_len4);
    //float* best_pot = cudaAcc_PulseFind_settings.best_pot_fp + ul_PoT + TOffset2 * fft_len;
    //float* report_pot = cudaAcc_PulseFind_settings.report_pot_fp + ul_PoT + TOffset2 * fft_len;
    int di;
    float4 avg=0;//,maxp=0;//, snr=0, fthresh=0;
    float4 tmp_max;
#define USE_AVG_CACHE 1
#if !USE_AVG_CACHE
    int i;
    // Calculate average power
    for (i = 0; i < PulsePotLen; i++)
    {
        avg += fp_PulsePot[i * (fft_len4)];
    }
    avg /= PulsePotLen;
#else
    /* Average precomputed by the pulse3 kernel (see PC_find_pulse3_kernel_cl). */
    avg=PulsePoT_average[ul_PoT+y*fft_len4];
#endif
    // Periods from PulsePotLen/3 to PulsePotLen/4, and power of 2 fractions of.
    // then (len/4 to len/5) and finally (len/5 to len/6)
    //
    int num_adds = pass;
    //for(int num_adds = 3; num_adds <= 5; num_adds++)
    {
        int firstP, lastP;
        /* NOTE(review): no default case — firstP/lastP stay uninitialised
         * if pass is outside 3..5; presumably the host only passes 3, 4 or 5. */
        switch(num_adds)
        {
        case 3: lastP = (PulsePotLen*2)/3; firstP = (PulsePotLen*1)/2; break;
        case 4: lastP = (PulsePotLen*3)/4; firstP = (PulsePotLen*3)/5; break;
        case 5: lastP = (PulsePotLen*4)/5; firstP = (PulsePotLen*4)/6; break;
        }
        int num_adds_minus1 = num_adds - 1;
        int p = lastP;
        for (p = lastP ; p > firstP ; p--)
        {
            float4 cur_thresh, dis_thresh;
            int /*tabofst, */mper, perdiv;
            int tmp0, tmp1, tmp2, tmp3;
            //tabofst = ndivs*3+2-num_adds;
            /* mper is the period in 1/12ths so round(k*period) is integer math. */
            mper = p * (12/num_adds_minus1);
            perdiv = num_adds_minus1;
            tmp0 = (int)((mper + 6)/12); // round(period)
            tmp1 = (int)((mper * 2 + 6)/12); // round(period*2)
            di = (int)p/perdiv; // (int)period
            //dis_thresh = cudaAcc_t_funct(di, num_adds)*avg;
            dis_thresh = t_funct(di, num_adds, 0, PulseMax, t_funct_cache_fp) * avg;
            /* First fold level: add num_adds strides into tmp_pot, track per-lane max. */
            switch(num_adds)
            {
            case 3:
                tmp_max = sumtop3(fp_PulsePot, tmp_pot, di, fft_len4, tmp0, tmp1);
                break;
            case 4:
                tmp2 = (int)((mper * 3 + 6)/12); // round(period*3)
                tmp_max = sumtop4(fp_PulsePot, tmp_pot, di, fft_len4, tmp0, tmp1, tmp2);
                break;
            case 5:
                tmp2 = (int)((mper * 3 + 6)/12); // round(period*3)
                tmp3 = (int)((mper * 4 + 6)/12); // round(period*4)
                tmp_max = sumtop5(fp_PulsePot, tmp_pot, di, fft_len4, tmp0, tmp1, tmp2, tmp3);
                break;
            }
            if (tmp_max.x>dis_thresh.x || tmp_max.y>dis_thresh.y || tmp_max.z>dis_thresh.z || tmp_max.w>dis_thresh.w)
            {
                // unscale for reporting
                tmp_max /= num_adds;
                cur_thresh = (dis_thresh / num_adds - avg) * rcfg_dis_thresh + avg;
                float4 _snr = (tmp_max-avg)/(cur_thresh-avg);
                if (_snr.x > best_pulse_score || _snr.y > best_pulse_score || _snr.z > best_pulse_score || _snr.w > best_pulse_score )
                {
                    result_flag[0].x = 1;//return;
                }
                if( (tmp_max.x>cur_thresh.x) || (tmp_max.y>cur_thresh.y) || (tmp_max.z>cur_thresh.z) || (tmp_max.w>cur_thresh.w) )
                {
                    result_flag[0].x = 1;//return;
                }
            }
            int num_adds_2 = 2* num_adds;
            // int j = 1;
            float4 tmp_max2=(float4)0.f;
            /* Further fold levels: halve the period each iteration, folding
             * tmp_pot onto itself with sumtop2. */
            for (int j = 1; j < ndivs ; j++)
            {
                //perdiv *=2;
                tmp0 = di & 1;
                di >>= 1;
                tmp0 += di;
                dis_thresh = t_funct(di, num_adds, j,PulseMax,t_funct_cache_fp) * avg;
#if 0
                /* Disabled experiment: sumtop2_2 does two fold levels per call.
                 * Enabling it caused the huge kernel-size increase discussed
                 * in this thread. */
                if( (j+1)<ndivs && tmp_max2.x==0.f){//R: use advanced sumtop then
                    tmp_max= sumtop2_2(tmp_pot, tmp_pot, di, fft_len4, tmp0,&tmp_max2);
                }
                if(tmp_max2.x>0.f){
                    tmp_max=tmp_max2;
                    tmp_max2=(float4)0.f;
                }else
#endif
                {
                    tmp_max = sumtop2(tmp_pot, tmp_pot, di, fft_len4, tmp0);
                }
                if (tmp_max.x>dis_thresh.x || tmp_max.y>dis_thresh.y || tmp_max.z>dis_thresh.z || tmp_max.w>dis_thresh.w)
                {
                    // unscale for reporting
                    tmp_max /= num_adds_2;
                    cur_thresh = (dis_thresh / num_adds_2 - avg) * rcfg_dis_thresh + avg;
                    float4 _snr = (tmp_max-avg)/(cur_thresh-avg);
                    if (_snr.x > best_pulse_score || _snr.y > best_pulse_score || _snr.z > best_pulse_score || _snr.w > best_pulse_score)
                    {
                        /* NOTE(review): this branch really returns, unlike the
                         * commented-out //return; in the sibling checks — verify
                         * the asymmetry is intentional. */
                        result_flag[0].x = 1;return;
                    }
                    if ((tmp_max.x>cur_thresh.x) || (tmp_max.y>cur_thresh.y) || (tmp_max.z>cur_thresh.z) || (tmp_max.w>cur_thresh.w) )
                    {
                        result_flag[0].x = 1;//return;
                    }
                }
                num_adds_2 *=2;
            } // for (j = 1; j < ndivs
        } // for (p = lastP
    } // for(num_adds =
}

/* num_adds==3 specialisation. Runs first of the three passes: besides the
 * pulse search it computes the PoT average and caches it in PulsePoT_average
 * for the pulse4/pulse5 kernels. Constants folded: mper = p*6, perdiv = 2,
 * first fold divisor 3, num_adds_2 starts at 6. */
__kernel void PC_find_pulse3_kernel_cl(float best_pulse_score, int PulsePotLen, int AdvanceBy,
    int fft_len4, int ndivs, __global PulseFind_t* settings,__global float* t_funct_cache_fp,
    __global float4* PoT,__global float4* tmp_PoT,__global uint4* result_flag,int offset,
    __global float4* PulsePoT_average)
{
    if(result_flag[0].x == 1) return;//R: if CPU processing already needed - abort kernel ASAP
    const int PoTLen = 256*1024/fft_len4;//R: fft_len4 == fft_len>>2
    int ul_PoT = get_global_id(0)+offset;
    int y = get_global_id(1);
    int TOffset1 = y * AdvanceBy;
    int TOffset2 = y * AdvanceBy;
    float rcfg_dis_thresh=settings->rcfg_dis_thresh;
    int PulseMax=settings->PulseMax;
    if(TOffset1 + PulsePotLen > PoTLen)
    {
        TOffset1 = PoTLen - PulsePotLen;
    }
    __global float4* fp_PulsePot = PoT + ul_PoT + TOffset1 * (fft_len4);
    __global float4* tmp_pot = tmp_PoT + ul_PoT + TOffset2 * (fft_len4);
    //float* best_pot = cudaAcc_PulseFind_settings.best_pot_fp + ul_PoT + TOffset2 * fft_len;
    //float* report_pot = cudaAcc_PulseFind_settings.report_pot_fp + ul_PoT + TOffset2 * fft_len;
    int di;
    float4 avg=0;//,maxp=0;//, snr=0, fthresh=0;
    float4 tmp_max;
    // Calculate average power
    for (int i = 0; i < PulsePotLen; i++)
    {
        avg += fp_PulsePot[i * (fft_len4)];
    }
    avg /= PulsePotLen;
    PulsePoT_average[ul_PoT+y*fft_len4]=avg;//R: pulse3 called first, it computes average and stores it for pulse4 and pulse5 passes
    // Periods from PulsePotLen/3 to PulsePotLen/4, and power of 2 fractions of.
    // then (len/4 to len/5) and finally (len/5 to len/6)
    //
    //int num_adds = 3;
    //for(int num_adds = 3; num_adds <= 5; num_adds++)
    {
        int firstP, lastP;
        lastP = (PulsePotLen*2)/3; firstP = (PulsePotLen*1)/2;
        //int num_adds_minus1 = num_adds - 1;
        int p = lastP;
        for (p = lastP ; p > firstP ; p--)
        {
            float4 cur_thresh, dis_thresh;
            int /*tabofst, */mper, perdiv;
            int tmp0, tmp1;//, tmp2, tmp3;
            //tabofst = ndivs*3+2-num_adds;
            mper = p * (6);
            perdiv = 2;
            tmp0 = (int)((mper + 6)/12); // round(period)
            tmp1 = (int)((mper * 2 + 6)/12); // round(period*2)
            di = (int)p/perdiv; // (int)period
            //dis_thresh = cudaAcc_t_funct(di, num_adds)*avg;
            dis_thresh = t_funct(di, 3, 0, PulseMax, t_funct_cache_fp) * avg;
            tmp_max = sumtop3(fp_PulsePot, tmp_pot, di, fft_len4, tmp0, tmp1);
            if (tmp_max.x>dis_thresh.x || tmp_max.y>dis_thresh.y || tmp_max.z>dis_thresh.z || tmp_max.w>dis_thresh.w)
            {
                // unscale for reporting
                /* NOTE(review): integer literal 3 here vs 4.f/5.f in the pulse4/
                 * pulse5 kernels — same value after implicit conversion, but the
                 * inconsistency is worth confirming. */
                tmp_max /= 3;
                cur_thresh = (dis_thresh / 3 - avg) * rcfg_dis_thresh + avg;
                float4 _snr = (tmp_max-avg)/(cur_thresh-avg);
                if (_snr.x > best_pulse_score || _snr.y > best_pulse_score || _snr.z > best_pulse_score || _snr.w > best_pulse_score )
                {
                    result_flag[0].x = 1;//return;
                }
                if( (tmp_max.x>cur_thresh.x) || (tmp_max.y>cur_thresh.y) || (tmp_max.z>cur_thresh.z) || (tmp_max.w>cur_thresh.w) )
                {
                    result_flag[0].x = 1;//return;
                }
            }
            int num_adds_2 = 6;
            // int j = 1;
            float4 tmp_max2=(float4)0.f;
            for (int j = 1; j < ndivs ; j++)
            {
                //perdiv *=2;
                tmp0 = di & 1;
                di >>= 1;
                tmp0 += di;
                dis_thresh = t_funct(di, 3, j,PulseMax,t_funct_cache_fp) * avg;
#if 0
                if( (j+1)<ndivs && tmp_max2.x==0.f){//R: use advanced sumtop then
                    tmp_max= sumtop2_2(tmp_pot, tmp_pot, di, fft_len4, tmp0,&tmp_max2);
                }
                if(tmp_max2.x>0.f){
                    tmp_max=tmp_max2;
                    tmp_max2=(float4)0.f;
                }else
#endif
                {
                    tmp_max = sumtop2(tmp_pot, tmp_pot, di, fft_len4, tmp0);
                }
                if (tmp_max.x>dis_thresh.x || tmp_max.y>dis_thresh.y || tmp_max.z>dis_thresh.z || tmp_max.w>dis_thresh.w)
                {
                    // unscale for reporting
                    tmp_max /= num_adds_2;
                    cur_thresh = (dis_thresh / num_adds_2 - avg) * rcfg_dis_thresh + avg;
                    float4 _snr = (tmp_max-avg)/(cur_thresh-avg);
                    if (_snr.x > best_pulse_score || _snr.y > best_pulse_score || _snr.z > best_pulse_score || _snr.w > best_pulse_score)
                    {
                        result_flag[0].x = 1;return;
                    }
                    if ((tmp_max.x>cur_thresh.x) || (tmp_max.y>cur_thresh.y) || (tmp_max.z>cur_thresh.z) || (tmp_max.w>cur_thresh.w) )
                    {
                        result_flag[0].x = 1;//return;
                    }
                }
                num_adds_2 *=2;
            } // for (j = 1; j < ndivs
        } // for (p = lastP
    } // for(num_adds =
}

/* num_adds==4 specialisation. Reads the average cached by pulse3.
 * Constants folded: mper = p*4, perdiv = 3, first fold divisor 4.f,
 * num_adds_2 starts at 8. */
__kernel void PC_find_pulse4_kernel_cl(float best_pulse_score, int PulsePotLen, int AdvanceBy,
    int fft_len4, int ndivs, __global PulseFind_t* settings,__global float* t_funct_cache_fp,
    __global float4* PoT,__global float4* tmp_PoT,__global uint4* result_flag,int offset,
    __global float4* PulsePoT_average)
{
    if(result_flag[0].x == 1) return;//R: if CPU processing already needed - abort kernel ASAP
    const int PoTLen = 256*1024/fft_len4;
    int ul_PoT = get_global_id(0)+offset;
    int y = get_global_id(1);
    int TOffset1 = y * AdvanceBy;
    int TOffset2 = y * AdvanceBy;
    float rcfg_dis_thresh=settings->rcfg_dis_thresh;
    int PulseMax=settings->PulseMax;
    //if (ul_PoT < 1) return; // Original find_pulse, omits first element
    //if (ul_PoT >= fft_len) return;
    if(TOffset1 + PulsePotLen > PoTLen)
    {
        TOffset1 = PoTLen - PulsePotLen;
    }
    __global float4* fp_PulsePot = PoT + ul_PoT + TOffset1 * (fft_len4);
    __global float4* tmp_pot = tmp_PoT + ul_PoT + TOffset2 * (fft_len4);
    //float* best_pot = cudaAcc_PulseFind_settings.best_pot_fp + ul_PoT + TOffset2 * fft_len;
    //float* report_pot = cudaAcc_PulseFind_settings.report_pot_fp + ul_PoT + TOffset2 * fft_len;
    int di;
    float4 avg=0;//,maxp=0;//, snr=0, fthresh=0;
    float4 tmp_max;
    avg=PulsePoT_average[ul_PoT+y*fft_len4];//R: Value computed in pulse3
    // Periods from PulsePotLen/3 to PulsePotLen/4, and power of 2 fractions of.
    // then (len/4 to len/5) and finally (len/5 to len/6)
    //
    //int num_adds = 4;
    //for(int num_adds = 3; num_adds <= 5; num_adds++)
    {
        int firstP, lastP;
        lastP = (PulsePotLen*3)/4; firstP = (PulsePotLen*3)/5;
        //int num_adds_minus1 = num_adds - 1;
        int p = lastP;
        for (p = lastP ; p > firstP ; p--)
        {
            float4 cur_thresh, dis_thresh;
            int /*tabofst, */mper, perdiv;
            int tmp0, tmp1, tmp2;//, tmp3;
            //tabofst = ndivs*3+2-num_adds;
            mper = p * (4);
            perdiv = 3;
            tmp0 = (int)((mper + 6)/12); // round(period)
            tmp1 = (int)((mper * 2 + 6)/12); // round(period*2)
            di = (int)p/perdiv; // (int)period
            //dis_thresh = cudaAcc_t_funct(di, num_adds)*avg;
            dis_thresh = t_funct(di, 4, 0, PulseMax, t_funct_cache_fp) * avg;
            tmp2 = (int)((mper * 3 + 6)/12); // round(period*3)
            tmp_max = sumtop4(fp_PulsePot, tmp_pot, di, fft_len4, tmp0, tmp1, tmp2);
            if (tmp_max.x>dis_thresh.x || tmp_max.y>dis_thresh.y || tmp_max.z>dis_thresh.z || tmp_max.w>dis_thresh.w)
            {
                // unscale for reporting
                tmp_max /= 4.f;
                cur_thresh = (dis_thresh / 4.f - avg) * rcfg_dis_thresh + avg;
                float4 _snr = (tmp_max-avg)/(cur_thresh-avg);
                if (_snr.x > best_pulse_score || _snr.y > best_pulse_score || _snr.z > best_pulse_score || _snr.w > best_pulse_score )
                {
                    result_flag[0].x = 1;//return;
                }
                if( (tmp_max.x>cur_thresh.x) || (tmp_max.y>cur_thresh.y) || (tmp_max.z>cur_thresh.z) || (tmp_max.w>cur_thresh.w) )
                {
                    result_flag[0].x = 1;//return;
                }
            }
            int num_adds_2 = 8;
            // int j = 1;
            float4 tmp_max2=(float4)0.f;
            for (int j = 1; j < ndivs ; j++)
            {
                //perdiv *=2;
                tmp0 = di & 1;
                di >>= 1;
                tmp0 += di;
                dis_thresh = t_funct(di, 4, j,PulseMax,t_funct_cache_fp) * avg;
#if 0
                if( (j+1)<ndivs && tmp_max2.x==0.f){//R: use advanced sumtop then
                    tmp_max= sumtop2_2(tmp_pot, tmp_pot, di, fft_len4, tmp0,&tmp_max2);
                }
                if(tmp_max2.x>0.f){
                    tmp_max=tmp_max2;
                    tmp_max2=(float4)0.f;
                }else
#endif
                {
                    tmp_max = sumtop2(tmp_pot, tmp_pot, di, fft_len4, tmp0);
                }
                if (tmp_max.x>dis_thresh.x || tmp_max.y>dis_thresh.y || tmp_max.z>dis_thresh.z || tmp_max.w>dis_thresh.w)
                {
                    // unscale for reporting
                    tmp_max /= num_adds_2;
                    cur_thresh = (dis_thresh / num_adds_2 - avg) * rcfg_dis_thresh + avg;
                    float4 _snr = (tmp_max-avg)/(cur_thresh-avg);
                    if (_snr.x > best_pulse_score || _snr.y > best_pulse_score || _snr.z > best_pulse_score || _snr.w > best_pulse_score)
                    {
                        result_flag[0].x = 1;return;
                    }
                    if ((tmp_max.x>cur_thresh.x) || (tmp_max.y>cur_thresh.y) || (tmp_max.z>cur_thresh.z) || (tmp_max.w>cur_thresh.w) )
                    {
                        result_flag[0].x = 1;//return;
                    }
                }
                num_adds_2 *=2;
            } // for (j = 1; j < ndivs
        } // for (p = lastP
    } // for(num_adds =
}

/* num_adds==5 specialisation. Reads the average cached by pulse3.
 * Constants folded: mper = p*3, perdiv = 4, first fold divisor 5.f,
 * num_adds_2 starts at 10. */
__kernel void PC_find_pulse5_kernel_cl(float best_pulse_score, int PulsePotLen, int AdvanceBy,
    int fft_len4, int ndivs, __global PulseFind_t* settings,__global float* t_funct_cache_fp,
    __global float4* PoT,__global float4* tmp_PoT,__global uint4* result_flag,int offset,
    __global float4* PulsePoT_average)
{
    if(result_flag[0].x == 1) return;//R: if CPU processing already needed - abort kernel ASAP
    const int PoTLen = 256*1024/fft_len4;
    int ul_PoT = get_global_id(0)+offset;
    int y = get_global_id(1);
    int TOffset1 = y * AdvanceBy;
    int TOffset2 = y * AdvanceBy;
    float rcfg_dis_thresh=settings->rcfg_dis_thresh;
    int PulseMax=settings->PulseMax;
    //if (ul_PoT < 1) return; // Original find_pulse, omits first element
    //if (ul_PoT >= fft_len) return;
    if(TOffset1 + PulsePotLen > PoTLen)
    {
        TOffset1 = PoTLen - PulsePotLen;
    }
    __global float4* fp_PulsePot = PoT + ul_PoT + TOffset1 * (fft_len4);
    __global float4* tmp_pot = tmp_PoT + ul_PoT + TOffset2 * (fft_len4);
    //float* best_pot = cudaAcc_PulseFind_settings.best_pot_fp + ul_PoT + TOffset2 * fft_len;
    //float* report_pot = cudaAcc_PulseFind_settings.report_pot_fp + ul_PoT + TOffset2 * fft_len;
    int di;
    float4 avg=0;//,maxp=0;//, snr=0, fthresh=0;
    float4 tmp_max;
    avg=PulsePoT_average[ul_PoT+y*fft_len4];
    // Periods from PulsePotLen/3 to PulsePotLen/4, and power of 2 fractions of.
    // then (len/4 to len/5) and finally (len/5 to len/6)
    //
    //int num_adds = 5;
    //for(int num_adds = 3; num_adds <= 5; num_adds++)
    {
        int firstP, lastP;
        lastP = (PulsePotLen*4)/5; firstP = (PulsePotLen*4)/6;
        //int num_adds_minus1 = num_adds - 1;
        int p = lastP;
        for (p = lastP ; p > firstP ; p--)
        {
            float4 cur_thresh, dis_thresh;
            int /*tabofst, */mper, perdiv;
            int tmp0, tmp1, tmp2, tmp3;
            //tabofst = ndivs*3+2-num_adds;
            mper = p * (3);
            perdiv = 4;
            tmp0 = (int)((mper + 6)/12); // round(period)
            tmp1 = (int)((mper * 2 + 6)/12); // round(period*2)
            di = (int)p/perdiv; // (int)period
            //dis_thresh = cudaAcc_t_funct(di, num_adds)*avg;
            dis_thresh = t_funct(di, 5, 0, PulseMax, t_funct_cache_fp) * avg;
            tmp2 = (int)((mper * 3 + 6)/12); // round(period*3)
            tmp3 = (int)((mper * 4 + 6)/12); // round(period*4)
            tmp_max = sumtop5(fp_PulsePot, tmp_pot, di, fft_len4, tmp0, tmp1, tmp2, tmp3);
            if (tmp_max.x>dis_thresh.x || tmp_max.y>dis_thresh.y || tmp_max.z>dis_thresh.z || tmp_max.w>dis_thresh.w)
            {
                // unscale for reporting
                tmp_max /= 5.f;
                cur_thresh = (dis_thresh / 5.f - avg) * rcfg_dis_thresh + avg;
                float4 _snr = (tmp_max-avg)/(cur_thresh-avg);
                if (_snr.x > best_pulse_score || _snr.y > best_pulse_score || _snr.z > best_pulse_score || _snr.w > best_pulse_score )
                {
                    result_flag[0].x = 1;//return;
                }
                if( (tmp_max.x>cur_thresh.x) || (tmp_max.y>cur_thresh.y) || (tmp_max.z>cur_thresh.z) || (tmp_max.w>cur_thresh.w) )
                {
                    result_flag[0].x = 1;//return;
                }
            }
            int num_adds_2 = 10;
            // int j = 1;
            float4 tmp_max2=(float4)0.f;
            for (int j = 1; j < ndivs ; j++)
            {
                //perdiv *=2;
                tmp0 = di & 1;
                di >>= 1;
                tmp0 += di;
                dis_thresh = t_funct(di, 5, j,PulseMax,t_funct_cache_fp) * avg;
#if 0
                if( (j+1)<ndivs && tmp_max2.x==0.f){//R: use advanced sumtop then
                    tmp_max= sumtop2_2(tmp_pot, tmp_pot, di, fft_len4, tmp0,&tmp_max2);
                }
                if(tmp_max2.x>0.f){
                    tmp_max=tmp_max2;
                    tmp_max2=(float4)0.f;
                }else
#endif
                {
                    tmp_max = sumtop2(tmp_pot, tmp_pot, di, fft_len4, tmp0);
                }
                if (tmp_max.x>dis_thresh.x || tmp_max.y>dis_thresh.y || tmp_max.z>dis_thresh.z || tmp_max.w>dis_thresh.w)
                {
                    // unscale for reporting
                    tmp_max /= num_adds_2;
                    cur_thresh = (dis_thresh / num_adds_2 - avg) * rcfg_dis_thresh + avg;
                    float4 _snr = (tmp_max-avg)/(cur_thresh-avg);
                    if (_snr.x > best_pulse_score || _snr.y > best_pulse_score || _snr.z > best_pulse_score || _snr.w > best_pulse_score)
                    {
                        result_flag[0].x = 1;return;
                    }
                    if ((tmp_max.x>cur_thresh.x) || (tmp_max.y>cur_thresh.y) || (tmp_max.z>cur_thresh.z) || (tmp_max.w>cur_thresh.w) )
                    {
                        result_flag[0].x = 1;//return;
                    }
                }
                num_adds_2 *=2;
            } // for (j = 1; j < ndivs
        } // for (p = lastP
    } // for(num_adds =
}

                            • Unexpectedly huge kernel size increase
                              Raistmer
                              And when I return additional parameter to kernel (making impossible for compiler to know what number of loops will be performed) SKA shows almost x2 performance increase !
                              (updated kernel attached)

                              PC_find_pulse5_kernel_cl:
                              Name,Scratch Reg,GPR,Min,Max,Avg,Est Cycles,Est Cycles(Bi),ALU:Fetch(Bi),BottleNeck(Bi),%s\Clock(Bi),Throughput(Bi)
                              Radeon HD 4870,1,62,3.00,135.50,19.41,19.41,19.41,19.41,ALU Ops,0.82,618 M Threads\Sec

                              There is still unneeded scratch register usage, but number of threads per second increased from 396M to 618M !
                              (Though, it still worse than 645M/s for original complex kernel!)

                              Is there some way to restrict compiler from such mad unrolling ?

/* Variant of the pulse5 kernel from the follow-up post: num_adds is taken
 * from the extra `pass` argument (and fft_len4 copied from fft_len4p) so the
 * compiler cannot see the trip counts at compile time — posted to show this
 * defeats the aggressive unrolling and nearly doubles SKA throughput.
 * NOTE(review): the p-loop still uses the num_adds==5 range, 5.f divisor and
 * num_adds_2 = 10, so presumably the host must pass pass==5. */
__kernel void PC_find_pulse5_kernel_cl(float best_pulse_score, int PulsePotLen, int AdvanceBy,
    int fft_len4p, int ndivs, __global PulseFind_t* settings,__global float* t_funct_cache_fp,
    __global float4* PoT,__global float4* tmp_PoT,__global uint4* result_flag,int offset,
    __global float4* PulsePoT_average,int pass)
{
    if(result_flag[0].x == 1) return;//R: if CPU processing already needed - abort kernel ASAP
    /* Copy through a local so the value is opaque to the optimiser. */
    int fft_len4=fft_len4p;
    const int PoTLen = 256*1024/fft_len4;
    int ul_PoT = get_global_id(0)+offset;
    int y = get_global_id(1);
    int TOffset1 = y * AdvanceBy;
    int TOffset2 = y * AdvanceBy;
    float rcfg_dis_thresh=settings->rcfg_dis_thresh;
    int PulseMax=settings->PulseMax;
    //if (ul_PoT < 1) return; // Original find_pulse, omits first element
    //if (ul_PoT >= fft_len) return;
    if(TOffset1 + PulsePotLen > PoTLen)
    {
        TOffset1 = PoTLen - PulsePotLen;
    }
    __global float4* fp_PulsePot = PoT + ul_PoT + TOffset1 * (fft_len4);
    __global float4* tmp_pot = tmp_PoT + ul_PoT + TOffset2 * (fft_len4);
    //float* best_pot = cudaAcc_PulseFind_settings.best_pot_fp + ul_PoT + TOffset2 * fft_len;
    //float* report_pot = cudaAcc_PulseFind_settings.report_pot_fp + ul_PoT + TOffset2 * fft_len;
    int di;
    float4 avg=0;//,maxp=0;//, snr=0, fthresh=0;
    float4 tmp_max;
    avg=PulsePoT_average[ul_PoT+y*fft_len4];
    // Periods from PulsePotLen/3 to PulsePotLen/4, and power of 2 fractions of.
    // then (len/4 to len/5) and finally (len/5 to len/6)
    //
    int num_adds = pass;
    //for(int num_adds = 3; num_adds <= 5; num_adds++)
    {
        int firstP, lastP;
        lastP = (PulsePotLen*4)/5; firstP = (PulsePotLen*4)/6;
        int num_adds_minus1 = num_adds - 1;
        int p = lastP;
        for (p = lastP ; p > firstP ; p--)
        {
            float4 cur_thresh, dis_thresh;
            int /*tabofst, */mper, perdiv;
            int tmp0, tmp1, tmp2, tmp3;
            //tabofst = ndivs*3+2-num_adds;
            mper = p * (3);
            perdiv = num_adds_minus1;
            tmp0 = (int)((mper + 6)/12); // round(period)
            tmp1 = (int)((mper * 2 + 6)/12); // round(period*2)
            di = (int)p/perdiv; // (int)period
            //dis_thresh = cudaAcc_t_funct(di, num_adds)*avg;
            dis_thresh = t_funct(di, num_adds, 0, PulseMax, t_funct_cache_fp) * avg;
            tmp2 = (int)((mper * 3 + 6)/12); // round(period*3)
            tmp3 = (int)((mper * 4 + 6)/12); // round(period*4)
            tmp_max = sumtop5(fp_PulsePot, tmp_pot, di, fft_len4, tmp0, tmp1, tmp2, tmp3);
            if (tmp_max.x>dis_thresh.x || tmp_max.y>dis_thresh.y || tmp_max.z>dis_thresh.z || tmp_max.w>dis_thresh.w)
            {
                // unscale for reporting
                tmp_max /= 5.f;
                cur_thresh = (dis_thresh / 5.f - avg) * rcfg_dis_thresh + avg;
                float4 _snr = (tmp_max-avg)/(cur_thresh-avg);
                if (_snr.x > best_pulse_score || _snr.y > best_pulse_score || _snr.z > best_pulse_score || _snr.w > best_pulse_score )
                {
                    result_flag[0].x = 1;//return;
                }
                if( (tmp_max.x>cur_thresh.x) || (tmp_max.y>cur_thresh.y) || (tmp_max.z>cur_thresh.z) || (tmp_max.w>cur_thresh.w) )
                {
                    result_flag[0].x = 1;//return;
                }
            }
            /* NOTE(review): the thread reports that replacing this literal with
             * num_adds<<1 changes SKA from 618M to 152M threads/sec. */
            int num_adds_2 = 10;
            // int j = 1;
            float4 tmp_max2=(float4)0.f;
            for (int j = 1; j < ndivs ; j++)
            {
                //perdiv *=2;
                tmp0 = di & 1;
                di >>= 1;
                tmp0 += di;
                dis_thresh = t_funct(di, num_adds, j,PulseMax,t_funct_cache_fp) * avg;
#if 0
                if( (j+1)<ndivs && tmp_max2.x==0.f){//R: use advanced sumtop then
                    tmp_max= sumtop2_2(tmp_pot, tmp_pot, di, fft_len4, tmp0,&tmp_max2);
                }
                if(tmp_max2.x>0.f){
                    tmp_max=tmp_max2;
                    tmp_max2=(float4)0.f;
                }else
#endif
                {
                    tmp_max = sumtop2(tmp_pot, tmp_pot, di, fft_len4, tmp0);
                }
                if (tmp_max.x>dis_thresh.x || tmp_max.y>dis_thresh.y || tmp_max.z>dis_thresh.z || tmp_max.w>dis_thresh.w)
                {
                    // unscale for reporting
                    tmp_max /= num_adds_2;
                    cur_thresh = (dis_thresh / num_adds_2 - avg) * rcfg_dis_thresh + avg;
                    float4 _snr = (tmp_max-avg)/(cur_thresh-avg);
                    if (_snr.x > best_pulse_score || _snr.y > best_pulse_score || _snr.z > best_pulse_score || _snr.w > best_pulse_score)
                    {
                        result_flag[0].x = 1;return;
                    }
                    if ((tmp_max.x>cur_thresh.x) || (tmp_max.y>cur_thresh.y) || (tmp_max.z>cur_thresh.z) || (tmp_max.w>cur_thresh.w) )
                    {
                        result_flag[0].x = 1;//return;
                    }
                }
                num_adds_2 *=2;
            } // for (j = 1; j < ndivs
        } // for (p = lastP
    } // for(num_adds =
}

                                • Unexpectedly huge kernel size increase
                                  Raistmer
                                  And one more "nice" thing:

                                  If in listed above kernel

                                  int num_adds_2 = 10;

                                  string used SKA reports:

                                  Name,Scratch Reg,GPR,Min,Max,Avg,Est Cycles,Est Cycles(Bi),ALU:Fetch(Bi),BottleNeck(Bi),%s\Clock(Bi),Throughput(Bi)
                                  Radeon HD 4870,1,62,3.00,135.50,19.41,19.41,19.41,19.41,ALU Ops,0.82,618 M Threads\Sec

                                  But if I replace it with:

                                  int num_adds_2 = num_adds<<1;

                                  SKA reports:

                                  Name,Scratch Reg,GPR,Min,Max,Avg,Est Cycles,Est Cycles(Bi),ALU:Fetch(Bi),BottleNeck(Bi),%s\Clock(Bi),Throughput(Bi)
                                  Radeon HD 4870,0,52,2.70,693.90,79.16,79.16,79.16,6.53,ALU Ops,0.20,152 M Threads\Sec

                                   That is, instead of 62 GPRs + 1 scratch register it now uses 52 GPRs and 0 scratch registers, but performance suddenly drops from 618M to 152M !

                                  Looks like performance of resulting assembly code is almost uncontrollable

                                   What guidelines can be applied?
                                    • Unexpectedly huge kernel size increase
                                      himanshu.gautam

                                      raistmer,

                                      i am sorry if i am getting it wrong.

                                      But AFAIK when #if 0 only sumtop2 is called, but when #if 1 both sumtop2_2 and sumtop2 are called.

                                       tmp_max2 is initialised to 0 before the loop, so at #if 1, whenever tmp_max2.x==0:

                                      sumtop2_2() is executed

                                       and the immediately following if becomes false, so sumtop2 in the corresponding else branch is also executed.

                                       As such there are so many branch conditionals that any wavefront which encounters this code will have to execute it many times, and repeatedly in a for loop. Diverging branch conditions slow down execution disastrously.

                                      // int j = 1; float4 tmp_max2=(float4)0.f; for (int j = 1; j < ndivs ; j++) { //perdiv *=2; tmp0 = di & 1; di >>= 1; tmp0 += di; dis_thresh = t_funct(di, num_adds, j,PulseMax,t_funct_cache_fp) * avg; #if 0 if( (j+1)<ndivs && tmp_max2.x==0.f){//R: use advanced sumtop then tmp_max= sumtop2_2(tmp_pot, tmp_pot, di, fft_len4, tmp0,&tmp_max2); } if(tmp_max2.x>0.f){ tmp_max=tmp_max2; tmp_max2=(float4)0.f; }else #endif { tmp_max = sumtop2(tmp_pot, tmp_pot, di, fft_len4, tmp0); } if (tmp_max.x>dis_thresh.x || tmp_max.y>dis_thresh.y || tmp_max.z>dis_thresh.z || tmp_max.w>dis_thresh.w) { // unscale for reporting tmp_max /= num_adds_2; cur_thresh = (dis_thresh / num_adds_2 - avg) * rcfg_dis_thresh + avg; float4 _snr = (tmp_max-avg)*(float4)sqrt((float)num_adds_2)/avg; float4 _thresh = (cur_thresh-avg)*(float4)sqrt((float)num_adds_2)/avg; if (_snr.x / _thresh.x > best_pulse_score || _snr.y / _thresh.y > best_pulse_score || _snr.z / _thresh.z > best_pulse_score || _snr.w / _thresh.w > best_pulse_score) { result_flag[0].x = 1;return; } if ((tmp_max.x>cur_thresh.x) || (tmp_max.y>cur_thresh.y) || (tmp_max.z>cur_thresh.z) || (tmp_max.w>cur_thresh.w) ) { result_flag[0].x = 1;return; } }

                                        • Unexpectedly huge kernel size increase
                                          Raistmer
                                          You got it right, both sumtop functions will be called (usually) per kernel call.
                                          But in branch divergence part - there should be complete coherency in branch in listed part.
                                          This part doesn't depend from thread ID.
                                           ndivs is a kernel parameter (that is, the same for all threads) and tmp_max2 will be zero once per 2 loop iterations, again for all threads in the kernel (and in the wavefront too).
                                          sumtop functions never can return zero (dataset organized that way).

                                           Slowdown can come from different memory access patterns in sumtop2_2 (perhaps), but I still don't quite understand why there is such a big kernel size increase.
                                          Even if sumtop function code takes almost all whole kernel code I would expect increase by ~1/3 size, but I see few times increase...
                                          Well, thanks for your comments, I will try to modify code in different way.

                                          #if 0 if( (j+1)<ndivs && tmp_max2.x==0.f){//R: use advanced sumtop then tmp_max= sumtop2_2(tmp_pot, tmp_pot, di, fft_len4, tmp0,&tmp_max2); } if(tmp_max2.x>0.f){ tmp_max=tmp_max2; tmp_max2=(float4)0.f; }else #endif