float4 sumtop2(__global float4 *tab, __global float4* dest, int di, int fft_len4, int tmp0) { //R: this function working with 4 different frequency bins at once as all other pulse finding functions do float4 sum, tmax; int i; tmax = 0.0f; __global float4 *one = tab; __global float4 *two = tab + tmp0 * fft_len4; for (i = 0; i < di; i++) { float4 i1 = one[i * (fft_len4)]; float4 i2 = two[i * (fft_len4)]; sum = i1 + i2; dest[i * (fft_len4)] = sum; tmax = max(tmax, sum); } return tmax; } float4 sumtop2_2(__global float4 *tab, __global float4* dest, int di, int fft_len4, int tmp0,float4* tmax2) { //R: this function working with 4 different frequency bins at once as all other pulse finding functions do //R: also it adds/searches 2 levels of fold (stride add) at once float4 sum, tmax,sum2; int i; tmax = (float4)0.0f; *tmax2=(float4)0.0f; __global float4 *one = tab; __global float4 *two = tab + tmp0 * fft_len4; int stop=di-1; for (i = 0; i < stop; i++,stop--) { float4 i1 = one[i * (fft_len4)]; float4 i2 = two[i * (fft_len4)]; float4 j1= one[stop * (fft_len4)]; float4 j2= two[stop * (fft_len4)]; sum = i1 + i2; tmax = max(tmax, sum); sum2=j1+j2; tmax = max(tmax, sum2); sum+=sum2; *tmax2=max(*tmax2,sum); dest[i * (fft_len4)] = sum;//R: second stride add stored } return tmax; } float4 sumtop3(__global float4 *tab, __global float4* dest, int di, int fft_len4, int tmp0, int tmp1) { float4 sum, tmax; int i; tmax = 0.0f; __global float4 *one = tab; __global float4 *two = tab + tmp0 * fft_len4; __global float4 *three = tab + tmp1 * (fft_len4); for (i = 0; i < di; i++) { float4 i1 = one[i * (fft_len4)]; float4 i2 = two[i * (fft_len4)]; float4 i3 = three[i * (fft_len4)]; sum = i1 + i2; sum += i3; dest[i * (fft_len4)] = sum; tmax = max(tmax, sum); } return tmax; } float4 sumtop4(__global float4 *tab, __global float4* dest, int di, int fft_len4, int tmp0, int tmp1, int tmp2) { float4 sum, tmax; int i; __global float4 *one = tab; __global float4 *two = tab + tmp0 * (fft_len4); __global float4 *three = tab + tmp1 * (fft_len4); __global float4 *four = tab + tmp2 * (fft_len4); tmax = 0.0f; for (i = 0; i < di; i++) { float4 i1 = one[i * fft_len4]; float4 i2 = two[i * fft_len4]; float4 i3 = three[i * fft_len4]; float4 i4 = four[i * fft_len4]; sum = i1 + i2; sum += i3; sum += i4; dest[i * fft_len4] = sum; tmax = max(tmax, sum); } return tmax; } float4 sumtop5(__global float4 *tab, __global float4* dest, int di, int fft_len4, int tmp0, int tmp1, int tmp2, int tmp3) { float4 sum, tmax; int i; __global float4 *one = tab; __global float4 *two = tab + tmp0 * (fft_len4); __global float4 *three = tab + tmp1 * (fft_len4); __global float4 *four = tab + tmp2 * (fft_len4); __global float4 *five = tab + tmp3 * (fft_len4); tmax = 0.0f; for (i = 0; i < di; i++) { float4 i1 = one[i * fft_len4]; float4 i2 = two[i * fft_len4]; float4 i3 = three[i * fft_len4]; float4 i4 = four[i * fft_len4]; float4 i5 = five[i * fft_len4]; sum = i1 + i2; sum += i3; sum += i4; sum += i5; dest[i * fft_len4] = sum; tmax = max(tmax, sum); } return tmax; } typedef struct { //int NumDataPoints; // find_triplets //float *power_ft; //float4* results_ft; //result_flag* result_flags_ft; // find_pulse //float* PulsePot_fp; // Input data //float* PulsePot8_fp; // Input data moved 8 bytes forward for coleased reads //float* tmp_pot_fp; // Temporary array //float* best_pot_fp; // Copy folded pots with best score //float* report_pot_fp; // Copy folded pots for reporting //float4* results_fp; // Additional data for reporting //result_find_pulse_flag* result_flags_fp; //const float* t_funct_cache_fp; // cached results of cudaAcc_t_funct float rcfg_dis_thresh; int PulseMax; } PulseFind_t; #define FOLDS_COUNT 4 #define FOLDS_START 2 float4 t_funct(int di, int num_adds, int j, int PulseMax, __global float* t_funct_cache) { return (float4)t_funct_cache[j * PulseMax * FOLDS_COUNT + (num_adds - FOLDS_START) * PulseMax + di]; } __kernel void PC_find_pulse_kernel_cl(float best_pulse_score, int PulsePotLen, int AdvanceBy, int fft_len, int ndivs,int pass, __global PulseFind_t* settings,__global float* t_funct_cache_fp, __global float4* PoT,__global float4* tmp_PoT,__global uint4* result_flag,int offset) { if(result_flag[0].x == 1) return;//R: if CPU processing already needed - abort kernel ASAP const int PoTLen = 1024*1024/fft_len; int ul_PoT = get_global_id(0)+offset; int y = get_global_id(1); int TOffset1 = y * AdvanceBy; int TOffset2 = y * AdvanceBy; float rcfg_dis_thresh=settings->rcfg_dis_thresh; int PulseMax=settings->PulseMax; int fft_len4=fft_len>>2; //if (ul_PoT < 1) return; // Original find_pulse, omits first element //if (ul_PoT >= fft_len) return; if(TOffset1 + PulsePotLen > PoTLen) { TOffset1 = PoTLen - PulsePotLen; } __global float4* fp_PulsePot = PoT + ul_PoT + TOffset1 * (fft_len4); __global float4* tmp_pot = tmp_PoT + ul_PoT + TOffset2 * (fft_len4); //float* best_pot = cudaAcc_PulseFind_settings.best_pot_fp + ul_PoT + TOffset2 * fft_len; //float* report_pot = cudaAcc_PulseFind_settings.report_pot_fp + ul_PoT + TOffset2 * fft_len; int di; float4 avg=0;//,maxp=0;//, snr=0, fthresh=0; float4 tmp_max; #if !USE_AVG_CACHE int i; // Calculate average power for (i = 0; i < PulsePotLen; i++) { avg += fp_PulsePot[i * (fft_len4)]; } avg /= PulsePotLen; #else avg=PulsePoT_average[ul_PoT+y*fft_len4]; #endif // Periods from PulsePotLen/3 to PulsePotLen/4, and power of 2 fractions of. // then (len/4 to len/5) and finally (len/5 to len/6) // int num_adds = pass; //for(int num_adds = 3; num_adds <= 5; num_adds++) { int firstP, lastP; switch(num_adds) { case 3: lastP = (PulsePotLen*2)/3; firstP = (PulsePotLen*1)/2; break; case 4: lastP = (PulsePotLen*3)/4; firstP = (PulsePotLen*3)/5; break; case 5: lastP = (PulsePotLen*4)/5; firstP = (PulsePotLen*4)/6; break; } int num_adds_minus1 = num_adds - 1; int p = lastP; for (p = lastP ; p > firstP ; p--) { float4 cur_thresh, dis_thresh; int /*tabofst, */mper, perdiv; int tmp0, tmp1, tmp2, tmp3; //tabofst = ndivs*3+2-num_adds; mper = p * (12/num_adds_minus1); perdiv = num_adds_minus1; tmp0 = (int)((mper + 6)/12); // round(period) tmp1 = (int)((mper * 2 + 6)/12); // round(period*2) di = (int)p/perdiv; // (int)period //dis_thresh = cudaAcc_t_funct(di, num_adds)*avg; dis_thresh = t_funct(di, num_adds, 0, PulseMax, t_funct_cache_fp) * avg; switch(num_adds) { case 3: tmp_max = sumtop3(fp_PulsePot, tmp_pot, di, fft_len4, tmp0, tmp1); break; case 4: tmp2 = (int)((mper * 3 + 6)/12); // round(period*3) tmp_max = sumtop4(fp_PulsePot, tmp_pot, di, fft_len4, tmp0, tmp1, tmp2); break; case 5: tmp2 = (int)((mper * 3 + 6)/12); // round(period*3) tmp3 = (int)((mper * 4 + 6)/12); // round(period*4) tmp_max = sumtop5(fp_PulsePot, tmp_pot, di, fft_len4, tmp0, tmp1, tmp2, tmp3); break; } if (tmp_max.x>dis_thresh.x || tmp_max.y>dis_thresh.y || tmp_max.z>dis_thresh.z || tmp_max.w>dis_thresh.w) { // unscale for reporting tmp_max /= num_adds; cur_thresh = (dis_thresh / num_adds - avg) * rcfg_dis_thresh + avg; float4 _snr = (tmp_max-avg)*(float4)sqrt((float)num_adds)/avg; float4 _thresh = (cur_thresh-avg)*(float4)sqrt((float)num_adds)/avg; if (_snr.x / _thresh.x > best_pulse_score || _snr.y / _thresh.y > best_pulse_score || _snr.z / _thresh.z > best_pulse_score || _snr.w / _thresh.w > best_pulse_score ) { result_flag[0].x = 1;return; } if( (tmp_max.x>cur_thresh.x) || (tmp_max.y>cur_thresh.y) || (tmp_max.z>cur_thresh.z) || (tmp_max.w>cur_thresh.w) ) { result_flag[0].x = 1;return; } } int num_adds_2 = 2* num_adds; // int j = 1; float4 tmp_max2=(float4)0.f; for (int j = 1; j < ndivs ; j++) { //perdiv *=2; tmp0 = di & 1; di >>= 1; tmp0 += di; dis_thresh = t_funct(di, num_adds, j,PulseMax,t_funct_cache_fp) * avg; #if 0 if( (j+1)<ndivs && tmp_max2.x==0.f){//R: use advanced sumtop then tmp_max= sumtop2_2(tmp_pot, tmp_pot, di, fft_len4, tmp0,&tmp_max2); } if(tmp_max2.x>0.f){ tmp_max=tmp_max2; tmp_max2=(float4)0.f; }else #endif { tmp_max = sumtop2(tmp_pot, tmp_pot, di, fft_len4, tmp0); } if (tmp_max.x>dis_thresh.x || tmp_max.y>dis_thresh.y || tmp_max.z>dis_thresh.z || tmp_max.w>dis_thresh.w) { // unscale for reporting tmp_max /= num_adds_2; cur_thresh = (dis_thresh / num_adds_2 - avg) * rcfg_dis_thresh + avg; float4 _snr = (tmp_max-avg)*(float4)sqrt((float)num_adds_2)/avg; float4 _thresh = (cur_thresh-avg)*(float4)sqrt((float)num_adds_2)/avg; if (_snr.x / _thresh.x > best_pulse_score || _snr.y / _thresh.y > best_pulse_score || _snr.z / _thresh.z > best_pulse_score || _snr.w / _thresh.w > best_pulse_score) { result_flag[0].x = 1;return; } if ((tmp_max.x>cur_thresh.x) || (tmp_max.y>cur_thresh.y) || (tmp_max.z>cur_thresh.z) || (tmp_max.w>cur_thresh.w) ) { result_flag[0].x = 1;return; } } num_adds_2 *=2; } // for (j = 1; j < ndivs } // for (p = lastP } // for(num_adds = } This kernel assembly for #if 0 case: ; -------- Disassembly -------------------- 00 ALU: ADDR(1792) CNT(58) KCACHE0(CB0:0-15) KCACHE1(CB1:0-15) 0 y: AND_INT T3.y, R0.x, (0x000000FF, 3.573311084e-43f).x z: MOV R11.z, 0.0f w: MOV R11.w, 0.0f t: MOV R22.y, 0.0f 1 y: LSHR T2.y, R0.x, (0x00000008, 1.121038771e-44f).x t: RCP_UINT T0.w, KC0[2].x 2 x: MOV R6.x, KC1[1].x y: MOV R0.y, KC1[1].y z: MOV R0.z, KC1[1].z t: MULLO_UINT T0.z, KC0[2].x, PS1 3 x: SUB_INT R0.x, 0.0f, PS2 w: MOV R2.w, KC1[1].w t: MULHI_UINT T0.y, KC0[2].x, T0.w 4 w: CNDE_INT R0.w, PS3, PV3.x, T0.z t: RCP_UINT T1.z, KC0[2].y 5 t: MULHI_UINT ____, PV4.w, T0.w 6 x: ADD_INT ____, T0.w, PS5 y: SUB_INT ____, T0.w, PS5 t: MULLO_UINT T1.y, KC0[2].y, T1.z 7 x: SUB_INT ____, 0.0f, PS6 w: CNDE_INT ____, T0.y, PV6.x, PV6.y t: MULHI_UINT T0.x, KC0[2].y, T1.z 8 y: MOV R21.y, 1 z: CNDE_INT T0.z, PS7, PV7.x, T1.y t: MULHI_UINT T2.z, PV7.w, T2.y 9 y: ADD_INT T1.y, -1, PS8 z: MOV R10.z, 1 w: ADD_INT T0.w, PS8, 1 t: MULLO_UINT ____, PS8, KC0[2].x 10 x: SUB_INT T1.x, T2.y, PS9 w: SETGE_UINT T1.w, T2.y, PS9 t: MULHI_UINT ____, T0.z, T1.z 11 x: SUB_INT ____, T1.z, PS10 y: SETGE_UINT ____, PV10.x, KC0[2].x z: SUB_INT T1.z, PV10.x, KC0[2].x w: ADD_INT ____, T1.z, PS10 t: MOV R10.w, 1 12 x: AND_INT ____, T1.w, PV11.y z: CNDE_INT T0.z, T0.x, PV11.w, PV11.x 13 z: CNDE_INT ____, PV12.x, T2.z, T0.w w: CNDE_INT T0.w, PV12.x, T1.x, T1.z 14 x: CNDE_INT ____, T1.w, T1.y, PV13.z z: ADD_INT ____, KC0[2].x, PV13.w 15 y: CNDE_INT ____, T1.w, PV14.z, T0.w w: CNDE_INT T0.w, KC0[2].x, -1, PV14.x 16 x: CNDE_INT T1.x, KC0[2].x, -1, PV15.y t: MULHI_UINT ____, T0.z, PV15.w 17 t: MULLO_UINT ____, PS16, KC0[2].y 18 z: SETGE_UINT R0.z, T0.w, PS17 w: SUB_INT R0.w, T0.w, PS17 t: MULLO_INT ____, T1.x, KC0[1].x 19 x: SETGE_UINT ____, PV18.w, KC0[2].y y: SUB_INT R0.y, PV18.w, KC0[2].y z: ADD_INT ____, T3.y, PS18 20 x: ADD_INT R0.x, PV19.z, KC0[6].x w: AND_INT R1.w, R0.z, PV19.x 01 MEM_SCRATCH_WRITE_ACK: VEC_PTR[60].x___, R0, ARRAY_SIZE(66) ELEM_SIZE(3) 02 ALU: ADDR(1850) CNT(12) KCACHE0(CB0:0-15) KCACHE1(CB1:0-15) 21 w: MOV R1.w, R1.w 22 z: CNDE_INT T1.z, PV21.w, R0.w, R0.y 23 z: MOV T1.z, PV22.z 24 y: ADD_INT ____, KC0[2].y, PV23.z 25 w: CNDE_INT ____, R0.z, PV24.y, T1.z 26 z: CNDE_INT ____, KC0[2].y, -1, PV25.w 27 z: MOV ____, PV26.z 28 t: MULLO_INT ____, PV27.z, KC0[1].y 29 w: MOV ____, PS28 30 y: ADD_INT R5.y, PV29.w, KC0[6].y 31 x: LSHR R0.x, KC1[10].x, (0x00000002, 2.802596929e-45f).x 03 WAIT_ACK: Outstanding_acks <= 0 04 TEX: ADDR(9888) CNT(1) 32 RD_SCATTER R0.x___, DWORD_PTR[0+R0.x], ELEM_SIZE(3) UNCACHED FORMAT(32_FLOAT) 05 ALU_PUSH_BEFORE: ADDR(1862) CNT(2) 33 x: SETE_INT R0.x, R0.x, 1 34 x: PREDE_INT ____, R0.x, 0.0f UPDATE_EXEC_MASK UPDATE_PRED 06 JUMP POP_CNT(1) ADDR(1763) 07 ALU: ADDR(1864) CNT(4) KCACHE0(CB1:0-15) 35 x: SETGT_INT R2.x, 0.0f, KC0[3].x y: ADD_INT R0.y, KC0[6].x, (0x00000004, 5.605193857e-45f).x t: MULLO_INT R0.x, R5.y, KC0[2].x 08 MEM_SCRATCH_WRITE_ACK: VEC_PTR[61].x___, R0, ARRAY_SIZE(66) ELEM_SIZE(3) 09 ALU: ADDR(1868) CNT(35) KCACHE0(CB1:0-15) 36 x: MOV R0.x, R0.x t: MOV R2.x, R2.x 37 x: SETGT_INT R3.x, KC0[1].x, 0.0f y: ADD_INT T1.y, KC0[1].x, PV36.x w: ADD_INT ____, KC0[3].x, PS36 38 x: LSHR R0.x, R0.y, (0x00000002, 2.802596929e-45f).x z: XOR_INT T0.z, R2.x, PV37.w t: LSHR R1.x, KC0[6].x, (0x00000002, 2.802596929e-45f).x 39 t: RCP_UINT T0.y, PV38.z 40 t: MULLO_UINT T0.x, T0.z, PS39 41 z: SUB_INT ____, 0.0f, PS40 t: MULHI_UINT T0.w, T0.z, T0.y 42 y: CNDE_INT ____, PS41, PV41.z, T0.x 43 t: MULHI_UINT ____, PV42.y, T0.y 44 z: ADD_INT ____, T0.y, PS43 w: SUB_INT ____, T0.y, PS43 45 y: CNDE_INT ____, T0.w, PV44.z, PV44.w 46 t: MULHI_UINT T0.x, PV45.y, (0x00100000, 1.469367939e-39f).x 47 x: ADD_INT T1.x, PS46, 1 w: ADD_INT T0.w, -1, PS46 t: MULLO_UINT ____, PS46, T0.z 48 y: SETGE_UINT T0.y, (0x00100000, 1.469367939e-39f).x, PS47 z: SUB_INT ____, (0x00100000, 1.469367939e-39f).x, PS47 49 w: SETGE_UINT ____, PV48.z, T0.z 50 z: AND_INT ____, T0.y, PV49.w 51 y: CNDE_INT ____, PV50.z, T0.x, T1.x 52 z: CNDE_INT ____, T0.y, T0.w, PV51.y 53 y: CNDE_INT ____, T0.z, -1, PV52.z 54 w: ADD_INT ____, R2.x, PV53.y 55 z: XOR_INT ____, R2.x, PV54.w 56 z: SUB_INT R0.z, PV55.z, KC0[1].x w: SETGT_INT R0.w, T1.y, PV55.z 10 WAIT_ACK: Outstanding_acks <= 0 11 TEX: ADDR(9890) CNT(1) 57 RD_SCRATCH R2.x___, VEC_PTR[61], ARRAY_SIZE(66) ELEM_SIZE(3) UNCACHED 12 ALU: ADDR(1903) CNT(3) 58 z: MOV R0.z, R0.z w: MOV R0.w, R0.w 59 x: CNDE_INT R2.x, PV58.w, R2.x, PV58.z 13 MEM_SCRATCH_WRITE_ACK: VEC_PTR[59].x___, R2, ARRAY_SIZE(66) ELEM_SIZE(3) 14 WAIT_ACK: Outstanding_acks <= 0 15 TEX: ADDR(9892) CNT(2) 60 RD_SCATTER R7.x___, DWORD_PTR[0+R0.x], ELEM_SIZE(3) UNCACHED FORMAT(32_FLOAT) 61 RD_SCATTER R0.x___, DWORD_PTR[0+R1.x], ELEM_SIZE(3) UNCACHED FORMAT(32_FLOAT) 16 MEM_SCRATCH_WRITE_ACK: VEC_PTR[43].x___, R0, ARRAY_SIZE(66) ELEM_SIZE(3) 17 ALU_PUSH_BEFORE: ADDR(1906) CNT(1) 62 x: PREDNE_INT ____, R3.x, 0.0f UPDATE_EXEC_MASK UPDATE_PRED 18 JUMP ADDR(30) 19 ALU: ADDR(1907) CNT(2) KCACHE0(CB1:0-15) 63 x: ASHR R2.x, KC0[3].x, (0x00000002, 2.802596929e-45f).x 20 WAIT_ACK: Outstanding_acks <= 0 21 TEX: ADDR(9896) CNT(1) 64 RD_SCRATCH R1.x___, VEC_PTR[60], ARRAY_SIZE(66) ELEM_SIZE(3) UNCACHED 22 TEX: ADDR(9898) CNT(1) 65 RD_SCRATCH R0.x___, VEC_PTR[59], ARRAY_SIZE(66) ELEM_SIZE(3) UNCACHED 23 ALU: ADDR(1909) CNT(18) KCACHE0(CB1:0-15) 66 x: LSHL R5.x, R2.x, (0x00000004, 5.605193857e-45f).x VEC_120 w: ADD_INT ____, R1.x, KC0[11].x t: MULLO_INT ____, R0.x, R2.x 67 x: MOV R2.x, 0.0f y: MOV R0.y, 0.0f z: ADD_INT ____, PS66, PV66.w w: MOV R0.w, 0.0f t: MOV R0.z, 0.0f 68 y: LSHL ____, PV67.z, (0x00000004, 5.605193857e-45f).x 69 x: ADD_INT R0.x, KC0[8].x, PV68.y 70 x: MOV R1.x, PV69.x 71 x: MOV R2.x, R2.x y: MOV R2.y, R0.y z: MOV R2.z, R0.z w: MOV R2.w, R0.w 72 x: MOV R0.x, R6.x 24 LOOP_DX10 i0 FAIL_JUMP_ADDR(30) 25 ALU: ADDR(1927) CNT(5) 73 x: ADD_INT R4.x, -1, R0.x t: LSHR R0.x, R1.x, (0x00000002, 2.802596929e-45f).x 74 x: SETNE_INT R3.x, PV73.x, 0.0f t: ADD_INT R1.x, R5.x, R1.x 26 TEX: ADDR(9900) CNT(1) 75 RD_SCATTER R0, DWORD_PTR[0+R0.x], ELEM_SIZE(3) UNCACHED 27 ALU_BREAK: ADDR(1932) CNT(5) 76 x: ADD R0.x, R2.x, R0.x y: ADD R4.y, R2.y, R0.y z: ADD R0.z, R2.z, R0.z w: ADD R0.w, R2.w, R0.w 77 x: PREDNE_INT ____, R3.x, 0.0f UPDATE_EXEC_MASK UPDATE_PRED 28 ALU: ADDR(1937) CNT(5) 78 x: MOV R2.x, R0.x y: MOV R2.y, R4.y z: MOV R2.z, R0.z w: MOV R2.w, R0.w 79 x: MOV R0.x, R4.x 29 ENDLOOP i0 PASS_JUMP_ADDR(25) 30 ELSE POP_CNT(1) ADDR(32) 31 ALU_POP_AFTER: ADDR(1942) CNT(5) 80 x: MOV R0.x, 0.0f 81 x: MOV R0.x, PV80.x y: MOV R4.y, R22.y z: MOV R0.z, R11.z w: MOV R0.w, R11.w 32 ALU: ADDR(1947) CNT(88) KCACHE0(CB1:0-15) 82 x: AND_INT T3.x, R0.x, (0x80000000, -0.0f).x y: AND_INT T2.y, R0.x, (0x7F800000, 1.#INFf).y z: MOV T2.z, R0.w w: AND_INT T3.w, R0.x, (0x807FFFFF, -1.175494211e-38f).z t: I_TO_F R0.w, KC0[1].x 83 x: MOV T0.x, PS82 y: AND_INT T0.y, PV82.z, (0x7F800000, 1.#INFf).x z: AND_INT T0.z, PV82.z, (0x807FFFFF, -1.175494211e-38f).y w: AND_INT T0.w, PV82.z, (0x80000000, -0.0f).z t: MOV R0.y, PS82 84 x: AND_INT ____, PV83.x, (0x807FFFFF, -1.175494211e-38f).x y: SETE_INT T1.y, PV83.y, (0x7F800000, 1.#INFf).y z: SETE_INT T1.z, PV83.y, 0.0f w: AND_INT R1.w, PV83.x, (0x7F800000, 1.#INFf).y t: AND_INT T1.w, PV83.x, (0x80000000, -0.0f).z 85 x: SETE_INT ____, PV84.w, 0.0f y: OR_INT ____, PV84.x, (0x3F800000, 1.0f).x z: OR_INT ____, T0.z, (0x3F800000, 1.0f).x w: SETE_INT T2.w, PV84.w, (0x7F800000, 1.#INFf).y t: SUB_INT T1.x, T0.y, PV84.w 86 x: OR_INT ____, T1.y, PV85.w y: CNDE_INT ____, T1.z, PV85.z, T0.w z: OR_INT ____, T1.z, PV85.x w: CNDE_INT ____, PV85.x, PV85.y, T1.w VEC_021 t: AND_INT T2.x, R0.y, (0x7F800000, 1.#INFf).x 87 x: CNDE_INT ____, T2.w, PV86.w, T0.x y: OR_INT T3.y, PV86.x, PV86.z z: CNDE_INT T2.z, T1.y, PV86.y, T2.z w: SETE_INT T2.w, T2.y, (0x7F800000, 1.#INFf).x VEC_120 t: SETE_INT T1.y, PS86, (0x7F800000, 1.#INFf).x 88 x: SETE_INT ____, T2.y, 0.0f y: CNDE_INT T0.y, PV87.y, T1.x, 0.0f z: OR_INT ____, T3.w, (0x3F800000, 1.0f).x w: SETE_INT ____, T2.x, 0.0f t: RCP_e R4.w, PV87.x 89 x: CNDE_INT ____, PV88.x, PV88.z, T3.x y: ASHR T1.y, PV88.y, (0x00000017, 3.222986468e-44f).x z: MUL_e ____, T2.z, PS88 w: OR_INT ____, T1.y, T2.w t: OR_INT ____, PV88.w, PV88.x 90 x: OR_INT R1.x, PV89.w, PS89 y: AND_INT ____, (0x7FFFFFFF, 1.#QNANf).x, PV89.z z: AND_INT T2.z, (0x80000000, -0.0f).y, PV89.z w: CNDE_INT ____, T2.w, PV89.x, R0.x t: ADD_INT T3.x, T0.y, PV89.z 91 x: SUB_INT ____, T2.y, R1.w y: MUL_e T2.y, R4.w, PV90.w z: OR_INT T1.z, PV90.z, (0x7F800000, 1.#INFf).x w: ASHR ____, PV90.y, (0x00000017, 3.222986468e-44f).y t: MOV T2.w, R0.z 92 x: AND_INT ____, (0x7FFFFFFF, 1.#QNANf).x, PV91.y y: ADD_INT ____, PV91.w, T1.y z: CNDE_INT ____, R1.x, PV91.x, 0.0f w: AND_INT R2.w, (0x80000000, -0.0f).y, PV91.y t: MOV T0.z, R0.w 93 x: SETGE_INT ____, PV92.y, (0x000000FF, 3.573311084e-43f).x y: ASHR ____, PV92.z, (0x00000017, 3.222986468e-44f).y z: ASHR ____, PV92.x, (0x00000017, 3.222986468e-44f).y w: SETGE_INT ____, 0.0f, PV92.y t: ADD_INT R2.x, PV92.z, T2.y 94 x: ADD_INT ____, PV93.z, PV93.y y: OR_INT R1.y, R2.w, (0x7F800000, 1.#INFf).x z: CNDE_INT ____, PV93.w, T3.x, T2.z w: CNDE_INT ____, T3.y, PV93.x, 0.0f t: AND_INT R2.y, T2.w, (0x7F800000, 1.#INFf).x 95 x: CNDE_INT R0.x, PV94.w, PV94.z, T1.z y: SETGE_INT R0.y, PV94.x, (0x000000FF, 3.573311084e-43f).x z: SETGE_INT R2.z, 0.0f, PV94.x w: AND_INT R6.w, T0.z, (0x7F800000, 1.#INFf).y t: AND_INT R3.w, T2.w, (0x807FFFFF, -1.175494211e-38f).z 33 MEM_SCRATCH_WRITE: VEC_PTR[46].x___, R0, ARRAY_SIZE(66) ELEM_SIZE(3) 34 ALU: ADDR(2035) CNT(15) 96 y: MOV R0.y, R0.y z: MOV R2.z, R2.z w: MOV R6.w, R6.w 97 x: SETE_INT R2.x, R2.y, (0x7F800000, 1.#INFf).x y: CNDE_INT ____, PV96.z, R2.x, R2.w z: CNDE_INT ____, R1.x, PV96.y, 0.0f w: SETE_INT R2.w, PV96.w, (0x7F800000, 1.#INFf).x t: SETE_INT R1.x, R2.y, 0.0f 98 x: CNDE_INT R0.x, PV97.z, PV97.y, R1.y y: SETE_INT R0.y, R6.w, 0.0f z: AND_INT R2.z, R0.z, (0x80000000, -0.0f).x w: MOV R9.w, R0.x t: OR_INT R1.y, R3.w, (0x3F800000, 1.0f).y 35 MEM_SCRATCH_WRITE: VEC_PTR[54].x___, R0, ARRAY_SIZE(66) ELEM_SIZE(3) 36 ALU: ADDR(2050) CNT(58) 99 y: MOV R1.y, R1.y z: MOV R2.z, R2.z t: MOV R0.y, R0.y 100 x: CNDE_INT ____, R1.x, PV99.y, PV99.z y: OR_INT ____, PS99, R1.x z: OR_INT ____, R2.w, R2.x VEC_021 w: SUB_INT T3.w, R2.y, R1.w t: MOV ____, R4.y 101 x: OR_INT R3.x, PV100.z, PV100.y y: AND_INT R2.y, PS100, (0x7F800000, 1.#INFf).x z: MOV ____, R0.w w: CNDE_INT ____, R2.x, PV100.x, R0.z t: AND_INT R0.w, PS100, (0x807FFFFF, -1.175494211e-38f).y 102 x: AND_INT R1.x, PV101.z, (0x7F800000, 1.#INFf).x y: MUL_e R0.y, R4.w, PV101.w z: CNDE_INT R0.z, PV101.x, T3.w, 0.0f w: SETE_INT R2.w, PV101.y, (0x7F800000, 1.#INFf).x t: SETE_INT R1.y, PV101.y, 0.0f 103 x: MOV R1.x, R1.x y: MOV R0.y, R0.y z: MOV R0.z, R0.z 104 x: AND_INT R0.x, (0x7FFFFFFF, 1.#QNANf).x, PV103.y y: ASHR R0.y, PV103.z, (0x00000017, 3.222986468e-44f).y z: ADD_INT R0.z, PV103.z, PV103.y w: AND_INT R3.w, (0x80000000, -0.0f).z, PV103.y t: SETE_INT R2.x, PV103.x, (0x7F800000, 1.#INFf).w 105 x: AND_INT ____, R4.y, (0x80000000, -0.0f).x y: OR_INT T2.y, R3.w, (0x7F800000, 1.#INFf).y z: ASHR ____, R0.x, (0x00000017, 3.222986468e-44f).z w: SETE_INT ____, R1.x, 0.0f VEC_120 t: OR_INT ____, R0.w, (0x3F800000, 1.0f).w 106 x: OR_INT ____, PV105.w, R1.y y: CNDE_INT ____, R1.y, PS105, PV105.x VEC_120 z: OR_INT ____, R2.x, R2.w VEC_021 w: ADD_INT ____, PV105.z, R0.y VEC_102 t: SUB_INT T3.w, R2.y, R1.w 107 x: SETGE_INT ____, PV106.w, (0x000000FF, 3.573311084e-43f).x y: OR_INT R2.y, PV106.z, PV106.x z: SETGE_INT ____, 0.0f, PV106.w w: CNDE_INT ____, R2.w, PV106.y, R4.y 108 x: MUL_e ____, R4.w, PV107.w y: CNDE_INT ____, PV107.z, R0.z, R3.w z: CNDE_INT ____, PV107.y, T3.w, 0.0f w: CNDE_INT ____, R3.x, PV107.x, 0.0f 109 x: CNDE_INT R0.x, PV108.w, PV108.y, T2.y y: AND_INT R0.y, (0x7FFFFFFF, 1.#QNANf).x, PV108.x z: ASHR R0.z, PV108.z, (0x00000017, 3.222986468e-44f).y w: AND_INT R0.w, (0x80000000, -0.0f).z, PV108.x t: ADD_INT R1.y, PV108.z, PV108.x 37 MEM_SCRATCH_WRITE: VEC_PTR[45].x___, R0, ARRAY_SIZE(66) ELEM_SIZE(3) 38 ALU: ADDR(2108) CNT(14) 110 x: MOV R0.x, R0.x y: MOV R0.y, R0.y w: MOV R0.w, R0.w 111 x: ASHR ____, PV110.y, (0x00000017, 3.222986468e-44f).x y: OR_INT T1.y, PV110.w, (0x7F800000, 1.#INFf).y z: MOV R9.z, PV110.x 112 w: ADD_INT ____, PV111.x, R0.z 113 y: SETGE_INT ____, PV112.w, (0x000000FF, 3.573311084e-43f).x z: SETGE_INT ____, 0.0f, PV112.w 114 z: CNDE_INT ____, PV113.z, R1.y, R0.w w: CNDE_INT ____, R2.y, PV113.y, 0.0f 115 x: CNDE_INT R0.x, PV114.w, PV114.z, T1.y 39 MEM_SCRATCH_WRITE: VEC_PTR[44].x___, R0, ARRAY_SIZE(66) ELEM_SIZE(3) 40 ALU_PUSH_BEFORE: ADDR(2122) CNT(5) KCACHE0(CB1:0-15) 116 x: MOV R0.x, R0.x 117 y: MOV R20.y, PV116.x 118 x: SETE_INT R0.x, KC0[5].x, (0x00000003, 4.203895393e-45f).x 119 x: PREDNE_INT ____, R0.x, 0.0f UPDATE_EXEC_MASK UPDATE_PRED 41 JUMP ADDR(44) 42 ALU: ADDR(2127) CNT(14) KCACHE0(CB1:0-15) 120 z: LSHR ____, KC0[1].x, (0x0000001F, 4.344025239e-44f).x w: LSHL T0.w, KC0[1].x, 1 121 y: ADD_INT ____, KC0[1].x, PV120.z t: MULHI_INT ____, PV120.w, (0xAAAAAAAB, -3.031649096e-13f).x 122 x: ASHR R1.x, PV121.y, 1 y: ADD_INT ____, T0.w, PS121 123 x: LSHR ____, PV122.y, (0x0000001F, 4.344025239e-44f).x w: ASHR ____, PV122.y, 1 124 x: ADD_INT R0.x, PV123.x, PV123.w 125 x: MOV R4.x, PV124.x 126 x: MOV R0.x, R1.x 43 MEM_SCRATCH_WRITE_ACK: VEC_PTR[57].x___, R0, ARRAY_SIZE(66) ELEM_SIZE(3) 44 ELSE POP_CNT(1) ADDR(55) 45 ALU_PUSH_BEFORE: ADDR(2141) CNT(3) KCACHE0(CB1:0-15) 127 x: SETE_INT R0.x, KC0[5].x, (0x00000004, 5.605193857e-45f).x 128 x: PREDNE_INT ____, R0.x, 0.0f UPDATE_EXEC_MASK UPDATE_PRED 46 JUMP ADDR(48) 47 ALU: ADDR(2144) CNT(13) KCACHE0(CB1:0-15) 129 t: MULLO_INT T0.w, KC0[1].x, (0x00000003, 4.203895393e-45f).x 130 w: ASHR ____, PS129, (0x0000001F, 4.344025239e-44f).x t: MULHI_INT ____, PS129, (0x66666667, 2.720083202e23f).y 131 x: ASHR ____, PS130, 1 y: LSHR ____, PS130, (0x0000001F, 4.344025239e-44f).x z: LSHR ____, PV130.w, (0x0000001E, 4.203895393e-44f).y 132 x: ADD_INT R6.x, PV131.y, PV131.x y: ADD_INT ____, T0.w, PV131.z 133 x: ASHR R4.x, PV132.y, (0x00000002, 2.802596929e-45f).x 48 ELSE POP_CNT(1) ADDR(52) 49 ALU_PUSH_BEFORE: ADDR(2157) CNT(3) KCACHE0(CB1:0-15) 134 x: SETE_INT R4.x, KC0[5].x, (0x00000005, 7.006492322e-45f).x 135 x: PREDNE_INT ____, R4.x, 0.0f UPDATE_EXEC_MASK UPDATE_PRED 50 JUMP POP_CNT(2) ADDR(52) 51 ALU_POP2_AFTER: ADDR(2160) CNT(14) KCACHE0(CB1:0-15) 136 x: LSHL T0.x, KC0[1].x, (0x00000002, 2.802596929e-45f).x 137 t: MULHI_INT ____, PV136.x, (0xAAAAAAAB, -3.031649096e-13f).x 138 z: ADD_INT ____, T0.x, PS137 t: MULHI_INT ____, T0.x, (0x66666667, 2.720083202e23f).x 139 x: ASHR ____, PV138.z, (0x00000002, 2.802596929e-45f).x y: LSHR ____, PV138.z, (0x0000001F, 4.344025239e-44f).y z: LSHR ____, PS138, (0x0000001F, 4.344025239e-44f).y w: ASHR ____, PS138, 1 140 x: ADD_INT R6.x, PV139.y, PV139.x t: ADD_INT R4.x, PV139.z, PV139.w 52 ALU: ADDR(2174) CNT(1) 141 x: MOV R0.x, R6.x 53 MEM_SCRATCH_WRITE_ACK: VEC_PTR[57].x___, R0, ARRAY_SIZE(66) ELEM_SIZE(3) 54 POP (1) ADDR(55) 55 WAIT_ACK: Outstanding_acks <= 0 56 TEX: ADDR(9902) CNT(1) 142 RD_SCRATCH R2.x___, VEC_PTR[60], ARRAY_SIZE(66) ELEM_SIZE(3) UNCACHED 57 TEX: ADDR(9904) CNT(1) 143 RD_SCRATCH R1.x___, VEC_PTR[59], ARRAY_SIZE(66) ELEM_SIZE(3) UNCACHED 58 ALU: ADDR(2175) CNT(7) KCACHE0(CB1:0-15) 144 x: ASHR R0.x, KC0[3].x, (0x00000002, 2.802596929e-45f).x 145 x: ADD_INT R2.x, R2.x, KC0[11].x y: ADD_INT R1.y, (0xF
if (_snr.x / _thresh.x > best_pulse_score || _snr.y / _thresh.y > best_pulse_score || _snr.z / _thresh.z > best_pulse_score || _snr.w / _thresh.w > best_pulse_score) { result_flag[0].x = 1;//return; } if ((tmp_max.x>cur_thresh.x) || (tmp_max.y>cur_thresh.y) || (tmp_max.z>cur_thresh.z) || (tmp_max.w>cur_thresh.w) ) { result_flag[0].x = 1;//return; }
if (_snr.x / _thresh.x > best_pulse_score || _snr.y / _thresh.y > best_pulse_score || _snr.z / _thresh.z > best_pulse_score || _snr.w / _thresh.w > best_pulse_score || (tmp_max.x>cur_thresh.x) || (tmp_max.y>cur_thresh.y) || (tmp_max.z>cur_thresh.z) || (tmp_max.w>cur_thresh.w) ) { result_flag[0].x = 1;return; }
raistmer,
When we use #if 1 we are calling the function sumtop2 inside a for loop.But Gpus still do not use stacks for function calling.Therefore Sumtop2 function is inlined everytime it is called,which drastically increases the size of your code.
Regarding the response of SKA:
It is a known issue and i confirm it as a bug.
For more info see the link :http://forums.amd.com/forum/messageview.cfm?catid=328&threadid=138904&forumid=9
Originally posted by: himanshu.gautam raistmer,
When we use #if 1 we are calling the function sumtop2 inside a for loop.But Gpus still do not use stacks for function calling.Therefore Sumtop2 function is inlined everytime it is called,which drastically increases the size of your code.
This implies that the loop is unrolled by the compiler which can only be done when the
loop start and end are known and fixed at compile time. I wrote two simple test cases
that demonstrate this in the first case the loop is completely unrolled, inlining the
function body multiple times, leading to larger
code without any loop contructs. In the second case the function body is inlined only
once leading to smaller code. It seems a real function calling mechanism is very
necessary otherwise you might just as well not use functions at all, as they are
merely macros.
int xxx(int i) { return i*2; } //*********** Loop is unrolled ***************** __kernel void test(__global int *x) { for (int i = 0; i <5; i++) { x = xxx(i); } } //*********** Loop is NOT unrolled ************* __kernel void test(__global int *x, int n) { for (int i = 0; i < n; i++) { x = xxx(i); } } Don't worry about the fact that x is overwritten by multiple kernel instances, not the point of this demo
unroling loop is regular optimization technique. this seems that compiler unroling too much.
Originally posted by: nou unroling loop is regular optimization technique. this seems that compiler unroling too much.
Yes, normally unrolling loops with a body that is not too large is good for performance,
but when you start inlining potentially large function bodies instead of function calls this
leads to code explosions. Implementing regular function calls would be the best option
IMO if possible. Otherwise something like this might help:
#pragma please_dont_unroll_this_next_loop
__kernel void PC_find_pulse_kernel_cl(float best_pulse_score, int PulsePotLen, int AdvanceBy, int fft_len, int ndivs,int pass, __global PulseFind_t* settings,__global float* t_funct_cache_fp, __global float4* PoT,__global float4* tmp_PoT,__global uint4* result_flag,int offset, __global float4* PulsePoT_average) { if(result_flag[0].x == 1) return;//R: if CPU processing already needed - abort kernel ASAP const int PoTLen = 1024*1024/fft_len; int ul_PoT = get_global_id(0)+offset; int y = get_global_id(1); int TOffset1 = y * AdvanceBy; int TOffset2 = y * AdvanceBy; float rcfg_dis_thresh=settings->rcfg_dis_thresh; int PulseMax=settings->PulseMax; int fft_len4=fft_len>>2; //if (ul_PoT < 1) return; // Original find_pulse, omits first element //if (ul_PoT >= fft_len) return; if(TOffset1 + PulsePotLen > PoTLen) { TOffset1 = PoTLen - PulsePotLen; } __global float4* fp_PulsePot = PoT + ul_PoT + TOffset1 * (fft_len4); __global float4* tmp_pot = tmp_PoT + ul_PoT + TOffset2 * (fft_len4); //float* best_pot = cudaAcc_PulseFind_settings.best_pot_fp + ul_PoT + TOffset2 * fft_len; //float* report_pot = cudaAcc_PulseFind_settings.report_pot_fp + ul_PoT + TOffset2 * fft_len; int di; float4 avg=0;//,maxp=0;//, snr=0, fthresh=0; float4 tmp_max; #define USE_AVG_CACHE 1 #if !USE_AVG_CACHE int i; // Calculate average power for (i = 0; i < PulsePotLen; i++) { avg += fp_PulsePot[i * (fft_len4)]; } avg /= PulsePotLen; #else avg=PulsePoT_average[ul_PoT+y*fft_len4]; #endif // Periods from PulsePotLen/3 to PulsePotLen/4, and power of 2 fractions of. // then (len/4 to len/5) and finally (len/5 to len/6) // int num_adds = pass; //for(int num_adds = 3; num_adds <= 5; num_adds++) { int firstP, lastP; switch(num_adds) { case 3: lastP = (PulsePotLen*2)/3; firstP = (PulsePotLen*1)/2; break; case 4: lastP = (PulsePotLen*3)/4; firstP = (PulsePotLen*3)/5; break; case 5: lastP = (PulsePotLen*4)/5; firstP = (PulsePotLen*4)/6; break; } int num_adds_minus1 = num_adds - 1; int p = lastP; for (p = lastP ; p > firstP ; p--) { float4 cur_thresh, dis_thresh; int /*tabofst, */mper, perdiv; int tmp0, tmp1, tmp2, tmp3; //tabofst = ndivs*3+2-num_adds; mper = p * (12/num_adds_minus1); perdiv = num_adds_minus1; tmp0 = (int)((mper + 6)/12); // round(period) tmp1 = (int)((mper * 2 + 6)/12); // round(period*2) di = (int)p/perdiv; // (int)period //dis_thresh = cudaAcc_t_funct(di, num_adds)*avg; dis_thresh = t_funct(di, num_adds, 0, PulseMax, t_funct_cache_fp) * avg; switch(num_adds) { case 3: tmp_max = sumtop3(fp_PulsePot, tmp_pot, di, fft_len4, tmp0, tmp1); break; case 4: tmp2 = (int)((mper * 3 + 6)/12); // round(period*3) tmp_max = sumtop4(fp_PulsePot, tmp_pot, di, fft_len4, tmp0, tmp1, tmp2); break; case 5: tmp2 = (int)((mper * 3 + 6)/12); // round(period*3) tmp3 = (int)((mper * 4 + 6)/12); // round(period*4) tmp_max = sumtop5(fp_PulsePot, tmp_pot, di, fft_len4, tmp0, tmp1, tmp2, tmp3); break; } if (tmp_max.x>dis_thresh.x || tmp_max.y>dis_thresh.y || tmp_max.z>dis_thresh.z || tmp_max.w>dis_thresh.w) { // unscale for reporting tmp_max /= num_adds; cur_thresh = (dis_thresh / num_adds - avg) * rcfg_dis_thresh + avg; float4 _snr = (tmp_max-avg)/(cur_thresh-avg); if (_snr.x > best_pulse_score || _snr.y > best_pulse_score || _snr.z > best_pulse_score || _snr.w > best_pulse_score ) { result_flag[0].x = 1;//return; } if( (tmp_max.x>cur_thresh.x) || (tmp_max.y>cur_thresh.y) || (tmp_max.z>cur_thresh.z) || (tmp_max.w>cur_thresh.w) ) { result_flag[0].x = 1;//return; } } int num_adds_2 = 2* num_adds; // int j = 1; float4 tmp_max2=(float4)0.f; for (int j = 1; j < ndivs ; j++) { //perdiv *=2; tmp0 = di & 1; di >>= 1; tmp0 += di; dis_thresh = t_funct(di, num_adds, j,PulseMax,t_funct_cache_fp) * avg; #if 0 if( (j+1)<ndivs && tmp_max2.x==0.f){//R: use advanced sumtop then tmp_max= sumtop2_2(tmp_pot, tmp_pot, di, fft_len4, tmp0,&tmp_max2); } if(tmp_max2.x>0.f){ tmp_max=tmp_max2; tmp_max2=(float4)0.f; }else #endif { tmp_max = sumtop2(tmp_pot, tmp_pot, di, fft_len4, tmp0); } if (tmp_max.x>dis_thresh.x || tmp_max.y>dis_thresh.y || tmp_max.z>dis_thresh.z || tmp_max.w>dis_thresh.w) { // unscale for reporting tmp_max /= num_adds_2; cur_thresh = (dis_thresh / num_adds_2 - avg) * rcfg_dis_thresh + avg; float4 _snr = (tmp_max-avg)/(cur_thresh-avg); if (_snr.x > best_pulse_score || _snr.y > best_pulse_score || _snr.z > best_pulse_score || _snr.w > best_pulse_score) { result_flag[0].x = 1;return; } if ((tmp_max.x>cur_thresh.x) || (tmp_max.y>cur_thresh.y) || (tmp_max.z>cur_thresh.z) || (tmp_max.w>cur_thresh.w) ) { result_flag[0].x = 1;//return; } } num_adds_2 *=2; } // for (j = 1; j < ndivs } // for (p = lastP } // for(num_adds = } __kernel void PC_find_pulse3_kernel_cl(float best_pulse_score, int PulsePotLen, int AdvanceBy, int fft_len4, int ndivs, __global PulseFind_t* settings,__global float* t_funct_cache_fp, __global float4* PoT,__global float4* tmp_PoT,__global uint4* result_flag,int offset, __global float4* PulsePoT_average) { if(result_flag[0].x == 1) return;//R: if CPU processing already needed - abort kernel ASAP const int PoTLen = 256*1024/fft_len4;//R: fft_len4 == fft_len>>2 int ul_PoT = get_global_id(0)+offset; int y = get_global_id(1); int TOffset1 = y * AdvanceBy; int TOffset2 = y * AdvanceBy; float rcfg_dis_thresh=settings->rcfg_dis_thresh; int PulseMax=settings->PulseMax; if(TOffset1 + PulsePotLen > PoTLen) { TOffset1 = PoTLen - PulsePotLen; } __global float4* fp_PulsePot = PoT + ul_PoT + TOffset1 * (fft_len4); __global float4* tmp_pot = tmp_PoT + ul_PoT + TOffset2 * (fft_len4); //float* best_pot = cudaAcc_PulseFind_settings.best_pot_fp + ul_PoT + TOffset2 * fft_len; //float* report_pot = cudaAcc_PulseFind_settings.report_pot_fp + ul_PoT + TOffset2 * fft_len; int di; float4 avg=0;//,maxp=0;//, snr=0, fthresh=0; float4 tmp_max; // Calculate average power for (int i = 0; i < PulsePotLen; i++) { avg += fp_PulsePot[i * (fft_len4)]; } avg /= PulsePotLen; PulsePoT_average[ul_PoT+y*fft_len4]=avg;//R: pulse3 called first, it computes average and stores it for pulse4 and pulse5 passes // Periods from PulsePotLen/3 to PulsePotLen/4, and power of 2 fractions of. // then (len/4 to len/5) and finally (len/5 to len/6) // //int num_adds = 3; //for(int num_adds = 3; num_adds <= 5; num_adds++) { int firstP, lastP; lastP = (PulsePotLen*2)/3; firstP = (PulsePotLen*1)/2; //int num_adds_minus1 = num_adds - 1; int p = lastP; for (p = lastP ; p > firstP ; p--) { float4 cur_thresh, dis_thresh; int /*tabofst, */mper, perdiv; int tmp0, tmp1;//, tmp2, tmp3; //tabofst = ndivs*3+2-num_adds; mper = p * (6); perdiv = 2; tmp0 = (int)((mper + 6)/12); // round(period) tmp1 = (int)((mper * 2 + 6)/12); // round(period*2) di = (int)p/perdiv; // (int)period //dis_thresh = cudaAcc_t_funct(di, num_adds)*avg; dis_thresh = t_funct(di, 3, 0, PulseMax, t_funct_cache_fp) * avg; tmp_max = sumtop3(fp_PulsePot, tmp_pot, di, fft_len4, tmp0, tmp1); if (tmp_max.x>dis_thresh.x || tmp_max.y>dis_thresh.y || tmp_max.z>dis_thresh.z || tmp_max.w>dis_thresh.w) { // unscale for reporting tmp_max /= 3; cur_thresh = (dis_thresh / 3 - avg) * rcfg_dis_thresh + avg; float4 _snr = (tmp_max-avg)/(cur_thresh-avg); if (_snr.x > best_pulse_score || _snr.y > best_pulse_score || _snr.z > best_pulse_score || _snr.w > best_pulse_score ) { result_flag[0].x = 1;//return; } if( (tmp_max.x>cur_thresh.x) || (tmp_max.y>cur_thresh.y) || (tmp_max.z>cur_thresh.z) || (tmp_max.w>cur_thresh.w) ) { result_flag[0].x = 1;//return; } } int num_adds_2 = 6; // int j = 1; float4 tmp_max2=(float4)0.f; for (int j = 1; j < ndivs ; j++) { //perdiv *=2; tmp0 = di & 1; di >>= 1; tmp0 += di; dis_thresh = t_funct(di, 3, j,PulseMax,t_funct_cache_fp) * avg; #if 0 if( (j+1)<ndivs && tmp_max2.x==0.f){//R: use advanced sumtop then tmp_max= sumtop2_2(tmp_pot, tmp_pot, di, fft_len4, tmp0,&tmp_max2); } if(tmp_max2.x>0.f){ tmp_max=tmp_max2; tmp_max2=(float4)0.f; }else #endif { tmp_max = sumtop2(tmp_pot, tmp_pot, di, fft_len4, tmp0); } if (tmp_max.x>dis_thresh.x || tmp_max.y>dis_thresh.y || tmp_max.z>dis_thresh.z || tmp_max.w>dis_thresh.w) { // unscale for reporting tmp_max /= num_adds_2; cur_thresh = (dis_thresh / num_adds_2 - avg) * rcfg_dis_thresh + avg; float4 _snr = (tmp_max-avg)/(cur_thresh-avg); if (_snr.x > best_pulse_score || _snr.y > best_pulse_score || _snr.z > best_pulse_score || _snr.w > best_pulse_score) { result_flag[0].x = 1;return; } if ((tmp_max.x>cur_thresh.x) || (tmp_max.y>cur_thresh.y) || (tmp_max.z>cur_thresh.z) || (tmp_max.w>cur_thresh.w) ) { result_flag[0].x = 1;//return; } } num_adds_2 *=2; } // for (j = 1; j < ndivs } // for (p = lastP } // for(num_adds = } __kernel void PC_find_pulse4_kernel_cl(float best_pulse_score, int PulsePotLen, int AdvanceBy, int fft_len4, int ndivs, __global PulseFind_t* settings,__global float* t_funct_cache_fp, __global float4* PoT,__global float4* tmp_PoT,__global uint4* result_flag,int offset, __global float4* PulsePoT_average) { if(result_flag[0].x == 1) return;//R: if CPU processing already needed - abort kernel ASAP const int PoTLen = 256*1024/fft_len4; int ul_PoT = get_global_id(0)+offset; int y = get_global_id(1); int TOffset1 = y * AdvanceBy; int TOffset2 = y * AdvanceBy; float rcfg_dis_thresh=settings->rcfg_dis_thresh; int PulseMax=settings->PulseMax; //if (ul_PoT < 1) return; // Original find_pulse, omits first element //if (ul_PoT >= fft_len) return; if(TOffset1 + PulsePotLen > PoTLen) { TOffset1 = PoTLen - PulsePotLen; } __global float4* fp_PulsePot = PoT + ul_PoT + TOffset1 * (fft_len4); __global float4* tmp_pot = tmp_PoT + ul_PoT + TOffset2 * (fft_len4); //float* best_pot = cudaAcc_PulseFind_settings.best_pot_fp + ul_PoT + TOffset2 * fft_len; //float* report_pot = cudaAcc_PulseFind_settings.report_pot_fp + ul_PoT + TOffset2 * fft_len; int di; float4 avg=0;//,maxp=0;//, snr=0, fthresh=0; float4 tmp_max; avg=PulsePoT_average[ul_PoT+y*fft_len4];//R: Value computed in pulse3 // Periods from PulsePotLen/3 to PulsePotLen/4, and power of 2 fractions of. // then (len/4 to len/5) and finally (len/5 to len/6) // //int num_adds = 4; //for(int num_adds = 3; num_adds <= 5; num_adds++) { int firstP, lastP; lastP = (PulsePotLen*3)/4; firstP = (PulsePotLen*3)/5; //int num_adds_minus1 = num_adds - 1; int p = lastP; for (p = lastP ; p > firstP ; p--) { float4 cur_thresh, dis_thresh; int /*tabofst, */mper, perdiv; int tmp0, tmp1, tmp2;//, tmp3; //tabofst = ndivs*3+2-num_adds; mper = p * (4); perdiv = 3; tmp0 = (int)((mper + 6)/12); // round(period) tmp1 = (int)((mper * 2 + 6)/12); // round(period*2) di = (int)p/perdiv; // (int)period //dis_thresh = cudaAcc_t_funct(di, num_adds)*avg; dis_thresh = t_funct(di, 4, 0, PulseMax, t_funct_cache_fp) * avg; tmp2 = (int)((mper * 3 + 6)/12); // round(period*3) tmp_max = sumtop4(fp_PulsePot, tmp_pot, di, fft_len4, tmp0, tmp1, tmp2); if (tmp_max.x>dis_thresh.x || tmp_max.y>dis_thresh.y || tmp_max.z>dis_thresh.z || tmp_max.w>dis_thresh.w) { // unscale for reporting tmp_max /= 4.f; cur_thresh = (dis_thresh / 4.f - avg) * rcfg_dis_thresh + avg; float4 _snr = (tmp_max-avg)/(cur_thresh-avg); if (_snr.x > best_pulse_score || _snr.y > best_pulse_score || _snr.z > best_pulse_score || _snr.w > best_pulse_score ) { result_flag[0].x = 1;//return; } if( (tmp_max.x>cur_thresh.x) || (tmp_max.y>cur_thresh.y) || (tmp_max.z>cur_thresh.z) || (tmp_max.w>cur_thresh.w) ) { result_flag[0].x = 1;//return; } } int num_adds_2 = 8; // int j = 1; float4 tmp_max2=(float4)0.f; for (int j = 1; j < ndivs ; j++) { //perdiv *=2; tmp0 = di & 1; di >>= 1; tmp0 += di; dis_thresh = t_funct(di, 4, j,PulseMax,t_funct_cache_fp) * avg; #if 0 if( (j+1)<ndivs && tmp_max2.x==0.f){//R: use advanced sumtop then tmp_max= sumtop2_2(tmp_pot, tmp_pot, di, fft_len4, tmp0,&tmp_max2); } if(tmp_max2.x>0.f){ tmp_max=tmp_max2; tmp_max2=(float4)0.f; }else #endif { tmp_max = sumtop2(tmp_pot, tmp_pot, di, fft_len4, tmp0); } if (tmp_max.x>dis_thresh.x || tmp_max.y>dis_thresh.y || tmp_max.z>dis_thresh.z || tmp_max.w>dis_thresh.w) { // unscale for reporting tmp_max /= num_adds_2; cur_thresh = (dis_thresh / num_adds_2 - avg) * rcfg_dis_thresh + avg; float4 _snr = (tmp_max-avg)/(cur_thresh-avg); if (_snr.x > best_pulse_score || _snr.y > best_pulse_score || _snr.z > best_pulse_score || _snr.w > best_pulse_score) { result_flag[0].x = 1;return; } if ((tmp_max.x>cur_thresh.x) || (tmp_max.y>cur_thresh.y) || (tmp_max.z>cur_thresh.z) || (tmp_max.w>cur_thresh.w) ) { result_flag[0].x = 1;//return; } } num_adds_2 *=2; } // for (j = 1; j < ndivs } // for (p = lastP } // for(num_adds = } __kernel void PC_find_pulse5_kernel_cl(float best_pulse_score, int PulsePotLen, int AdvanceBy, int fft_len4, int ndivs, __global PulseFind_t* settings,__global float* t_funct_cache_fp, __global float4* PoT,__global float4* tmp_PoT,__global uint4* result_flag,int offset, __global float4* PulsePoT_average) { if(result_flag[0].x == 1) return;//R: if CPU processing already needed - abort kernel ASAP const int PoTLen = 256*1024/fft_len4; int ul_PoT = get_global_id(0)+offset; int y = get_global_id(1); int TOffset1 = y * AdvanceBy; int TOffset2 = y * AdvanceBy; float rcfg_dis_thresh=settings->rcfg_dis_thresh; int PulseMax=settings->PulseMax; //if (ul_PoT < 1) return; // Original find_pulse, omits first element //if (ul_PoT >= fft_len) return; if(TOffset1 + PulsePotLen > PoTLen) { TOffset1 = PoTLen - PulsePotLen; } __global float4* fp_PulsePot = PoT + ul_PoT + TOffset1 * (fft_len4); __global float4* tmp_pot = tmp_PoT + ul_PoT + TOffset2 * (fft_len4); //float* best_pot = cudaAcc_PulseFind_settings.best_pot_fp + ul_PoT + TOffset2 * fft_len; //float* report_pot = cudaAcc_PulseFind_settings.report_pot_fp + ul_PoT + TOffset2 * fft_len; int di; float4 avg=0;//,maxp=0;//, snr=0, fthresh=0; float4 tmp_max; avg=PulsePoT_average[ul_PoT+y*fft_len4]; // Periods from PulsePotLen/3 to PulsePotLen/4, and power of 2 fractions of. // then (len/4 to len/5) and finally (len/5 to len/6) // //int num_adds = 5; //for(int num_adds = 3; num_adds <= 5; num_adds++) { int firstP, lastP; lastP = (PulsePotLen*4)/5; firstP = (PulsePotLen*4)/6; //int num_adds_minus1 = num_adds - 1; int p = lastP; for (p = lastP ; p > firstP ; p--) { float4 cur_thresh, dis_thresh; int /*tabofst, */mper, perdiv; int tmp0, tmp1, tmp2, tmp3; //tabofst = ndivs*3+2-num_adds; mper = p * (3); perdiv = 4; tmp0 = (int)((mper + 6)/12); // round(period) tmp1 = (int)((mper * 2 + 6)/12); // round(period*2) di = (int)p/perdiv; // (int)period //dis_thresh = cudaAcc_t_funct(di, num_adds)*avg; dis_thresh = t_funct(di, 5, 0, PulseMax, t_funct_cache_fp) * avg; tmp2 = (int)((mper * 3 + 6)/12); // round(period*3) tmp3 = (int)((mper * 4 + 6)/12); // round(period*4) tmp_max = sumtop5(fp_PulsePot, tmp_pot, di, fft_len4, tmp0, tmp1, tmp2, tmp3); if (tmp_max.x>dis_thresh.x || tmp_max.y>dis_thresh.y || tmp_max.z>dis_thresh.z || tmp_max.w>dis_thresh.w) { // unscale for reporting tmp_max /= 5.f; cur_thresh = (dis_thresh / 5.f - avg) * rcfg_dis_thresh + avg; float4 _snr = (tmp_max-avg)/(cur_thresh-avg); if (_snr.x > best_pulse_score || _snr.y > best_pulse_score || _snr.z > best_pulse_score || _snr.w > best_pulse_score ) { result_flag[0].x = 1;//return; } if( (tmp_max.x>cur_thresh.x) || (tmp_max.y>cur_thresh.y) || (tmp_max.z>cur_thresh.z) || (tmp_max.w>cur_thresh.w) ) { result_flag[0].x = 1;//return; } } int num_adds_2 = 10; // int j = 1; float4 tmp_max2=(float4)0.f; for (int j = 1; j < ndivs ; j++) { //perdiv *=2; tmp0 = di & 1; di >>= 1; tmp0 += di; dis_thresh = t_funct(di, 5, j,PulseMax,t_funct_cache_fp) * avg; #if 0 if( (j+1)<ndivs && tmp_max2.x==0.f){//R: use advanced sumtop then tmp_max= sumtop2_2(tmp_pot, tmp_pot, di, fft_len4, tmp0,&tmp_max2); } if(tmp_max2.x>0.f){ tmp_max=tmp_max2; tmp_max2=(float4)0.f; }else #endif { tmp_max = sumtop2(tmp_pot, tmp_pot, di, fft_len4, tmp0); } if (tmp_max.x>dis_thresh.x || tmp_max.y>dis_thresh.y || tmp_max.z>dis_thresh.z || tmp_max.w>dis_thresh.w) { // unscale for reporting tmp_max /= num_adds_2; cur_thresh = (dis_thresh / num_adds_2 - avg) * rcfg_dis_thresh + avg; float4 _snr = (tmp_max-avg)/(cur_thresh-avg); if (_snr.x > best_pulse_score || _snr.y > best_pulse_score || _snr.z > best_pulse_score || _snr.w > best_pulse_score) { result_flag[0].x = 1;return; } if ((tmp_max.x>cur_thresh.x) || (tmp_max.y>cur_thresh.y) || (tmp_max.z>cur_thresh.z) || (tmp_max.w>cur_thresh.w) ) { result_flag[0].x = 1;//return; } } num_adds_2 *=2; } // for (j = 1; j < ndivs } // for (p = lastP } // for(num_adds = }
__kernel void PC_find_pulse5_kernel_cl(float best_pulse_score, int PulsePotLen, int AdvanceBy, int fft_len4p, int ndivs, __global PulseFind_t* settings,__global float* t_funct_cache_fp, __global float4* PoT,__global float4* tmp_PoT,__global uint4* result_flag,int offset, __global float4* PulsePoT_average,int pass) { if(result_flag[0].x == 1) return;//R: if CPU processing already needed - abort kernel ASAP int fft_len4=fft_len4p; const int PoTLen = 256*1024/fft_len4; int ul_PoT = get_global_id(0)+offset; int y = get_global_id(1); int TOffset1 = y * AdvanceBy; int TOffset2 = y * AdvanceBy; float rcfg_dis_thresh=settings->rcfg_dis_thresh; int PulseMax=settings->PulseMax; //if (ul_PoT < 1) return; // Original find_pulse, omits first element //if (ul_PoT >= fft_len) return; if(TOffset1 + PulsePotLen > PoTLen) { TOffset1 = PoTLen - PulsePotLen; } __global float4* fp_PulsePot = PoT + ul_PoT + TOffset1 * (fft_len4); __global float4* tmp_pot = tmp_PoT + ul_PoT + TOffset2 * (fft_len4); //float* best_pot = cudaAcc_PulseFind_settings.best_pot_fp + ul_PoT + TOffset2 * fft_len; //float* report_pot = cudaAcc_PulseFind_settings.report_pot_fp + ul_PoT + TOffset2 * fft_len; int di; float4 avg=0;//,maxp=0;//, snr=0, fthresh=0; float4 tmp_max; avg=PulsePoT_average[ul_PoT+y*fft_len4]; // Periods from PulsePotLen/3 to PulsePotLen/4, and power of 2 fractions of. // then (len/4 to len/5) and finally (len/5 to len/6) // int num_adds = pass; //for(int num_adds = 3; num_adds <= 5; num_adds++) { int firstP, lastP; lastP = (PulsePotLen*4)/5; firstP = (PulsePotLen*4)/6; int num_adds_minus1 = num_adds - 1; int p = lastP; for (p = lastP ; p > firstP ; p--) { float4 cur_thresh, dis_thresh; int /*tabofst, */mper, perdiv; int tmp0, tmp1, tmp2, tmp3; //tabofst = ndivs*3+2-num_adds; mper = p * (3); perdiv = num_adds_minus1; tmp0 = (int)((mper + 6)/12); // round(period) tmp1 = (int)((mper * 2 + 6)/12); // round(period*2) di = (int)p/perdiv; // (int)period //dis_thresh = cudaAcc_t_funct(di, num_adds)*avg; dis_thresh = t_funct(di, num_adds, 0, PulseMax, t_funct_cache_fp) * avg; tmp2 = (int)((mper * 3 + 6)/12); // round(period*3) tmp3 = (int)((mper * 4 + 6)/12); // round(period*4) tmp_max = sumtop5(fp_PulsePot, tmp_pot, di, fft_len4, tmp0, tmp1, tmp2, tmp3); if (tmp_max.x>dis_thresh.x || tmp_max.y>dis_thresh.y || tmp_max.z>dis_thresh.z || tmp_max.w>dis_thresh.w) { // unscale for reporting tmp_max /= 5.f; cur_thresh = (dis_thresh / 5.f - avg) * rcfg_dis_thresh + avg; float4 _snr = (tmp_max-avg)/(cur_thresh-avg); if (_snr.x > best_pulse_score || _snr.y > best_pulse_score || _snr.z > best_pulse_score || _snr.w > best_pulse_score ) { result_flag[0].x = 1;//return; } if( (tmp_max.x>cur_thresh.x) || (tmp_max.y>cur_thresh.y) || (tmp_max.z>cur_thresh.z) || (tmp_max.w>cur_thresh.w) ) { result_flag[0].x = 1;//return; } } int num_adds_2 = 10; // int j = 1; float4 tmp_max2=(float4)0.f; for (int j = 1; j < ndivs ; j++) { //perdiv *=2; tmp0 = di & 1; di >>= 1; tmp0 += di; dis_thresh = t_funct(di, num_adds, j,PulseMax,t_funct_cache_fp) * avg; #if 0 if( (j+1)<ndivs && tmp_max2.x==0.f){//R: use advanced sumtop then tmp_max= sumtop2_2(tmp_pot, tmp_pot, di, fft_len4, tmp0,&tmp_max2); } if(tmp_max2.x>0.f){ tmp_max=tmp_max2; tmp_max2=(float4)0.f; }else #endif { tmp_max = sumtop2(tmp_pot, tmp_pot, di, fft_len4, tmp0); } if (tmp_max.x>dis_thresh.x || tmp_max.y>dis_thresh.y || tmp_max.z>dis_thresh.z || tmp_max.w>dis_thresh.w) { // unscale for reporting tmp_max /= num_adds_2; cur_thresh = (dis_thresh / num_adds_2 - avg) * rcfg_dis_thresh + avg; float4 _snr = (tmp_max-avg)/(cur_thresh-avg); if (_snr.x > best_pulse_score || _snr.y > best_pulse_score || _snr.z > best_pulse_score || _snr.w > best_pulse_score) { result_flag[0].x = 1;return; } if ((tmp_max.x>cur_thresh.x) || (tmp_max.y>cur_thresh.y) || (tmp_max.z>cur_thresh.z) || (tmp_max.w>cur_thresh.w) ) { result_flag[0].x = 1;//return; } } num_adds_2 *=2; } // for (j = 1; j < ndivs } // for (p = lastP } // for(num_adds = }
raistmer,
i am sorry if i am getting it wrong.
But AFAIK when #if 0 only sumtop2 is called, but when #if 1 both sumtop2_2 and sumtop2 are called.
tmp_max2 is initialised to 0 initially at #if 1 for every tmp_max2.x==0:
sumtop2_2() is executed
and the immidiate next if becomes false so sumtop2 in corrosponding else is also executed.
As such there are so many branch conditional so any waverfront which encounters this code will have to execute it many times,that too repeatedly in a for loop.Diverging branch conditions slow down execution disastrously.
// int j = 1; float4 tmp_max2=(float4)0.f; for (int j = 1; j < ndivs ; j++) { //perdiv *=2; tmp0 = di & 1; di >>= 1; tmp0 += di; dis_thresh = t_funct(di, num_adds, j,PulseMax,t_funct_cache_fp) * avg; #if 0 if( (j+1)<ndivs && tmp_max2.x==0.f){//R: use advanced sumtop then tmp_max= sumtop2_2(tmp_pot, tmp_pot, di, fft_len4, tmp0,&tmp_max2); } if(tmp_max2.x>0.f){ tmp_max=tmp_max2; tmp_max2=(float4)0.f; }else #endif { tmp_max = sumtop2(tmp_pot, tmp_pot, di, fft_len4, tmp0); } if (tmp_max.x>dis_thresh.x || tmp_max.y>dis_thresh.y || tmp_max.z>dis_thresh.z || tmp_max.w>dis_thresh.w) { // unscale for reporting tmp_max /= num_adds_2; cur_thresh = (dis_thresh / num_adds_2 - avg) * rcfg_dis_thresh + avg; float4 _snr = (tmp_max-avg)*(float4)sqrt((float)num_adds_2)/avg; float4 _thresh = (cur_thresh-avg)*(float4)sqrt((float)num_adds_2)/avg; if (_snr.x / _thresh.x > best_pulse_score || _snr.y / _thresh.y > best_pulse_score || _snr.z / _thresh.z > best_pulse_score || _snr.w / _thresh.w > best_pulse_score) { result_flag[0].x = 1;return; } if ((tmp_max.x>cur_thresh.x) || (tmp_max.y>cur_thresh.y) || (tmp_max.z>cur_thresh.z) || (tmp_max.w>cur_thresh.w) ) { result_flag[0].x = 1;return; } }
#if 0 if( (j+1)<ndivs && tmp_max2.x==0.f){//R: use advanced sumtop then tmp_max= sumtop2_2(tmp_pot, tmp_pot, di, fft_len4, tmp0,&tmp_max2); } if(tmp_max2.x>0.f){ tmp_max=tmp_max2; tmp_max2=(float4)0.f; }else #endif