Raistmer

Unexpectedly huge kernel size increase

Discussion created by Raistmer on Sep 5, 2010
Latest reply on Sep 10, 2010 by Raistmer
kernel listed

Hello
I trying to speedup big kernel by avoiding some global memory acesses.
For that I added few more registers (small amount) and few more branches (~2).
Also, 1 new function added to kernel.

But this slowed down kernel a lot instead of speedup.
SKA shows very huge kernel size increase for "optimised" variant.

Why so big increase here? Maybe some thoughts how it can be avoided?

EDIT: All assembly doesn't fit into code window. Please, try to cut-n-paste kernel in SKA and compile with #if 0 first and with #if 1 then (to enable sumtop2_2 usage)
Code:

float4 sumtop2(__global float4 *tab, __global float4* dest, int di, int fft_len4, int tmp0) { //R: this function working with 4 different frequency bins at once as all other pulse finding functions do float4 sum, tmax; int i; tmax = 0.0f; __global float4 *one = tab; __global float4 *two = tab + tmp0 * fft_len4; for (i = 0; i < di; i++) { float4 i1 = one[i * (fft_len4)]; float4 i2 = two[i * (fft_len4)]; sum = i1 + i2; dest[i * (fft_len4)] = sum; tmax = max(tmax, sum); } return tmax; } float4 sumtop2_2(__global float4 *tab, __global float4* dest, int di, int fft_len4, int tmp0,float4* tmax2) { //R: this function working with 4 different frequency bins at once as all other pulse finding functions do //R: also it adds/searches 2 levels of fold (stride add) at once float4 sum, tmax,sum2; int i; tmax = (float4)0.0f; *tmax2=(float4)0.0f; __global float4 *one = tab; __global float4 *two = tab + tmp0 * fft_len4; int stop=di-1; for (i = 0; i < stop; i++,stop--) { float4 i1 = one[i * (fft_len4)]; float4 i2 = two[i * (fft_len4)]; float4 j1= one[stop * (fft_len4)]; float4 j2= two[stop * (fft_len4)]; sum = i1 + i2; tmax = max(tmax, sum); sum2=j1+j2; tmax = max(tmax, sum2); sum+=sum2; *tmax2=max(*tmax2,sum); dest[i * (fft_len4)] = sum;//R: second stride add stored } return tmax; } float4 sumtop3(__global float4 *tab, __global float4* dest, int di, int fft_len4, int tmp0, int tmp1) { float4 sum, tmax; int i; tmax = 0.0f; __global float4 *one = tab; __global float4 *two = tab + tmp0 * fft_len4; __global float4 *three = tab + tmp1 * (fft_len4); for (i = 0; i < di; i++) { float4 i1 = one[i * (fft_len4)]; float4 i2 = two[i * (fft_len4)]; float4 i3 = three[i * (fft_len4)]; sum = i1 + i2; sum += i3; dest[i * (fft_len4)] = sum; tmax = max(tmax, sum); } return tmax; } float4 sumtop4(__global float4 *tab, __global float4* dest, int di, int fft_len4, int tmp0, int tmp1, int tmp2) { float4 sum, tmax; int i; __global float4 *one = tab; __global float4 *two = tab + tmp0 * (fft_len4); __global float4 *three = tab + tmp1 * (fft_len4); __global float4 *four = tab + tmp2 * (fft_len4); tmax = 0.0f; for (i = 0; i < di; i++) { float4 i1 = one[i * fft_len4]; float4 i2 = two[i * fft_len4]; float4 i3 = three[i * fft_len4]; float4 i4 = four[i * fft_len4]; sum = i1 + i2; sum += i3; sum += i4; dest[i * fft_len4] = sum; tmax = max(tmax, sum); } return tmax; } float4 sumtop5(__global float4 *tab, __global float4* dest, int di, int fft_len4, int tmp0, int tmp1, int tmp2, int tmp3) { float4 sum, tmax; int i; __global float4 *one = tab; __global float4 *two = tab + tmp0 * (fft_len4); __global float4 *three = tab + tmp1 * (fft_len4); __global float4 *four = tab + tmp2 * (fft_len4); __global float4 *five = tab + tmp3 * (fft_len4); tmax = 0.0f; for (i = 0; i < di; i++) { float4 i1 = one[i * fft_len4]; float4 i2 = two[i * fft_len4]; float4 i3 = three[i * fft_len4]; float4 i4 = four[i * fft_len4]; float4 i5 = five[i * fft_len4]; sum = i1 + i2; sum += i3; sum += i4; sum += i5; dest[i * fft_len4] = sum; tmax = max(tmax, sum); } return tmax; } typedef struct { //int NumDataPoints; // find_triplets //float *power_ft; //float4* results_ft; //result_flag* result_flags_ft; // find_pulse //float* PulsePot_fp; // Input data //float* PulsePot8_fp; // Input data moved 8 bytes forward for coleased reads //float* tmp_pot_fp; // Temporary array //float* best_pot_fp; // Copy folded pots with best score //float* report_pot_fp; // Copy folded pots for reporting //float4* results_fp; // Additional data for reporting //result_find_pulse_flag* result_flags_fp; //const float* t_funct_cache_fp; // cached results of cudaAcc_t_funct float rcfg_dis_thresh; int PulseMax; } PulseFind_t; #define FOLDS_COUNT 4 #define FOLDS_START 2 float4 t_funct(int di, int num_adds, int j, int PulseMax, __global float* t_funct_cache) { return (float4)t_funct_cache[j * PulseMax * FOLDS_COUNT + (num_adds - FOLDS_START) * PulseMax + di]; } __kernel void PC_find_pulse_kernel_cl(float best_pulse_score, int PulsePotLen, int AdvanceBy, int fft_len, int ndivs,int pass, __global PulseFind_t* settings,__global float* t_funct_cache_fp, __global float4* PoT,__global float4* tmp_PoT,__global uint4* result_flag,int offset) { if(result_flag[0].x == 1) return;//R: if CPU processing already needed - abort kernel ASAP const int PoTLen = 1024*1024/fft_len; int ul_PoT = get_global_id(0)+offset; int y = get_global_id(1); int TOffset1 = y * AdvanceBy; int TOffset2 = y * AdvanceBy; float rcfg_dis_thresh=settings->rcfg_dis_thresh; int PulseMax=settings->PulseMax; int fft_len4=fft_len>>2; //if (ul_PoT < 1) return; // Original find_pulse, omits first element //if (ul_PoT >= fft_len) return; if(TOffset1 + PulsePotLen > PoTLen) { TOffset1 = PoTLen - PulsePotLen; } __global float4* fp_PulsePot = PoT + ul_PoT + TOffset1 * (fft_len4); __global float4* tmp_pot = tmp_PoT + ul_PoT + TOffset2 * (fft_len4); //float* best_pot = cudaAcc_PulseFind_settings.best_pot_fp + ul_PoT + TOffset2 * fft_len; //float* report_pot = cudaAcc_PulseFind_settings.report_pot_fp + ul_PoT + TOffset2 * fft_len; int di; float4 avg=0;//,maxp=0;//, snr=0, fthresh=0; float4 tmp_max; #if !USE_AVG_CACHE int i; // Calculate average power for (i = 0; i < PulsePotLen; i++) { avg += fp_PulsePot[i * (fft_len4)]; } avg /= PulsePotLen; #else avg=PulsePoT_average[ul_PoT+y*fft_len4]; #endif // Periods from PulsePotLen/3 to PulsePotLen/4, and power of 2 fractions of. // then (len/4 to len/5) and finally (len/5 to len/6) // int num_adds = pass; //for(int num_adds = 3; num_adds <= 5; num_adds++) { int firstP, lastP; switch(num_adds) { case 3: lastP = (PulsePotLen*2)/3; firstP = (PulsePotLen*1)/2; break; case 4: lastP = (PulsePotLen*3)/4; firstP = (PulsePotLen*3)/5; break; case 5: lastP = (PulsePotLen*4)/5; firstP = (PulsePotLen*4)/6; break; } int num_adds_minus1 = num_adds - 1; int p = lastP; for (p = lastP ; p > firstP ; p--) { float4 cur_thresh, dis_thresh; int /*tabofst, */mper, perdiv; int tmp0, tmp1, tmp2, tmp3; //tabofst = ndivs*3+2-num_adds; mper = p * (12/num_adds_minus1); perdiv = num_adds_minus1; tmp0 = (int)((mper + 6)/12); // round(period) tmp1 = (int)((mper * 2 + 6)/12); // round(period*2) di = (int)p/perdiv; // (int)period //dis_thresh = cudaAcc_t_funct(di, num_adds)*avg; dis_thresh = t_funct(di, num_adds, 0, PulseMax, t_funct_cache_fp) * avg; switch(num_adds) { case 3: tmp_max = sumtop3(fp_PulsePot, tmp_pot, di, fft_len4, tmp0, tmp1); break; case 4: tmp2 = (int)((mper * 3 + 6)/12); // round(period*3) tmp_max = sumtop4(fp_PulsePot, tmp_pot, di, fft_len4, tmp0, tmp1, tmp2); break; case 5: tmp2 = (int)((mper * 3 + 6)/12); // round(period*3) tmp3 = (int)((mper * 4 + 6)/12); // round(period*4) tmp_max = sumtop5(fp_PulsePot, tmp_pot, di, fft_len4, tmp0, tmp1, tmp2, tmp3); break; } if (tmp_max.x>dis_thresh.x || tmp_max.y>dis_thresh.y || tmp_max.z>dis_thresh.z || tmp_max.w>dis_thresh.w) { // unscale for reporting tmp_max /= num_adds; cur_thresh = (dis_thresh / num_adds - avg) * rcfg_dis_thresh + avg; float4 _snr = (tmp_max-avg)*(float4)sqrt((float)num_adds)/avg; float4 _thresh = (cur_thresh-avg)*(float4)sqrt((float)num_adds)/avg; if (_snr.x / _thresh.x > best_pulse_score || _snr.y / _thresh.y > best_pulse_score || _snr.z / _thresh.z > best_pulse_score || _snr.w / _thresh.w > best_pulse_score ) { result_flag[0].x = 1;return; } if( (tmp_max.x>cur_thresh.x) || (tmp_max.y>cur_thresh.y) || (tmp_max.z>cur_thresh.z) || (tmp_max.w>cur_thresh.w) ) { result_flag[0].x = 1;return; } } int num_adds_2 = 2* num_adds; // int j = 1; float4 tmp_max2=(float4)0.f; for (int j = 1; j < ndivs ; j++) { //perdiv *=2; tmp0 = di & 1; di >>= 1; tmp0 += di; dis_thresh = t_funct(di, num_adds, j,PulseMax,t_funct_cache_fp) * avg; #if 0 if( (j+1)<ndivs && tmp_max2.x==0.f){//R: use advanced sumtop then tmp_max= sumtop2_2(tmp_pot, tmp_pot, di, fft_len4, tmp0,&tmp_max2); } if(tmp_max2.x>0.f){ tmp_max=tmp_max2; tmp_max2=(float4)0.f; }else #endif { tmp_max = sumtop2(tmp_pot, tmp_pot, di, fft_len4, tmp0); } if (tmp_max.x>dis_thresh.x || tmp_max.y>dis_thresh.y || tmp_max.z>dis_thresh.z || tmp_max.w>dis_thresh.w) { // unscale for reporting tmp_max /= num_adds_2; cur_thresh = (dis_thresh / num_adds_2 - avg) * rcfg_dis_thresh + avg; float4 _snr = (tmp_max-avg)*(float4)sqrt((float)num_adds_2)/avg; float4 _thresh = (cur_thresh-avg)*(float4)sqrt((float)num_adds_2)/avg; if (_snr.x / _thresh.x > best_pulse_score || _snr.y / _thresh.y > best_pulse_score || _snr.z / _thresh.z > best_pulse_score || _snr.w / _thresh.w > best_pulse_score) { result_flag[0].x = 1;return; } if ((tmp_max.x>cur_thresh.x) || (tmp_max.y>cur_thresh.y) || (tmp_max.z>cur_thresh.z) || (tmp_max.w>cur_thresh.w) ) { result_flag[0].x = 1;return; } } num_adds_2 *=2; } // for (j = 1; j < ndivs } // for (p = lastP } // for(num_adds = } This kernel assembly for #if 0 case: ; -------- Disassembly -------------------- 00 ALU: ADDR(1792) CNT(58) KCACHE0(CB0:0-15) KCACHE1(CB1:0-15) 0 y: AND_INT T3.y, R0.x, (0x000000FF, 3.573311084e-43f).x z: MOV R11.z, 0.0f w: MOV R11.w, 0.0f t: MOV R22.y, 0.0f 1 y: LSHR T2.y, R0.x, (0x00000008, 1.121038771e-44f).x t: RCP_UINT T0.w, KC0[2].x 2 x: MOV R6.x, KC1[1].x y: MOV R0.y, KC1[1].y z: MOV R0.z, KC1[1].z t: MULLO_UINT T0.z, KC0[2].x, PS1 3 x: SUB_INT R0.x, 0.0f, PS2 w: MOV R2.w, KC1[1].w t: MULHI_UINT T0.y, KC0[2].x, T0.w 4 w: CNDE_INT R0.w, PS3, PV3.x, T0.z t: RCP_UINT T1.z, KC0[2].y 5 t: MULHI_UINT ____, PV4.w, T0.w 6 x: ADD_INT ____, T0.w, PS5 y: SUB_INT ____, T0.w, PS5 t: MULLO_UINT T1.y, KC0[2].y, T1.z 7 x: SUB_INT ____, 0.0f, PS6 w: CNDE_INT ____, T0.y, PV6.x, PV6.y t: MULHI_UINT T0.x, KC0[2].y, T1.z 8 y: MOV R21.y, 1 z: CNDE_INT T0.z, PS7, PV7.x, T1.y t: MULHI_UINT T2.z, PV7.w, T2.y 9 y: ADD_INT T1.y, -1, PS8 z: MOV R10.z, 1 w: ADD_INT T0.w, PS8, 1 t: MULLO_UINT ____, PS8, KC0[2].x 10 x: SUB_INT T1.x, T2.y, PS9 w: SETGE_UINT T1.w, T2.y, PS9 t: MULHI_UINT ____, T0.z, T1.z 11 x: SUB_INT ____, T1.z, PS10 y: SETGE_UINT ____, PV10.x, KC0[2].x z: SUB_INT T1.z, PV10.x, KC0[2].x w: ADD_INT ____, T1.z, PS10 t: MOV R10.w, 1 12 x: AND_INT ____, T1.w, PV11.y z: CNDE_INT T0.z, T0.x, PV11.w, PV11.x 13 z: CNDE_INT ____, PV12.x, T2.z, T0.w w: CNDE_INT T0.w, PV12.x, T1.x, T1.z 14 x: CNDE_INT ____, T1.w, T1.y, PV13.z z: ADD_INT ____, KC0[2].x, PV13.w 15 y: CNDE_INT ____, T1.w, PV14.z, T0.w w: CNDE_INT T0.w, KC0[2].x, -1, PV14.x 16 x: CNDE_INT T1.x, KC0[2].x, -1, PV15.y t: MULHI_UINT ____, T0.z, PV15.w 17 t: MULLO_UINT ____, PS16, KC0[2].y 18 z: SETGE_UINT R0.z, T0.w, PS17 w: SUB_INT R0.w, T0.w, PS17 t: MULLO_INT ____, T1.x, KC0[1].x 19 x: SETGE_UINT ____, PV18.w, KC0[2].y y: SUB_INT R0.y, PV18.w, KC0[2].y z: ADD_INT ____, T3.y, PS18 20 x: ADD_INT R0.x, PV19.z, KC0[6].x w: AND_INT R1.w, R0.z, PV19.x 01 MEM_SCRATCH_WRITE_ACK: VEC_PTR[60].x___, R0, ARRAY_SIZE(66) ELEM_SIZE(3) 02 ALU: ADDR(1850) CNT(12) KCACHE0(CB0:0-15) KCACHE1(CB1:0-15) 21 w: MOV R1.w, R1.w 22 z: CNDE_INT T1.z, PV21.w, R0.w, R0.y 23 z: MOV T1.z, PV22.z 24 y: ADD_INT ____, KC0[2].y, PV23.z 25 w: CNDE_INT ____, R0.z, PV24.y, T1.z 26 z: CNDE_INT ____, KC0[2].y, -1, PV25.w 27 z: MOV ____, PV26.z 28 t: MULLO_INT ____, PV27.z, KC0[1].y 29 w: MOV ____, PS28 30 y: ADD_INT R5.y, PV29.w, KC0[6].y 31 x: LSHR R0.x, KC1[10].x, (0x00000002, 2.802596929e-45f).x 03 WAIT_ACK: Outstanding_acks <= 0 04 TEX: ADDR(9888) CNT(1) 32 RD_SCATTER R0.x___, DWORD_PTR[0+R0.x], ELEM_SIZE(3) UNCACHED FORMAT(32_FLOAT) 05 ALU_PUSH_BEFORE: ADDR(1862) CNT(2) 33 x: SETE_INT R0.x, R0.x, 1 34 x: PREDE_INT ____, R0.x, 0.0f UPDATE_EXEC_MASK UPDATE_PRED 06 JUMP POP_CNT(1) ADDR(1763) 07 ALU: ADDR(1864) CNT(4) KCACHE0(CB1:0-15) 35 x: SETGT_INT R2.x, 0.0f, KC0[3].x y: ADD_INT R0.y, KC0[6].x, (0x00000004, 5.605193857e-45f).x t: MULLO_INT R0.x, R5.y, KC0[2].x 08 MEM_SCRATCH_WRITE_ACK: VEC_PTR[61].x___, R0, ARRAY_SIZE(66) ELEM_SIZE(3) 09 ALU: ADDR(1868) CNT(35) KCACHE0(CB1:0-15) 36 x: MOV R0.x, R0.x t: MOV R2.x, R2.x 37 x: SETGT_INT R3.x, KC0[1].x, 0.0f y: ADD_INT T1.y, KC0[1].x, PV36.x w: ADD_INT ____, KC0[3].x, PS36 38 x: LSHR R0.x, R0.y, (0x00000002, 2.802596929e-45f).x z: XOR_INT T0.z, R2.x, PV37.w t: LSHR R1.x, KC0[6].x, (0x00000002, 2.802596929e-45f).x 39 t: RCP_UINT T0.y, PV38.z 40 t: MULLO_UINT T0.x, T0.z, PS39 41 z: SUB_INT ____, 0.0f, PS40 t: MULHI_UINT T0.w, T0.z, T0.y 42 y: CNDE_INT ____, PS41, PV41.z, T0.x 43 t: MULHI_UINT ____, PV42.y, T0.y 44 z: ADD_INT ____, T0.y, PS43 w: SUB_INT ____, T0.y, PS43 45 y: CNDE_INT ____, T0.w, PV44.z, PV44.w 46 t: MULHI_UINT T0.x, PV45.y, (0x00100000, 1.469367939e-39f).x 47 x: ADD_INT T1.x, PS46, 1 w: ADD_INT T0.w, -1, PS46 t: MULLO_UINT ____, PS46, T0.z 48 y: SETGE_UINT T0.y, (0x00100000, 1.469367939e-39f).x, PS47 z: SUB_INT ____, (0x00100000, 1.469367939e-39f).x, PS47 49 w: SETGE_UINT ____, PV48.z, T0.z 50 z: AND_INT ____, T0.y, PV49.w 51 y: CNDE_INT ____, PV50.z, T0.x, T1.x 52 z: CNDE_INT ____, T0.y, T0.w, PV51.y 53 y: CNDE_INT ____, T0.z, -1, PV52.z 54 w: ADD_INT ____, R2.x, PV53.y 55 z: XOR_INT ____, R2.x, PV54.w 56 z: SUB_INT R0.z, PV55.z, KC0[1].x w: SETGT_INT R0.w, T1.y, PV55.z 10 WAIT_ACK: Outstanding_acks <= 0 11 TEX: ADDR(9890) CNT(1) 57 RD_SCRATCH R2.x___, VEC_PTR[61], ARRAY_SIZE(66) ELEM_SIZE(3) UNCACHED 12 ALU: ADDR(1903) CNT(3) 58 z: MOV R0.z, R0.z w: MOV R0.w, R0.w 59 x: CNDE_INT R2.x, PV58.w, R2.x, PV58.z 13 MEM_SCRATCH_WRITE_ACK: VEC_PTR[59].x___, R2, ARRAY_SIZE(66) ELEM_SIZE(3) 14 WAIT_ACK: Outstanding_acks <= 0 15 TEX: ADDR(9892) CNT(2) 60 RD_SCATTER R7.x___, DWORD_PTR[0+R0.x], ELEM_SIZE(3) UNCACHED FORMAT(32_FLOAT) 61 RD_SCATTER R0.x___, DWORD_PTR[0+R1.x], ELEM_SIZE(3) UNCACHED FORMAT(32_FLOAT) 16 MEM_SCRATCH_WRITE_ACK: VEC_PTR[43].x___, R0, ARRAY_SIZE(66) ELEM_SIZE(3) 17 ALU_PUSH_BEFORE: ADDR(1906) CNT(1) 62 x: PREDNE_INT ____, R3.x, 0.0f UPDATE_EXEC_MASK UPDATE_PRED 18 JUMP ADDR(30) 19 ALU: ADDR(1907) CNT(2) KCACHE0(CB1:0-15) 63 x: ASHR R2.x, KC0[3].x, (0x00000002, 2.802596929e-45f).x 20 WAIT_ACK: Outstanding_acks <= 0 21 TEX: ADDR(9896) CNT(1) 64 RD_SCRATCH R1.x___, VEC_PTR[60], ARRAY_SIZE(66) ELEM_SIZE(3) UNCACHED 22 TEX: ADDR(9898) CNT(1) 65 RD_SCRATCH R0.x___, VEC_PTR[59], ARRAY_SIZE(66) ELEM_SIZE(3) UNCACHED 23 ALU: ADDR(1909) CNT(18) KCACHE0(CB1:0-15) 66 x: LSHL R5.x, R2.x, (0x00000004, 5.605193857e-45f).x VEC_120 w: ADD_INT ____, R1.x, KC0[11].x t: MULLO_INT ____, R0.x, R2.x 67 x: MOV R2.x, 0.0f y: MOV R0.y, 0.0f z: ADD_INT ____, PS66, PV66.w w: MOV R0.w, 0.0f t: MOV R0.z, 0.0f 68 y: LSHL ____, PV67.z, (0x00000004, 5.605193857e-45f).x 69 x: ADD_INT R0.x, KC0[8].x, PV68.y 70 x: MOV R1.x, PV69.x 71 x: MOV R2.x, R2.x y: MOV R2.y, R0.y z: MOV R2.z, R0.z w: MOV R2.w, R0.w 72 x: MOV R0.x, R6.x 24 LOOP_DX10 i0 FAIL_JUMP_ADDR(30) 25 ALU: ADDR(1927) CNT(5) 73 x: ADD_INT R4.x, -1, R0.x t: LSHR R0.x, R1.x, (0x00000002, 2.802596929e-45f).x 74 x: SETNE_INT R3.x, PV73.x, 0.0f t: ADD_INT R1.x, R5.x, R1.x 26 TEX: ADDR(9900) CNT(1) 75 RD_SCATTER R0, DWORD_PTR[0+R0.x], ELEM_SIZE(3) UNCACHED 27 ALU_BREAK: ADDR(1932) CNT(5) 76 x: ADD R0.x, R2.x, R0.x y: ADD R4.y, R2.y, R0.y z: ADD R0.z, R2.z, R0.z w: ADD R0.w, R2.w, R0.w 77 x: PREDNE_INT ____, R3.x, 0.0f UPDATE_EXEC_MASK UPDATE_PRED 28 ALU: ADDR(1937) CNT(5) 78 x: MOV R2.x, R0.x y: MOV R2.y, R4.y z: MOV R2.z, R0.z w: MOV R2.w, R0.w 79 x: MOV R0.x, R4.x 29 ENDLOOP i0 PASS_JUMP_ADDR(25) 30 ELSE POP_CNT(1) ADDR(32) 31 ALU_POP_AFTER: ADDR(1942) CNT(5) 80 x: MOV R0.x, 0.0f 81 x: MOV R0.x, PV80.x y: MOV R4.y, R22.y z: MOV R0.z, R11.z w: MOV R0.w, R11.w 32 ALU: ADDR(1947) CNT(88) KCACHE0(CB1:0-15) 82 x: AND_INT T3.x, R0.x, (0x80000000, -0.0f).x y: AND_INT T2.y, R0.x, (0x7F800000, 1.#INFf).y z: MOV T2.z, R0.w w: AND_INT T3.w, R0.x, (0x807FFFFF, -1.175494211e-38f).z t: I_TO_F R0.w, KC0[1].x 83 x: MOV T0.x, PS82 y: AND_INT T0.y, PV82.z, (0x7F800000, 1.#INFf).x z: AND_INT T0.z, PV82.z, (0x807FFFFF, -1.175494211e-38f).y w: AND_INT T0.w, PV82.z, (0x80000000, -0.0f).z t: MOV R0.y, PS82 84 x: AND_INT ____, PV83.x, (0x807FFFFF, -1.175494211e-38f).x y: SETE_INT T1.y, PV83.y, (0x7F800000, 1.#INFf).y z: SETE_INT T1.z, PV83.y, 0.0f w: AND_INT R1.w, PV83.x, (0x7F800000, 1.#INFf).y t: AND_INT T1.w, PV83.x, (0x80000000, -0.0f).z 85 x: SETE_INT ____, PV84.w, 0.0f y: OR_INT ____, PV84.x, (0x3F800000, 1.0f).x z: OR_INT ____, T0.z, (0x3F800000, 1.0f).x w: SETE_INT T2.w, PV84.w, (0x7F800000, 1.#INFf).y t: SUB_INT T1.x, T0.y, PV84.w 86 x: OR_INT ____, T1.y, PV85.w y: CNDE_INT ____, T1.z, PV85.z, T0.w z: OR_INT ____, T1.z, PV85.x w: CNDE_INT ____, PV85.x, PV85.y, T1.w VEC_021 t: AND_INT T2.x, R0.y, (0x7F800000, 1.#INFf).x 87 x: CNDE_INT ____, T2.w, PV86.w, T0.x y: OR_INT T3.y, PV86.x, PV86.z z: CNDE_INT T2.z, T1.y, PV86.y, T2.z w: SETE_INT T2.w, T2.y, (0x7F800000, 1.#INFf).x VEC_120 t: SETE_INT T1.y, PS86, (0x7F800000, 1.#INFf).x 88 x: SETE_INT ____, T2.y, 0.0f y: CNDE_INT T0.y, PV87.y, T1.x, 0.0f z: OR_INT ____, T3.w, (0x3F800000, 1.0f).x w: SETE_INT ____, T2.x, 0.0f t: RCP_e R4.w, PV87.x 89 x: CNDE_INT ____, PV88.x, PV88.z, T3.x y: ASHR T1.y, PV88.y, (0x00000017, 3.222986468e-44f).x z: MUL_e ____, T2.z, PS88 w: OR_INT ____, T1.y, T2.w t: OR_INT ____, PV88.w, PV88.x 90 x: OR_INT R1.x, PV89.w, PS89 y: AND_INT ____, (0x7FFFFFFF, 1.#QNANf).x, PV89.z z: AND_INT T2.z, (0x80000000, -0.0f).y, PV89.z w: CNDE_INT ____, T2.w, PV89.x, R0.x t: ADD_INT T3.x, T0.y, PV89.z 91 x: SUB_INT ____, T2.y, R1.w y: MUL_e T2.y, R4.w, PV90.w z: OR_INT T1.z, PV90.z, (0x7F800000, 1.#INFf).x w: ASHR ____, PV90.y, (0x00000017, 3.222986468e-44f).y t: MOV T2.w, R0.z 92 x: AND_INT ____, (0x7FFFFFFF, 1.#QNANf).x, PV91.y y: ADD_INT ____, PV91.w, T1.y z: CNDE_INT ____, R1.x, PV91.x, 0.0f w: AND_INT R2.w, (0x80000000, -0.0f).y, PV91.y t: MOV T0.z, R0.w 93 x: SETGE_INT ____, PV92.y, (0x000000FF, 3.573311084e-43f).x y: ASHR ____, PV92.z, (0x00000017, 3.222986468e-44f).y z: ASHR ____, PV92.x, (0x00000017, 3.222986468e-44f).y w: SETGE_INT ____, 0.0f, PV92.y t: ADD_INT R2.x, PV92.z, T2.y 94 x: ADD_INT ____, PV93.z, PV93.y y: OR_INT R1.y, R2.w, (0x7F800000, 1.#INFf).x z: CNDE_INT ____, PV93.w, T3.x, T2.z w: CNDE_INT ____, T3.y, PV93.x, 0.0f t: AND_INT R2.y, T2.w, (0x7F800000, 1.#INFf).x 95 x: CNDE_INT R0.x, PV94.w, PV94.z, T1.z y: SETGE_INT R0.y, PV94.x, (0x000000FF, 3.573311084e-43f).x z: SETGE_INT R2.z, 0.0f, PV94.x w: AND_INT R6.w, T0.z, (0x7F800000, 1.#INFf).y t: AND_INT R3.w, T2.w, (0x807FFFFF, -1.175494211e-38f).z 33 MEM_SCRATCH_WRITE: VEC_PTR[46].x___, R0, ARRAY_SIZE(66) ELEM_SIZE(3) 34 ALU: ADDR(2035) CNT(15) 96 y: MOV R0.y, R0.y z: MOV R2.z, R2.z w: MOV R6.w, R6.w 97 x: SETE_INT R2.x, R2.y, (0x7F800000, 1.#INFf).x y: CNDE_INT ____, PV96.z, R2.x, R2.w z: CNDE_INT ____, R1.x, PV96.y, 0.0f w: SETE_INT R2.w, PV96.w, (0x7F800000, 1.#INFf).x t: SETE_INT R1.x, R2.y, 0.0f 98 x: CNDE_INT R0.x, PV97.z, PV97.y, R1.y y: SETE_INT R0.y, R6.w, 0.0f z: AND_INT R2.z, R0.z, (0x80000000, -0.0f).x w: MOV R9.w, R0.x t: OR_INT R1.y, R3.w, (0x3F800000, 1.0f).y 35 MEM_SCRATCH_WRITE: VEC_PTR[54].x___, R0, ARRAY_SIZE(66) ELEM_SIZE(3) 36 ALU: ADDR(2050) CNT(58) 99 y: MOV R1.y, R1.y z: MOV R2.z, R2.z t: MOV R0.y, R0.y 100 x: CNDE_INT ____, R1.x, PV99.y, PV99.z y: OR_INT ____, PS99, R1.x z: OR_INT ____, R2.w, R2.x VEC_021 w: SUB_INT T3.w, R2.y, R1.w t: MOV ____, R4.y 101 x: OR_INT R3.x, PV100.z, PV100.y y: AND_INT R2.y, PS100, (0x7F800000, 1.#INFf).x z: MOV ____, R0.w w: CNDE_INT ____, R2.x, PV100.x, R0.z t: AND_INT R0.w, PS100, (0x807FFFFF, -1.175494211e-38f).y 102 x: AND_INT R1.x, PV101.z, (0x7F800000, 1.#INFf).x y: MUL_e R0.y, R4.w, PV101.w z: CNDE_INT R0.z, PV101.x, T3.w, 0.0f w: SETE_INT R2.w, PV101.y, (0x7F800000, 1.#INFf).x t: SETE_INT R1.y, PV101.y, 0.0f 103 x: MOV R1.x, R1.x y: MOV R0.y, R0.y z: MOV R0.z, R0.z 104 x: AND_INT R0.x, (0x7FFFFFFF, 1.#QNANf).x, PV103.y y: ASHR R0.y, PV103.z, (0x00000017, 3.222986468e-44f).y z: ADD_INT R0.z, PV103.z, PV103.y w: AND_INT R3.w, (0x80000000, -0.0f).z, PV103.y t: SETE_INT R2.x, PV103.x, (0x7F800000, 1.#INFf).w 105 x: AND_INT ____, R4.y, (0x80000000, -0.0f).x y: OR_INT T2.y, R3.w, (0x7F800000, 1.#INFf).y z: ASHR ____, R0.x, (0x00000017, 3.222986468e-44f).z w: SETE_INT ____, R1.x, 0.0f VEC_120 t: OR_INT ____, R0.w, (0x3F800000, 1.0f).w 106 x: OR_INT ____, PV105.w, R1.y y: CNDE_INT ____, R1.y, PS105, PV105.x VEC_120 z: OR_INT ____, R2.x, R2.w VEC_021 w: ADD_INT ____, PV105.z, R0.y VEC_102 t: SUB_INT T3.w, R2.y, R1.w 107 x: SETGE_INT ____, PV106.w, (0x000000FF, 3.573311084e-43f).x y: OR_INT R2.y, PV106.z, PV106.x z: SETGE_INT ____, 0.0f, PV106.w w: CNDE_INT ____, R2.w, PV106.y, R4.y 108 x: MUL_e ____, R4.w, PV107.w y: CNDE_INT ____, PV107.z, R0.z, R3.w z: CNDE_INT ____, PV107.y, T3.w, 0.0f w: CNDE_INT ____, R3.x, PV107.x, 0.0f 109 x: CNDE_INT R0.x, PV108.w, PV108.y, T2.y y: AND_INT R0.y, (0x7FFFFFFF, 1.#QNANf).x, PV108.x z: ASHR R0.z, PV108.z, (0x00000017, 3.222986468e-44f).y w: AND_INT R0.w, (0x80000000, -0.0f).z, PV108.x t: ADD_INT R1.y, PV108.z, PV108.x 37 MEM_SCRATCH_WRITE: VEC_PTR[45].x___, R0, ARRAY_SIZE(66) ELEM_SIZE(3) 38 ALU: ADDR(2108) CNT(14) 110 x: MOV R0.x, R0.x y: MOV R0.y, R0.y w: MOV R0.w, R0.w 111 x: ASHR ____, PV110.y, (0x00000017, 3.222986468e-44f).x y: OR_INT T1.y, PV110.w, (0x7F800000, 1.#INFf).y z: MOV R9.z, PV110.x 112 w: ADD_INT ____, PV111.x, R0.z 113 y: SETGE_INT ____, PV112.w, (0x000000FF, 3.573311084e-43f).x z: SETGE_INT ____, 0.0f, PV112.w 114 z: CNDE_INT ____, PV113.z, R1.y, R0.w w: CNDE_INT ____, R2.y, PV113.y, 0.0f 115 x: CNDE_INT R0.x, PV114.w, PV114.z, T1.y 39 MEM_SCRATCH_WRITE: VEC_PTR[44].x___, R0, ARRAY_SIZE(66) ELEM_SIZE(3) 40 ALU_PUSH_BEFORE: ADDR(2122) CNT(5) KCACHE0(CB1:0-15) 116 x: MOV R0.x, R0.x 117 y: MOV R20.y, PV116.x 118 x: SETE_INT R0.x, KC0[5].x, (0x00000003, 4.203895393e-45f).x 119 x: PREDNE_INT ____, R0.x, 0.0f UPDATE_EXEC_MASK UPDATE_PRED 41 JUMP ADDR(44) 42 ALU: ADDR(2127) CNT(14) KCACHE0(CB1:0-15) 120 z: LSHR ____, KC0[1].x, (0x0000001F, 4.344025239e-44f).x w: LSHL T0.w, KC0[1].x, 1 121 y: ADD_INT ____, KC0[1].x, PV120.z t: MULHI_INT ____, PV120.w, (0xAAAAAAAB, -3.031649096e-13f).x 122 x: ASHR R1.x, PV121.y, 1 y: ADD_INT ____, T0.w, PS121 123 x: LSHR ____, PV122.y, (0x0000001F, 4.344025239e-44f).x w: ASHR ____, PV122.y, 1 124 x: ADD_INT R0.x, PV123.x, PV123.w 125 x: MOV R4.x, PV124.x 126 x: MOV R0.x, R1.x 43 MEM_SCRATCH_WRITE_ACK: VEC_PTR[57].x___, R0, ARRAY_SIZE(66) ELEM_SIZE(3) 44 ELSE POP_CNT(1) ADDR(55) 45 ALU_PUSH_BEFORE: ADDR(2141) CNT(3) KCACHE0(CB1:0-15) 127 x: SETE_INT R0.x, KC0[5].x, (0x00000004, 5.605193857e-45f).x 128 x: PREDNE_INT ____, R0.x, 0.0f UPDATE_EXEC_MASK UPDATE_PRED 46 JUMP ADDR(48) 47 ALU: ADDR(2144) CNT(13) KCACHE0(CB1:0-15) 129 t: MULLO_INT T0.w, KC0[1].x, (0x00000003, 4.203895393e-45f).x 130 w: ASHR ____, PS129, (0x0000001F, 4.344025239e-44f).x t: MULHI_INT ____, PS129, (0x66666667, 2.720083202e23f).y 131 x: ASHR ____, PS130, 1 y: LSHR ____, PS130, (0x0000001F, 4.344025239e-44f).x z: LSHR ____, PV130.w, (0x0000001E, 4.203895393e-44f).y 132 x: ADD_INT R6.x, PV131.y, PV131.x y: ADD_INT ____, T0.w, PV131.z 133 x: ASHR R4.x, PV132.y, (0x00000002, 2.802596929e-45f).x 48 ELSE POP_CNT(1) ADDR(52) 49 ALU_PUSH_BEFORE: ADDR(2157) CNT(3) KCACHE0(CB1:0-15) 134 x: SETE_INT R4.x, KC0[5].x, (0x00000005, 7.006492322e-45f).x 135 x: PREDNE_INT ____, R4.x, 0.0f UPDATE_EXEC_MASK UPDATE_PRED 50 JUMP POP_CNT(2) ADDR(52) 51 ALU_POP2_AFTER: ADDR(2160) CNT(14) KCACHE0(CB1:0-15) 136 x: LSHL T0.x, KC0[1].x, (0x00000002, 2.802596929e-45f).x 137 t: MULHI_INT ____, PV136.x, (0xAAAAAAAB, -3.031649096e-13f).x 138 z: ADD_INT ____, T0.x, PS137 t: MULHI_INT ____, T0.x, (0x66666667, 2.720083202e23f).x 139 x: ASHR ____, PV138.z, (0x00000002, 2.802596929e-45f).x y: LSHR ____, PV138.z, (0x0000001F, 4.344025239e-44f).y z: LSHR ____, PS138, (0x0000001F, 4.344025239e-44f).y w: ASHR ____, PS138, 1 140 x: ADD_INT R6.x, PV139.y, PV139.x t: ADD_INT R4.x, PV139.z, PV139.w 52 ALU: ADDR(2174) CNT(1) 141 x: MOV R0.x, R6.x 53 MEM_SCRATCH_WRITE_ACK: VEC_PTR[57].x___, R0, ARRAY_SIZE(66) ELEM_SIZE(3) 54 POP (1) ADDR(55) 55 WAIT_ACK: Outstanding_acks <= 0 56 TEX: ADDR(9902) CNT(1) 142 RD_SCRATCH R2.x___, VEC_PTR[60], ARRAY_SIZE(66) ELEM_SIZE(3) UNCACHED 57 TEX: ADDR(9904) CNT(1) 143 RD_SCRATCH R1.x___, VEC_PTR[59], ARRAY_SIZE(66) ELEM_SIZE(3) UNCACHED 58 ALU: ADDR(2175) CNT(7) KCACHE0(CB1:0-15) 144 x: ASHR R0.x, KC0[3].x, (0x00000002, 2.802596929e-45f).x 145 x: ADD_INT R2.x, R2.x, KC0[11].x y: ADD_INT R1.y, (0xF

Outcomes