cancel
Showing results for 
Search instead for 
Did you mean: 

Archives Discussions

richeek_arya
Journeyman III

APP Kernel Analyzer gives no output

Hi all,

I have written a kernel code which is basically doing a Depth First Binary tree search.  The output of a search is list of K best candidates with corresponding distance from the 'Key' that was input to the search.

This kernel is not an optimized one and I want to know what are the things I could do to make it run as fast as possible. So I put this kernel code in the kernel analyzer. However I can only see N/A in every field. Also I am getting a warning "Warning: sphere_decoder kernel has register spilling. Lower performance is expected." , which I guess refers to that I may be using more private memory (registers). How to avoid it?

My kernel  calls  different functions and I have also implemented a stack. Could they further lower my performance? Would declaring functions inline speed up the processing. I have attached my full kernel code. I would highly appreciate if I can get suggestions to improve it further.

Thanks.

Richeek

#pragma OPENCL EXTENSION cl_amd_printf : enable #define Mt 4 #define Mr 4 #define MOD_SCHEME 16 #define bitSize 4 #define STACKSIZE 10 #define initial_SC 100000.0f #define CLIP 100000.0f typedef struct my_comp { float2 data; } comp; inline comp mul(comp num1, comp num2) { comp temp; temp.data.x = num1.data.x*num2.data.x - num1.data.y*num2.data.y; temp.data.y = num1.data.x*num2.data.y + num1.data.y*num2.data.x; return temp; } inline comp div(comp num1, comp num2) { comp temp; float mag = num2.data.x*num2.data.x + num2.data.y*num2.data.y; temp.data.x = (num1.data.x*num2.data.x + num1.data.y*num2.data.y)/mag; temp.data.y = (num1.data.y*num2.data.x - num1.data.x*num2.data.y)/mag; return temp; } inline comp add(comp num1, comp num2) { comp temp; temp.data.x = num1.data.x + num2.data.x; temp.data.y = num1.data.y + num2.data.y; return temp; } inline comp sub(comp num1, comp num2) { comp temp; temp.data.x = num1.data.x - num2.data.x; temp.data.y = num1.data.y - num2.data.y; return temp; } void equal(comp *num1, comp num2) { num1->data.x = num2.data.x; num1->data.y = num2.data.y; } typedef struct _node { int level, index; float ped; }node; typedef struct stack { node n[STACKSIZE]; int top; }stack_class; void init_stack(stack_class *s) { s->top = -1; } void push(stack_class *s, node _n) { s->top = s->top+1; if(s->top == STACKSIZE) printf("PROBLEM STACK IS FULL\n"); s->n[s->top] = _n; } node pop(stack_class *s) { if(s->top>=0) { node nn = s->n[s->top]; s->top = s->top - 1; return nn; } else { node temp; temp.index = -456; printf("STACK IS EMPTY CANT POP\n"); return temp; } } bool is_empty(stack_class *s) { if(s->top == -1) return true; return false; } /* Only for 16QAM */ void get_cord_val(int *qam_sym, comp *x) { int xVal=0, yVal=0; float xCor, yCor; for(int i=0;i<bitSize/2;i++) yVal = 2*yVal + qam_sym; for(int i=bitSize/2;i<bitSize;i++) xVal = 2*xVal + qam_sym; switch(xVal) { case 0: xCor=-3; break; case 1: xCor=-1; break; case 3: xCor=1; break; case 2: xCor=3; break; } switch(yVal) { case 0: yCor=-3; break; case 1: yCor=-1; break; case 3: yCor=1; break; case 2: yCor=3; break; } x->data.x = xCor; x->data.y = yCor; } void get_symbol(comp y, comp *parent_sig, __global float *R_re, __global float *R_im, int level, int tx, comp *cor) { int yRe, yIm; int min_x, max_x, min_y, max_y, j=0, i=0; comp y_scaled; equal(&y_scaled, y); comp temp, temp2; for(int i=level+1; i<tx; ++i) { temp.data.x = 1.0f*parent_sig.data.x; temp.data.y = 1.0f*parent_sig.data.y; temp2.data.x = R_re; temp2.data.y = R_im; y_scaled = sub(y_scaled,mul(temp2,temp)); } temp2.data.x = R_re[level]; temp2.data.y = R_im[level]; y_scaled = div(y_scaled, temp2); float y_re = y_scaled.data.x, y_im = y_scaled.data.y; if(MOD_SCHEME == 16) //16 QAM { if(y_re < -2.0f) yRe = -3; else if(y_re >= -2.0f && y_re < 0.0f) yRe = -1; else if(y_re >= 0.0f && y_re < 2.0f) yRe = 1; else yRe = 3; if(y_im < -2.0f) yIm = -3; else if(y_im >= -2.0f && y_im < 0.0f) yIm = -1; else if(y_im >= 0.0f && y_im < 2.0f) yIm = 1; else yIm = 3; cor[0].data.x = yRe; cor[0].data.y = yIm; j=1; min_x = min_y = -3; max_x = max_y = 3; i = 1; int k, flag = 0; while(j<16) { //cout<<"get symbol 2\n"; min_x = yRe-2*i; max_x = yRe+2*i; min_y = yIm-2*i; max_y = yIm+2*i; int min = min_x, max=max_x, x, y=max_y; k=min; flag = 0; //while box is not complete for(unsigned int l=0;l<4;++l) { while(k!=max) { if(!flag || flag == 2) { if(k >= -3 && k <= 3 && y >= -3 && y<= 3) { cor.data.x = k*1.0f; cor.data.y = y*1.0f; ++j; } } else if (flag == 1 || flag == 3) { if(k >= -3 && k <= 3 && x >= -3 && x<= 3) { cor.data.x = x; cor.data.y = k; ++j; } } if(!flag || flag == 3) k+= 2; else k-= 2; } if(!flag) { flag = 1; min = max_y; max = min_y; x = max_x; } else if(flag == 1) { flag = 2; min = max_x; max = min_x; y = min_y; } else if(flag == 2) { flag = 3; min = min_y; max = max_y; x = min_x; } k = min; } ++i; } } } float getPed(comp y, __global float *R_re, __global float *R_im, comp *parent_sig, comp sym, int curr_level, float ped_parent) { comp b, e; b.data.x = 0; b.data.y = 0; e.data.x = 0; e.data.y = 0; comp sig, temp2; for(int i=curr_level+1; i<Mt; ++i) { sig.data.x = 1.0f*parent_sig.data.x; sig.data.y = 1.0f*parent_sig.data.y; temp2.data.x = R_re; temp2.data.y = R_im; b = add(b, mul(temp2,sig)); } b = sub(y,b); temp2.data.x = R_re[curr_level]; temp2.data.y = R_im[curr_level]; sig = mul(temp2,sym); e = sub(b,sig); float val = ped_parent + (e.data.x*e.data.x + e.data.y*e.data.y); return val; } void get_bits(comp qam_sym, int *bits) { if(MOD_SCHEME == 16) { if(qam_sym.data.y == -3) { bits[0] = bits[1] = 0; } else if(qam_sym.data.y == -1) { bits[0] = 0; bits[1] = 1; } else if(qam_sym.data.y == 1) { bits[0] = bits[1] = 1; } else //if(qam_sym.imag() == 3) { bits[0] = 1; bits[1] = 0; } if(qam_sym.data.x == -3) { bits[2] = bits[3] = 0; } else if(qam_sym.data.x == -1) { //bits[2] = 1; bits[3] = 0; bits[2] = 0; bits[3] = 1; } else if(qam_sym.data.x == 1) { bits[2] = bits[3] = 1; } else //if(qam_sym.real() == 3) { bits[2] = 1; bits[3] = 0; } } } __kernel void sphere_decoder(const int block_length, const float noise_power, __global float *block_data, const int LIST_SIZE, __global float *llr, __global float *cand_dist, __global float *cand_sym, __global float *R_re, __global float *R_im, __global float *qr_noise_re, __global float *qr_noise_im) { //printf("f hello \n"); uint tid = get_global_id(0); //printf("f hello %d\n",tid); int bitstream[Mt*bitSize], stride = Mt*bitSize; for(int i=0; i<stride; ++i) bitstream = block_data[tid*stride + i]; comp x[Mt], y[Mr], best_sig[Mr], parents[Mr]; int level = Mt-1, index_level[Mr]; for(int i = 0; i<Mt; ++i) { get_cord_val(bitstream+i*Mt, &x); } //// generate the y vector // comp temp_const0, temp2; temp_const0.data.x = 0.0f; temp_const0.data.y = 0.0f; for(int i=0; i<Mr; ++i) { equal(&y,temp_const0); equal(&best_sig,temp_const0); equal(&parents,temp_const0); index_level = 0; for(int j=0; j<Mt; ++j) { temp2.data.x = R_re[i*Mt+j]; temp2.data.y = R_im[i*Mt+j]; y = add(y,mul(temp2, x)); // printf("IN LOOP %f+j%f\n",y.data.x, y.data.y); } // adding noise here temp2.data.x = qr_noise_re; temp2.data.y = qr_noise_im; } //for(int i=0;i<16;++i) stack_class s; init_stack(&s); node nn; nn.level = level+1; nn.index = -1; nn.ped = 0.0f; push(&s, nn); float ped_parent = 0.0f, SC_ML = initial_SC, max_cand_dist = 0.0f, bit_plus_1_dist, bit_minus_1_dist; comp cor[Mt*MOD_SCHEME]; int list_size = 0, max_cand_index = 0, i, j, curr_level = nn.level, temp_bits[bitSize]; while(!is_empty(&s)) { for(j=curr_level-1; j>=0; --j) { get_symbol(y, parents, R_re+j*Mt, R_im+j*Mt, j, Mt, cor+j*MOD_SCHEME); nn.ped = getPed(y, R_re+j*Mt, R_im+j*Mt, parents, cor[j*MOD_SCHEME + index_level], j, ped_parent); equal(&parents,cor[j*MOD_SCHEME+index_level]); nn.index = index_level++; nn.level = j; ped_parent = nn.ped; if(j == 0 && nn.ped < SC_ML) //reached the leaf node { if(list_size < LIST_SIZE) { cand_dist[tid*LIST_SIZE + list_size] = nn.ped; if(cand_dist[tid*LIST_SIZE + list_size] > max_cand_dist) { max_cand_dist = cand_dist[tid*LIST_SIZE + list_size]; max_cand_index = list_size; } for(int k=0; k<Mt; k++) { get_bits(cor[k*MOD_SCHEME+index_level-1],temp_bits); for(int l=0;l<bitSize;++l) { cand_sym[list_size*block_length+ tid*stride + k*bitSize+l] = (float)temp_bits; if(temp_bits==1) cand_sym[list_size*block_length+ tid*stride + k*bitSize+l] = -1.0f; else if(temp_bits==0) cand_sym[list_size*block_length+ tid*stride + k*bitSize+l] = 1.0f; } } list_size++; } else if(nn.ped < max_cand_dist) { cand_dist[max_cand_index+tid*LIST_SIZE] = nn.ped; /* Replace this candidate */ for(int k=0; k<Mt; ++k) { get_bits(cor[k*MOD_SCHEME+index_level-1],temp_bits); for(int l=0;l<bitSize;++l) { cand_sym[max_cand_index*block_length+ tid*stride + k*bitSize+l] = (float)temp_bits; if(temp_bits==1) cand_sym[max_cand_index*block_length+ tid*stride + k*bitSize+l] = -1.0f; if(temp_bits==0) cand_sym[max_cand_index*block_length+ tid*stride + k*bitSize+l] = 1.0f; } } /* find the next max candidate */ max_cand_dist = -1.0f; for(int k=0; k<LIST_SIZE; ++k) { if(cand_dist[tid*LIST_SIZE + k] > max_cand_dist) { max_cand_dist = cand_dist[tid*LIST_SIZE + k]; max_cand_index = k; } } } if(list_size == LIST_SIZE) SC_ML = max_cand_dist; } else if(nn.ped >= SC_ML && list_size == LIST_SIZE) //tree pruning { break; } if(j>=1) push(&s, nn); }//end for curr_level nn = pop(&s); curr_level = nn.level; ped_parent = nn.ped; while(nn.index >= MOD_SCHEME) { index_level[nn.level] = 0; nn = pop(&s); } /* going to the next child */ /* set the level for the children below curr level to zero */ for(i=curr_level-2;i>=0; --i) index_level = 0; if(index_level[curr_level-1] < MOD_SCHEME-1 || (nn.index == -1 && index_level[curr_level-1] == MOD_SCHEME-1)) push(&s,nn); } //end of while /* calculate LLRs here */ for(int l=0; l<stride; ++l) //for each bit of the MIMO symbol { bit_plus_1_dist = bit_minus_1_dist = INT_MAX*1.0f; for(int k = 0; k<LIST_SIZE; ++k) { if(cand_sym[k*block_length+ tid*stride +l] == -1.0f && cand_dist[tid*LIST_SIZE+k] < bit_minus_1_dist) { bit_minus_1_dist = cand_dist[tid*LIST_SIZE+k]; } else if(cand_sym[k*block_length+ tid*stride +l] == 1.0f && cand_dist[tid*LIST_SIZE+k] < bit_plus_1_dist ) { bit_plus_1_dist = cand_dist[tid*LIST_SIZE+k]; } } llr[tid*stride + l] = (1/noise_power)*(bit_minus_1_dist - bit_plus_1_dist); if(tid == 0) { printf("LLRs:%d %f %f %f %f\n",l, bit_plus_1_dist, bit_minus_1_dist, llr[tid*stride+l], cand_sym[1920]); } if(llr[tid*stride+l] > CLIP) llr[tid*stride + l] = CLIP; else if(llr[tid*stride+l]<-CLIP) llr[tid*stride+l] = -CLIP; } }

0 Likes
6 Replies
richeek_arya
Journeyman III

System Information: ATI Radeon 5450 Graphics Card

                               AMD APP Kernel Analyzer 1.8

                               AMD APP SDK v2.4

 


0 Likes

Thanks for reporting the issue.

0 Likes

Hi,

I have been able to reduce scratch register usage so the register spilling warning is no longer coming.  However kernel analyzer still gives not output.

I have another strange problem, when I increase the thread count my simulation just freezes down and I have to reboot my system. Does any one know what may be causing it?? Could there be a bug in the kernel?

Regards,

Richeek

0 Likes
pasniak
Journeyman III

I get the same issue when I compile a code structured as

for_loop

     switch x

      case a

          for_loop

     case b

          for_loop2

0 Likes

commenting out switch x or for_loop2 makes N/A disappear

and commenting out the outermost loop makes KernelAnalyzer crash

0 Likes

I frequently get N/A for compiler statistics...infact I haven't had a kernel statistic in a long time.

I understand this will usually happen with loops where it is difficult for the compiler to predict the 'average' or typical execution time. However while I understand it may not be possible to have figures like threads/sec or meaningful ALU/MEM bottleneck indicators...it would be nice to atleast have a total ALU and MEM instructions for my kernel to allow me to perform simple arithmetic optimisations (particularly with VLIW where it is difficult to envision how the compiler packs instructions)

0 Likes