15 Replies Latest reply on May 10, 2010 3:48 PM by Raistmer

    Internal error: Link failed

    Raistmer
      what to do with it?

      After upgrade from Catalyst 9.2 to 10.3 I recived this error at program start:

      Error: Building Program (clBuildProgram)
      Internal error: Link failed.
      Make sure the system setup is correct.

      How to avoid this error?
        • Internal error: Link failed
          omkaranathan

          Please post your system config. Did you uninstall the previous drivers properly?

            • Internal error: Link failed
              Raistmer
              I reinstalled Cat 10.3 from scratch - didn't help.

              But now I know more interesting things about this issue.
              1) That error message appears only when CL file contain this kernel:

              2) When CL file compiled w/o that kernel, it builds ok, but app summons driver restart (!) at this place:
              (on second buffer read with non-zero offset).

              With Cat 9.2 kernel incorrectly overwrites buffer beginning, but no driver restarts. I provided info about this in another thread.

              Failed kernel: //>>>>>>>> W/O FFA buffers filling for now __kernel void find_single_pulse_kernel_128(__global float* gpu_power, __constant float* thresh, __constant float* best_powers, __global float* best_powers_new, __global uint* results, const int num_dchunks, const int need_small_FFA_data,const int need_large_FFA_data, __global float* small_neg,__global float* small_pos, __global float* large_neg,__global float* large_pos) { //R: each work-item will handle separate data_chunk/dm/sign power array through all needed coadds //R: to avoid host to device data transfers (very costly for current implementation) //R: thresholds and best power values will be maintained separately for CPU and GPU //R: thresholds are constant, but best pulses will need update. //R: need_*_FFA_data are non-zero (if needed) offsets into gpu_power buffer //R: we store small FFA buffers (negative first, then positive) right after coadds buffer //R: (coadds buffer take same size as initial power array thought it actually slightly smaller, just to simplify //R: buffer size calculations for now) then larger FFA buffers go, first negative one, next positive, to match with //R: signs order for GPU dechirping. uint tid=get_global_id(0); uint dchunk=get_global_id(1); uint len4 = 32768 >> 3; uint4 coadd_offsets=(uint4)(0); uint4 coadd_begins=(uint4)(0); uint4 coadd_offsets2=(uint4)(0); uint4 coadd_begins2=(uint4)(0); float4 t=vload4(0,thresh);//thresholds for initial array and 3 first coadds float4 t2=vload4(1,thresh);//thresholds for next 4 coadds uint result=0;//R: will contain info about location of found pulses and best signals uint4 was_pulse=(uint4)(0); uint4 was_pulse2=(uint4)(0); uint m1=0; //R: low 16 bits for pulses above threshold, each coadd level coded as enabled bit in corresponding position //R: high 16 bits for best pulses, if were, again, enabled bit means found pulse //R: will do unroll for coadds to save reads from global memory float4 d[128]; coadd_begins.x=0;//R:initial array - no need to write, in gpu_power domain coadd_begins.y=6*32*(32768>>3); coadd_begins.z=coadd_begins.y+6*(32768>>4)*32; coadd_begins.w=coadd_begins.z+6*(32768>>5)*32; coadd_begins2.x=coadd_begins.w+6*(32768>>6)*32; coadd_begins2.y=coadd_begins2.x+6*(32768>>7)*32; coadd_begins2.z=coadd_begins2.y+6*(32768>>8)*32; coadd_begins2.w=coadd_begins2.z+6*(32768>>9)*32; coadd_offsets.x=coadd_begins.x+(dchunk*32+tid)*(32768>>3);//l==0 coadd_offsets.y=coadd_begins.y+(dchunk*32+tid)*(32768>>4); coadd_offsets.z=coadd_begins.z+(dchunk*32+tid)*(32768>>5); coadd_offsets.w=coadd_begins.w+(dchunk*32+tid)*(32768>>6); coadd_offsets2.x=coadd_begins2.x+(dchunk*32+tid)*(32768>>7); coadd_offsets2.y=coadd_begins2.y+(dchunk*32+tid)*(32768>>8); coadd_offsets2.z=coadd_begins2.z+(dchunk*32+tid)*(32768>>9); coadd_offsets2.w=coadd_begins2.w+(dchunk*32+tid)*(32768>>10); for(int m=0;m<len4;m+=128){//R:loading data piece of 128*4 samples m1=coadd_offsets.x+m; for(int i=0;i<128;i++){ d[i]=vload4(m1+i,gpu_power); } if(was_pulse.x==0){//R: if pulse already detected no need to check other elements, it will be done on CPU anyway for(int i=0;i<128;i++){if ( (d[i].x>t.x)||(d[i].y>t.x)||(d[i].z>t.x)||(d[i].w>t.x) ){was_pulse.x=1;break;}} } //R: now do coadd and save coadded piece in memory m1=m>>1+coadd_offsets.y; for(int i=0;i<64;i++){ d[i].xy=d[2*i].xz+d[2*i].yw;d[i].zw=d[2*i+1].xz+d[2*i+1].yw; vstore4(d[i],m1,gpu_power); } //R: next coadd level ready to check for pulses if(was_pulse.y==0){//R: if pulse already detected no need to check other elements, it will be done on CPU anyway for(int i=0;i<64;i++){if ( (d[i].x>t.y)||(d[i].y>t.y)||(d[i].z>t.y)||(d[i].w>t.y) ){was_pulse.y=1;break;}} } //R: coadd level 2 starts m1=m>>2+coadd_offsets.z; for(int i=0;i<32;i++){ d[i].xy=d[2*i].xz+d[2*i].yw;d[i].zw=d[2*i+1].xz+d[2*i+1].yw; vstore4(d[i],m1,gpu_power); } if(was_pulse.z==0){//R: if pulse already detected no need to check other elements, it will be done on CPU anyway for(int i=0;i<32;i++){if ( (d[i].x>t.z)||(d[i].y>t.z)||(d[i].z>t.z)||(d[i].w>t.z) ){was_pulse.z=1;break;}} } //R: coadd level 3 starts m1=m>>3+coadd_offsets.w; for(int i=0;i<16;i++){ d[i].xy=d[2*i].xz+d[2*i].yw;d[i].zw=d[2*i+1].xz+d[2*i+1].yw; vstore4(d[i],m1,gpu_power); } if(was_pulse.w==0){//R: if pulse already detected no need to check other elements, it will be done on CPU anyway for(int i=0;i<16;i++){if ( (d[i].x>t.w)||(d[i].y>t.w)||(d[i].z>t.w)||(d[i].w>t.w) ){was_pulse.w=1;break;}} } //R: coadd level 4 starts m1=m>>4+coadd_offsets2.x; for(int i=0;i<8;i++){ d[i].xy=d[2*i].xz+d[2*i].yw;d[i].zw=d[2*i+1].xz+d[2*i+1].yw; vstore4(d[i],m1,gpu_power); } if(was_pulse2.x==0){//R: if pulse already detected no need to check other elements, it will be done on CPU anyway for(int i=0;i<8;i++){if ( (d[i].x>t2.x)||(d[i].y>t2.x)||(d[i].z>t2.x)||(d[i].w>t2.x) ){was_pulse2.x=1;break;}} } //R: coadd level 5 starts m1=m>>5+coadd_offsets2.y; for(int i=0;i<4;i++){ d[i].xy=d[2*i].xz+d[2*i].yw;d[i].zw=d[2*i+1].xz+d[2*i+1].yw; vstore4(d[i],m1,gpu_power); } if(was_pulse2.y==0){//R: if pulse already detected no need to check other elements, it will be done on CPU anyway for(int i=0;i<4;i++){if ( (d[i].x>t2.y)||(d[i].y>t2.y)||(d[i].z>t2.y)||(d[i].w>t2.y) ){was_pulse2.y=1;break;}} } //R: coadd level 6 starts m1=m>>6+coadd_offsets2.z; for(int i=0;i<2;i++){ d[i].xy=d[2*i].xz+d[2*i].yw;d[i].zw=d[2*i+1].xz+d[2*i+1].yw; vstore4(d[i],m1,gpu_power); } if(was_pulse2.z==0){//R: if pulse already detected no need to check other elements, it will be done on CPU anyway for(int i=0;i<2;i++){if ( (d[i].x>t2.z)||(d[i].y>t2.z)||(d[i].z>t2.z)||(d[i].w>t2.z) ){was_pulse2.z=1;break;}} } //R: coadd level 7 starts m1=m>>7+coadd_offsets2.w; d[0].xy=d[0].xz+d[0].yw;d[0].zw=d[1].xz+d[1].yw; vstore4(d[0],m1,gpu_power); if(was_pulse2.w==0){//R: if pulse already detected no need to check other elements, it will be done on CPU anyway if ( (d[0].x>t2.w)||(d[0].y>t2.w)||(d[0].z>t2.w)||(d[0].w>t2.w) ){was_pulse2.w=1;} } }//for m result+=was_pulse.x+was_pulse.y<<1+was_pulse.z<<2+was_pulse.w<<3+ was_pulse2.x<<4+was_pulse2.y<<5+was_pulse2.z<<6+was_pulse2.w<<7; //R: now lets repeat for next few coadd levels and refill registers array from GPU memory t=vload4(2,thresh); was_pulse=(uint4)(0); coadd_begins.x=coadd_begins2.w;//R: will fetch data stored at the end of prev coadd bunch loop coadd_begins.y=coadd_begins.x+6*(32768>>10)*32; coadd_begins.z=coadd_begins.y+6*(32768>>11)*32; //coadd_begins.w=coadd_begins.z+6*(32768>>12)*32;; coadd_offsets.x=coadd_begins.x+(dchunk*32+tid)*(32768>>11); coadd_offsets.y=coadd_begins.y+(dchunk*32+tid)*(32768>>12); coadd_offsets.z=coadd_begins.z+(dchunk*32+tid)*(32768>>13); //coadd_offsets.w=coadd_begins.w+(dchunk*32+tid)*(32768>>10); for(int m=0;m<32;m++){//R: there is only 32x4 elements left in coadded array, no inner loop required m1=m+coadd_offsets.x; d[m]=vload4(m1,gpu_power); } //R: this coadd level analysed already, so just do coadd for(int m=0;m<16;m++){ m1=m+coadd_offsets.y; d[m].xy=d[2*m].xz+d[2*m].yw;d[m].zw=d[2*m+1].xz+d[2*m+1].yw; vstore4(d[m],m1,gpu_power); }//R: coadd level 8 check for(int i=0;i<16;i++){if ( (d[i].x>t.x)||(d[i].y>t.x)||(d[i].z>t.x)||(d[i].w>t.x) ){was_pulse.x=1;break;}} //R:coadd level 9 (last) starts for(int m=0;m<8;m++){ m1=m+coadd_offsets.z; d[m].xy=d[2*m].xz+d[2*m].yw;d[m].zw=d[2*m+1].xz+d[2*m+1].yw; vstore4(d[m],m1,gpu_power); }//R: coadd level 9 check for(int i=0;i<8;i++){if ( (d[i].x>t.y)||(d[i].y>t.y)||(d[i].z>t.y)||(d[i].w>t.y) ){was_pulse.y=1;break;}} result+=was_pulse.x<<8+was_pulse.y<<9; //R: last we do is to write result of pulse finding results[32*dchunk+tid]=result; } Place where driver restart occurs: err=clEnqueueReadBuffer(cq, gpu_power,CL_TRUE,0,sizeof(float)*(32768>>1),cpu_pinned_buf_ptr,0, NULL, NULL); fprintf(stderr,"first power array:\n"); float* power=(float*)cpu_pinned_buf_ptr; for(int i=0;i<32768>>1;i++) fprintf(stderr,"power[%d]=%0.7g\n",i,power[i]); fprintf(stderr,"first coaddded array:\n"); err=clEnqueueReadBuffer(cq, gpu_power,CL_TRUE,sizeof(float)*32*DATA_CHUNK_UNROLL*(32768>>1),sizeof(float)*(32768>>2),cpu_pinned_buf_ptr,0, NULL, NULL); //HERE DRIVER RESTARTED power=(float*)cpu_pinned_buf_ptr; for(int i=0;i<32768>>2;i++) fprintf(stderr,"power_coadded[%d]=%0.7g\n",i,power[i]); exit(0);

            • Internal error: Link failed
              Raistmer
              Whole app? It takes ~800k in sources, just main part.
              I will try to construct some kind of test case, will see if it possible.
              • Internal error: Link failed
                Raistmer
                I sent test case for this problem via mail. Please, look at it.
                Same behavior as in main program.
                • Internal error: Link failed
                  Raistmer
                  Is it possible to get early access to this driver?

                  And what about initial problem - Internal error: Link failed - for my another kernel. It was repaired too or you need another test case for it?
                  • Internal error: Link failed
                    Raistmer
                    Ok, look for ticket 933 and corresponding E-mail with test case binaries.
                    Looks like size of CL file is limited...
                    • Internal error: Link failed
                      Raistmer
                      No, I don't getting this error with SDK 2.1, thanks.