Please post your system config. Did you uninstall the previous drivers properly?
Failed kernel: //>>>>>>>> W/O FFA buffers filling for now __kernel void find_single_pulse_kernel_128(__global float* gpu_power, __constant float* thresh, __constant float* best_powers, __global float* best_powers_new, __global uint* results, const int num_dchunks, const int need_small_FFA_data,const int need_large_FFA_data, __global float* small_neg,__global float* small_pos, __global float* large_neg,__global float* large_pos) { //R: each work-item will handle separate data_chunk/dm/sign power array through all needed coadds //R: to avoid host to device data transfers (very costly for current implementation) //R: thresholds and best power values will be maintained separately for CPU and GPU //R: thresholds are constant, but best pulses will need update. //R: need_*_FFA_data are non-zero (if needed) offsets into gpu_power buffer //R: we store small FFA buffers (negative first, then positive) right after coadds buffer //R: (coadds buffer take same size as initial power array thought it actually slightly smaller, just to simplify //R: buffer size calculations for now) then larger FFA buffers go, first negative one, next positive, to match with //R: signs order for GPU dechirping. uint tid=get_global_id(0); uint dchunk=get_global_id(1); uint len4 = 32768 >> 3; uint4 coadd_offsets=(uint4)(0); uint4 coadd_begins=(uint4)(0); uint4 coadd_offsets2=(uint4)(0); uint4 coadd_begins2=(uint4)(0); float4 t=vload4(0,thresh);//thresholds for initial array and 3 first coadds float4 t2=vload4(1,thresh);//thresholds for next 4 coadds uint result=0;//R: will contain info about location of found pulses and best signals uint4 was_pulse=(uint4)(0); uint4 was_pulse2=(uint4)(0); uint m1=0; //R: low 16 bits for pulses above threshold, each coadd level coded as enabled bit in corresponding position //R: high 16 bits for best pulses, if were, again, enabled bit means found pulse //R: will do unroll for coadds to save reads from global memory float4 d[128]; coadd_begins.x=0;//R:initial array - no need to write, in gpu_power domain coadd_begins.y=6*32*(32768>>3); coadd_begins.z=coadd_begins.y+6*(32768>>4)*32; coadd_begins.w=coadd_begins.z+6*(32768>>5)*32; coadd_begins2.x=coadd_begins.w+6*(32768>>6)*32; coadd_begins2.y=coadd_begins2.x+6*(32768>>7)*32; coadd_begins2.z=coadd_begins2.y+6*(32768>>8)*32; coadd_begins2.w=coadd_begins2.z+6*(32768>>9)*32; coadd_offsets.x=coadd_begins.x+(dchunk*32+tid)*(32768>>3);//l==0 coadd_offsets.y=coadd_begins.y+(dchunk*32+tid)*(32768>>4); coadd_offsets.z=coadd_begins.z+(dchunk*32+tid)*(32768>>5); coadd_offsets.w=coadd_begins.w+(dchunk*32+tid)*(32768>>6); coadd_offsets2.x=coadd_begins2.x+(dchunk*32+tid)*(32768>>7); coadd_offsets2.y=coadd_begins2.y+(dchunk*32+tid)*(32768>>8); coadd_offsets2.z=coadd_begins2.z+(dchunk*32+tid)*(32768>>9); coadd_offsets2.w=coadd_begins2.w+(dchunk*32+tid)*(32768>>10); for(int m=0;m<len4;m+=128){//R:loading data piece of 128*4 samples m1=coadd_offsets.x+m; for(int i=0;i<128;i++){ d=vload4(m1+i,gpu_power); } if(was_pulse.x==0){//R: if pulse already detected no need to check other elements, it will be done on CPU anyway for(int i=0;i<128;i++){if ( (d.x>t.x)||(d.y>t.x)||(d.z>t.x)||(d.w>t.x) ){was_pulse.x=1;break;}} } //R: now do coadd and save coadded piece in memory m1=m>>1+coadd_offsets.y; for(int i=0;i<64;i++){ d.xy=d[2*i].xz+d[2*i].yw;d.zw=d[2*i+1].xz+d[2*i+1].yw; vstore4(d,m1,gpu_power); } //R: next coadd level ready to check for pulses if(was_pulse.y==0){//R: if pulse already detected no need to check other elements, it will be done on CPU anyway for(int i=0;i<64;i++){if ( (d.x>t.y)||(d.y>t.y)||(d.z>t.y)||(d.w>t.y) ){was_pulse.y=1;break;}} } //R: coadd level 2 starts m1=m>>2+coadd_offsets.z; for(int i=0;i<32;i++){ d.xy=d[2*i].xz+d[2*i].yw;d.zw=d[2*i+1].xz+d[2*i+1].yw; vstore4(d,m1,gpu_power); } if(was_pulse.z==0){//R: if pulse already detected no need to check other elements, it will be done on CPU anyway for(int i=0;i<32;i++){if ( (d.x>t.z)||(d.y>t.z)||(d.z>t.z)||(d.w>t.z) ){was_pulse.z=1;break;}} } //R: coadd level 3 starts m1=m>>3+coadd_offsets.w; for(int i=0;i<16;i++){ d.xy=d[2*i].xz+d[2*i].yw;d.zw=d[2*i+1].xz+d[2*i+1].yw; vstore4(d,m1,gpu_power); } if(was_pulse.w==0){//R: if pulse already detected no need to check other elements, it will be done on CPU anyway for(int i=0;i<16;i++){if ( (d.x>t.w)||(d.y>t.w)||(d.z>t.w)||(d.w>t.w) ){was_pulse.w=1;break;}} } //R: coadd level 4 starts m1=m>>4+coadd_offsets2.x; for(int i=0;i<8;i++){ d.xy=d[2*i].xz+d[2*i].yw;d.zw=d[2*i+1].xz+d[2*i+1].yw; vstore4(d,m1,gpu_power); } if(was_pulse2.x==0){//R: if pulse already detected no need to check other elements, it will be done on CPU anyway for(int i=0;i<8;i++){if ( (d.x>t2.x)||(d.y>t2.x)||(d.z>t2.x)||(d.w>t2.x) ){was_pulse2.x=1;break;}} } //R: coadd level 5 starts m1=m>>5+coadd_offsets2.y; for(int i=0;i<4;i++){ d.xy=d[2*i].xz+d[2*i].yw;d.zw=d[2*i+1].xz+d[2*i+1].yw; vstore4(d,m1,gpu_power); } if(was_pulse2.y==0){//R: if pulse already detected no need to check other elements, it will be done on CPU anyway for(int i=0;i<4;i++){if ( (d.x>t2.y)||(d.y>t2.y)||(d.z>t2.y)||(d.w>t2.y) ){was_pulse2.y=1;break;}} } //R: coadd level 6 starts m1=m>>6+coadd_offsets2.z; for(int i=0;i<2;i++){ d.xy=d[2*i].xz+d[2*i].yw;d.zw=d[2*i+1].xz+d[2*i+1].yw; vstore4(d,m1,gpu_power); } if(was_pulse2.z==0){//R: if pulse already detected no need to check other elements, it will be done on CPU anyway for(int i=0;i<2;i++){if ( (d.x>t2.z)||(d.y>t2.z)||(d.z>t2.z)||(d.w>t2.z) ){was_pulse2.z=1;break;}} } //R: coadd level 7 starts m1=m>>7+coadd_offsets2.w; d[0].xy=d[0].xz+d[0].yw;d[0].zw=d[1].xz+d[1].yw; vstore4(d[0],m1,gpu_power); if(was_pulse2.w==0){//R: if pulse already detected no need to check other elements, it will be done on CPU anyway if ( (d[0].x>t2.w)||(d[0].y>t2.w)||(d[0].z>t2.w)||(d[0].w>t2.w) ){was_pulse2.w=1;} } }//for m result+=was_pulse.x+was_pulse.y<<1+was_pulse.z<<2+was_pulse.w<<3+ was_pulse2.x<<4+was_pulse2.y<<5+was_pulse2.z<<6+was_pulse2.w<<7; //R: now lets repeat for next few coadd levels and refill registers array from GPU memory t=vload4(2,thresh); was_pulse=(uint4)(0); coadd_begins.x=coadd_begins2.w;//R: will fetch data stored at the end of prev coadd bunch loop coadd_begins.y=coadd_begins.x+6*(32768>>10)*32; coadd_begins.z=coadd_begins.y+6*(32768>>11)*32; //coadd_begins.w=coadd_begins.z+6*(32768>>12)*32;; coadd_offsets.x=coadd_begins.x+(dchunk*32+tid)*(32768>>11); coadd_offsets.y=coadd_begins.y+(dchunk*32+tid)*(32768>>12); coadd_offsets.z=coadd_begins.z+(dchunk*32+tid)*(32768>>13); //coadd_offsets.w=coadd_begins.w+(dchunk*32+tid)*(32768>>10); for(int m=0;m<32;m++){//R: there is only 32x4 elements left in coadded array, no inner loop required m1=m+coadd_offsets.x; d
=vload4(m1,gpu_power); } //R: this coadd level analysed already, so just do coadd for(int m=0;m<16;m++){ m1=m+coadd_offsets.y; d .xy=d[2*m].xz+d[2*m].yw;d .zw=d[2*m+1].xz+d[2*m+1].yw; vstore4(d ,m1,gpu_power); }//R: coadd level 8 check for(int i=0;i<16;i++){if ( (d.x>t.x)||(d.y>t.x)||(d.z>t.x)||(d.w>t.x) ){was_pulse.x=1;break;}} //R:coadd level 9 (last) starts for(int m=0;m<8;m++){ m1=m+coadd_offsets.z; d .xy=d[2*m].xz+d[2*m].yw;d .zw=d[2*m+1].xz+d[2*m+1].yw; vstore4(d ,m1,gpu_power); }//R: coadd level 9 check for(int i=0;i<8;i++){if ( (d.x>t.y)||(d.y>t.y)||(d.z>t.y)||(d.w>t.y) ){was_pulse.y=1;break;}} result+=was_pulse.x<<8+was_pulse.y<<9; //R: last we do is to write result of pulse finding results[32*dchunk+tid]=result; } Place where driver restart occurs: err=clEnqueueReadBuffer(cq, gpu_power,CL_TRUE,0,sizeof(float)*(32768>>1),cpu_pinned_buf_ptr,0, NULL, NULL); fprintf(stderr,"first power array:\n"); float* power=(float*)cpu_pinned_buf_ptr; for(int i=0;i<32768>>1;i++) fprintf(stderr,"power[%d]=%0.7g\n",i,power); fprintf(stderr,"first coaddded array:\n"); err=clEnqueueReadBuffer(cq, gpu_power,CL_TRUE,sizeof(float)*32*DATA_CHUNK_UNROLL*(32768>>1),sizeof(float)*(32768>>2),cpu_pinned_buf_ptr,0, NULL, NULL); //HERE DRIVER RESTARTED power=(float*)cpu_pinned_buf_ptr; for(int i=0;i<32768>>2;i++) fprintf(stderr,"power_coadded[%d]=%0.7g\n",i,power); exit(0);
Raistmer,
Could you post the whole code? Its easy to reproduce the problem that way.
Thanks
I'm able to reproduce the issue with 10.3 drivers. The issue has been resolved and the testcase runs fine with going to be released drivers. You can expect to see the change in next release of drivers.
Raistmer,
That would make it easy for us to verify.
Raistmer,
The issue is resolved. You can expect to see the change in the upcoming release. Thanks for the feedback.
Getting this error with 10.4 as well.
Any news on this?
Are you getting the error with 2.1 SDK as well?