The following kernels are run with the identical setup (Workgroupsize 64) onn CPU or GPU.
The second kernel stalls on the Juniper GPU and kills the graphics driver (black screen).
Both kernels are functional identical and build up histogramm information.
Everything works as expected on the Phenom II X4 CPU.
What am I doing wrong?
#define RADIX_BITS 4 #define HISTO_SIZE (1<<RADIX_BITS) #define BITMASK (HISTO_SIZE-1) #define WORK_GROUP_SZ 64 #define SORTITEM_T uint // this kernel runs fine and has the correct results on CPU and GPU __kernel __attribute__((reqd_work_group_size(WORK_GROUP_SZ,1,1))) void sort_analysis( __global SORTITEM_T* input, __global uint* histo, uint const SORTLEN, uint const SHIFT_BITS, uint const LOCAL_WORK_LEN ) { size_t const local_sz = get_local_size(0), global_sz = get_global_size(0), grid = get_group_id(0) , num_groups = get_num_groups(0) , gid = get_global_id(0) , lid = get_local_id(0) , div_sz = SORTLEN / global_sz , mod_sz = SORTLEN % global_sz , part_beg = (gid*div_sz)+ min(mod_sz,gid) , part_end = ((gid+1)*div_sz)+ min(mod_sz,gid+1) ; __private uint phisto[ HISTO_SIZE ]; // init local histogram for( size_t i=0; i<HISTO_SIZE; ++i ){ phisto[ i ] = 0; } // create workitemn window histo size_t ri; for( ri=part_beg; ri<part_end; ++ri ){ size_t const key = (as_uint( input[ ri ] )>>SHIFT_BITS) & BITMASK; phisto[ key ]++; } for( size_t i=0; i<HISTO_SIZE; ++i ){ histo[ gid*HISTO_SIZE + i ] = phisto; } } // version of the Kernel runs fine on CPU but freezes on GPU __kernel __attribute__((reqd_work_group_size(WORK_GROUP_SZ,1,1))) void sort_analysis( __global SORTITEM_T* input, __global uint* histo, uint const SORTLEN, uint const SHIFT_BITS, uint const LOCAL_WORK_LEN ) { size_t const local_sz = get_local_size(0), global_sz = get_global_size(0), grid = get_group_id(0) , num_groups = get_num_groups(0) , gid = get_global_id(0) , lid = get_local_id(0) , div_sz = SORTLEN / global_sz , mod_sz = SORTLEN % global_sz , part_beg = (gid*div_sz)+ min(mod_sz,gid) , part_end = ((gid+1)*div_sz)+ min(mod_sz,gid+1) ; __private uint phisto[ HISTO_SIZE ]; // init local histogram for( size_t i=0; i<HISTO_SIZE; ++i ){ phisto[ i ] = 0; } // create workitemn window histo size_t ri; for( ri=part_beg; ri<part_end-3; ri+=4 ) { uint4 v4 = vload4( 0, input + ri ); uint4 key4 = (v4>>SHIFT_BITS) & BITMASK; phisto[ key4.x ]++; phisto[ key4.y ]++; phisto[ key4.z ]++; phisto[ key4.w ]++; } for( ; ri<part_end; ++ri ){ size_t const key = (as_uint( input[ ri ] )>>SHIFT_BITS) & BITMASK; phisto[ key ]++; } for( size_t i=0; i<HISTO_SIZE; ++i ){ histo[ gid*HISTO_SIZE + i ] = phisto; } }