Coalescing : memory access pattern that occurs when that the memory requests generated by each thread are in the same GPU cache line. So when each thread reads a vec4 sequentially from global memory that access pattern is coalesced.
You can compare the performance between these 2 kernels –
__kernel void read(__global float *input,__global float *output)
{
float4 val = (float4)(0.0f, 0.0f, 0.0f, 0.0f);
uint index = get_global_id(0);;
val = val + input[index + 0];
val = val + input[index + 1];
val = val + input[index + 2];
val = val + input[index + 3];
val = val + input[index + 4];
val = val + input[index + 5];
val = val + input[index + 6];
val = val + input[index + 7];
val = val + input[index + 8];
val = val + input[index + 9];
val = val + input[index + 10];
val = val + input[index + 11];
val = val + input[index + 12];
val = val + input[index + 13];
val = val + input[index + 14];
val = val + input[index + 15];
output[get_global_id(0);] = val;
}
And
__kernel void read(__global float4 *input,__global float4 *output)
{
float4 val = (float4)(0.0f, 0.0f, 0.0f, 0.0f);
uint index = get_global_id(0);;
val = val + input[index + 0];
val = val + input[index + 1];
val = val + input[index + 2];
val = val + input[index + 3];
val = val + input[index + 4];
val = val + input[index + 5];
val = val + input[index + 6];
val = val + input[index + 7];
val = val + input[index + 8];
val = val + input[index + 9];
val = val + input[index + 10];
val = val + input[index + 11];
val = val + input[index + 12];
val = val + input[index + 13];
val = val + input[index + 14];
val = val + input[index + 15];
output[get_global_id(0);] = val;
}
Note the difference between float and float4 in arguments.
The 2nd kernel is using coalesce accesses and you will get a good bandwidth using this pattern.