The uint2 kernel has no loads/stores inside loop.
The uint4 does, as a result the uint4 is much slower, any idea?
They are the same kernel, just with uint2 changed to uint4, also, the work-item is different, 128 for uint2 and 64 for uint4, though that shouldn't matter since I'm talking about the produced ISA here.
#define WORK_ITEMS 64 __kernel void nqueen1_vec(int board_size, int level, int threads, __global uint4* params, __global uint4* results) { int idx = get_global_id(0); int tid = get_local_id(0); uint4 ms; __local uint nsx[12][WORK_ITEMS]; __local uint nsy[12][WORK_ITEMS]; __local uint nsz[12][WORK_ITEMS]; __local uint nsw[12][WORK_ITEMS]; uint4 mask = params[idx]; uint4 left_mask = params[idx + threads]; uint4 right_mask = params[idx + threads * 2]; int4 second_row = convert_int4(params[idx + threads * 3]); uint4 board_mask = (uint4) ((1 << board_size) - 1); uint4 left_mask_big = (uint4) 0; uint4 right_mask_big = (uint4) 0; uint4 solutions = (uint4) 0; int4 i = (int4) 0; uint4 nsi, nsi_mask; ms = mask | left_mask | right_mask | (convert_uint4(i < second_row) & (uint4)2); nsi = ((ms + (uint4) 1) & ~ms); nsx[0][tid] = nsi.x; nsy[0][tid] = nsi.y; nsz[0][tid] = nsi.z; nsw[0][tid] = nsi.w; while(any(i >= (int4) 0)) { nsi.x = nsx[max(i.x, 0)][tid]; nsi.y = nsy[max(i.y, 0)][tid]; nsi.z = nsz[max(i.z, 0)][tid]; nsi.w = nsw[max(i.w, 0)][tid]; nsi_mask = convert_uint4((nsi & board_mask) != (uint4) 0) & convert_uint4(i >= (int4) 0); { // for nsi_mask == true... mask |= (nsi & nsi_mask); left_mask_big = select(left_mask_big, (left_mask_big << (uint4) 1) | (left_mask >> (uint4) 31), nsi_mask); left_mask = select(left_mask, (left_mask | nsi) << (uint4) 1, nsi_mask); right_mask_big = select(right_mask_big, (right_mask_big >> (uint4) 1) | (right_mask << (uint4) 31), nsi_mask); right_mask = select(right_mask, ((right_mask | nsi) >> (uint4) 1), nsi_mask); ms = mask | left_mask | right_mask | (convert_uint4((i + 1) < second_row) & (uint4)2); nsi = select(nsi, ((ms + (uint4) 1) & ~ms), nsi_mask); i = select(i, i + 1, convert_int4(nsi_mask)); nsx[max(i.x, 0)][tid] = nsi.x; nsy[max(i.y, 0)][tid] = nsi.y; nsz[max(i.z, 0)][tid] = nsi.z; nsw[max(i.w, 0)][tid] = nsi.w; } { // for nsi_mask == false solutions -= (convert_uint4(i == (int4) level) & ~nsi_mask); i = select(i - 1, i, convert_int4(nsi_mask)); nsi.x = nsx[max(i.x, 0)][tid]; nsi.y = nsy[max(i.y, 0)][tid]; nsi.z = nsz[max(i.z, 0)][tid]; nsi.w = nsw[max(i.w, 0)][tid]; nsi_mask = ~nsi_mask & convert_uint4(i >= (int4) 0); // for i >= 0 mask = select(mask, mask & ~nsi, nsi_mask); left_mask = select(left_mask, (((left_mask >> (uint4) 1) | (left_mask_big << (uint4) 31)) & ~nsi), nsi_mask); left_mask_big = select(left_mask_big, (left_mask_big >> (uint4) 1), nsi_mask); right_mask = select(right_mask, (((right_mask << (uint4) 1) | (right_mask_big >> (uint4) 31)) & ~nsi), nsi_mask); right_mask_big = select(right_mask_big, (right_mask_big << (uint4) 1), nsi_mask); ms = mask | left_mask | right_mask | nsi | (convert_uint4(i < second_row) & (uint4)2); nsi = select(nsi, ((ms + nsi) & ~ms), nsi_mask); nsx[max(i.x, 0)][tid] = nsi.x; nsy[max(i.y, 0)][tid] = nsi.y; nsz[max(i.z, 0)][tid] = nsi.z; nsw[max(i.w, 0)][tid] = nsi.w; } } results[idx] = solutions * (uint4) 8; results[idx + threads] = solutions; } __kernel void nqueen1_vec(int board_size, int level, int threads, __global uint2* params, __global uint2* results) { int idx = get_global_id(0); int tid = get_local_id(0); uint2 ms; __local uint nsx[12][WORK_ITEMS]; __local uint nsy[12][WORK_ITEMS]; // __local uint nsz[12][WORK_ITEMS]; // __local uint nsw[12][WORK_ITEMS]; uint2 mask = params[idx]; uint2 left_mask = params[idx + threads]; uint2 right_mask = params[idx + threads * 2]; int2 second_row = convert_int2(params[idx + threads * 3]); uint2 board_mask = (uint2) ((1 << board_size) - 1); uint2 left_mask_big = (uint2) 0; uint2 right_mask_big = (uint2) 0; uint2 solutions = (uint2) 0; int2 i = (int2) 0; uint2 nsi, nsi_mask; ms = mask | left_mask | right_mask | (convert_uint2(i < second_row) & (uint2)2); nsi = ((ms + (uint2) 1) & ~ms); nsx[0][tid] = nsi.x; nsy[0][tid] = nsi.y; // nsz[0][tid] = nsi.z; // nsw[0][tid] = nsi.w; while(any(i >= (int2) 0)) { nsi.x = nsx[max(i.x, 0)][tid]; nsi.y = nsy[max(i.y, 0)][tid]; // nsi.z = nsz[max(i.z, 0)][tid]; // nsi.w = nsw[max(i.w, 0)][tid]; nsi_mask = convert_uint2((nsi & board_mask) != (uint2) 0) & convert_uint2(i >= (int2) 0); { // for nsi_mask == true... mask |= (nsi & nsi_mask); left_mask_big = select(left_mask_big, (left_mask_big << (uint2) 1) | (left_mask >> (uint2) 31), nsi_mask); left_mask = select(left_mask, (left_mask | nsi) << (uint2) 1, nsi_mask); right_mask_big = select(right_mask_big, (right_mask_big >> (uint2) 1) | (right_mask << (uint2) 31), nsi_mask); right_mask = select(right_mask, ((right_mask | nsi) >> (uint2) 1), nsi_mask); ms = mask | left_mask | right_mask | (convert_uint2((i + 1) < second_row) & (uint2)2); nsi = select(nsi, ((ms + (uint2) 1) & ~ms), nsi_mask); i = select(i, i + 1, convert_int2(nsi_mask)); nsx[max(i.x, 0)][tid] = nsi.x; nsy[max(i.y, 0)][tid] = nsi.y; // nsz[max(i.z, 0)][tid] = nsi.z; // nsw[max(i.w, 0)][tid] = nsi.w; } { // for nsi_mask == false solutions -= (convert_uint2(i == (int2) level) & ~nsi_mask); i = select(i - 1, i, convert_int2(nsi_mask)); nsi.x = nsx[max(i.x, 0)][tid]; nsi.y = nsy[max(i.y, 0)][tid]; // nsi.z = nsz[max(i.z, 0)][tid]; // nsi.w = nsw[max(i.w, 0)][tid]; nsi_mask = ~nsi_mask & convert_uint2(i >= (int2) 0); // for i >= 0 mask = select(mask, mask & ~nsi, nsi_mask); left_mask = select(left_mask, (((left_mask >> (uint2) 1) | (left_mask_big << (uint2) 31)) & ~nsi), nsi_mask); left_mask_big = select(left_mask_big, (left_mask_big >> (uint2) 1), nsi_mask); right_mask = select(right_mask, (((right_mask << (uint2) 1) | (right_mask_big >> (uint2) 31)) & ~nsi), nsi_mask); right_mask_big = select(right_mask_big, (right_mask_big << (uint2) 1), nsi_mask); ms = mask | left_mask | right_mask | nsi | (convert_uint2(i < second_row) & (uint2)2); nsi = select(nsi, ((ms + nsi) & ~ms), nsi_mask); nsx[max(i.x, 0)][tid] = nsi.x; nsy[max(i.y, 0)][tid] = nsi.y; // nsz[max(i.z, 0)][tid] = nsi.z; // nsw[max(i.w, 0)][tid] = nsi.w; } } results[idx] = solutions * (uint2) 8; results[idx + threads] = solutions; }
Micah,
This is confusing to me since I was under the impression that ~30 GPR was not really that big a deal. Are you saying that essentially you are spilling the registers?
Also, where/when can I get the new release?
One more question, is it not possible to use vectors in local memory?
Micah,
Thanks, so I'm going to take that as a YES, the stack issue is caused by register spilling.
As far as the LM question, for instance, I get an error when trying to do this:
__local uint4 nsx[12][WORK_ITEMS];
nsi.x = nsx.x[max(i.x, 0)][tid];
It gives me a "structure needed" error!??
So why does this work when it's just uint?
See the kernels I posted earlier that use local memory.
Originally posted by: MicahVillmow Ahh, two dimensional arrays are not supported for local memory.
I converted to 1D and still get the same "structure" error for __local uint4.....