Archives Discussions

ryta1203 · ‎04-08-2010

The uint2 kernel has no loads/stores inside loop.

The uint4 does, as a result the uint4 is much slower, any idea?

They are the same kernel, just with uint2 changed to uint4, also, the work-item is different, 128 for uint2 and 64 for uint4, though that shouldn't matter since I'm talking about the produced ISA here.

#define WORK_ITEMS 64 __kernel void nqueen1_vec(int board_size, int level, int threads, __global uint4* params, __global uint4* results) { int idx = get_global_id(0); int tid = get_local_id(0); uint4 ms; __local uint nsx[12][WORK_ITEMS]; __local uint nsy[12][WORK_ITEMS]; __local uint nsz[12][WORK_ITEMS]; __local uint nsw[12][WORK_ITEMS]; uint4 mask = params[idx]; uint4 left_mask = params[idx + threads]; uint4 right_mask = params[idx + threads * 2]; int4 second_row = convert_int4(params[idx + threads * 3]); uint4 board_mask = (uint4) ((1 << board_size) - 1); uint4 left_mask_big = (uint4) 0; uint4 right_mask_big = (uint4) 0; uint4 solutions = (uint4) 0; int4 i = (int4) 0; uint4 nsi, nsi_mask; ms = mask | left_mask | right_mask | (convert_uint4(i < second_row) & (uint4)2); nsi = ((ms + (uint4) 1) & ~ms); nsx[0][tid] = nsi.x; nsy[0][tid] = nsi.y; nsz[0][tid] = nsi.z; nsw[0][tid] = nsi.w; while(any(i >= (int4) 0)) { nsi.x = nsx[max(i.x, 0)][tid]; nsi.y = nsy[max(i.y, 0)][tid]; nsi.z = nsz[max(i.z, 0)][tid]; nsi.w = nsw[max(i.w, 0)][tid]; nsi_mask = convert_uint4((nsi & board_mask) != (uint4) 0) & convert_uint4(i >= (int4) 0); { // for nsi_mask == true... mask |= (nsi & nsi_mask); left_mask_big = select(left_mask_big, (left_mask_big << (uint4) 1) | (left_mask >> (uint4) 31), nsi_mask); left_mask = select(left_mask, (left_mask | nsi) << (uint4) 1, nsi_mask); right_mask_big = select(right_mask_big, (right_mask_big >> (uint4) 1) | (right_mask << (uint4) 31), nsi_mask); right_mask = select(right_mask, ((right_mask | nsi) >> (uint4) 1), nsi_mask); ms = mask | left_mask | right_mask | (convert_uint4((i + 1) < second_row) & (uint4)2); nsi = select(nsi, ((ms + (uint4) 1) & ~ms), nsi_mask); i = select(i, i + 1, convert_int4(nsi_mask)); nsx[max(i.x, 0)][tid] = nsi.x; nsy[max(i.y, 0)][tid] = nsi.y; nsz[max(i.z, 0)][tid] = nsi.z; nsw[max(i.w, 0)][tid] = nsi.w; } { // for nsi_mask == false solutions -= (convert_uint4(i == (int4) level) & ~nsi_mask); i = select(i - 1, i, convert_int4(nsi_mask)); nsi.x = nsx[max(i.x, 0)][tid]; nsi.y = nsy[max(i.y, 0)][tid]; nsi.z = nsz[max(i.z, 0)][tid]; nsi.w = nsw[max(i.w, 0)][tid]; nsi_mask = ~nsi_mask & convert_uint4(i >= (int4) 0); // for i >= 0 mask = select(mask, mask & ~nsi, nsi_mask); left_mask = select(left_mask, (((left_mask >> (uint4) 1) | (left_mask_big << (uint4) 31)) & ~nsi), nsi_mask); left_mask_big = select(left_mask_big, (left_mask_big >> (uint4) 1), nsi_mask); right_mask = select(right_mask, (((right_mask << (uint4) 1) | (right_mask_big >> (uint4) 31)) & ~nsi), nsi_mask); right_mask_big = select(right_mask_big, (right_mask_big << (uint4) 1), nsi_mask); ms = mask | left_mask | right_mask | nsi | (convert_uint4(i < second_row) & (uint4)2); nsi = select(nsi, ((ms + nsi) & ~ms), nsi_mask); nsx[max(i.x, 0)][tid] = nsi.x; nsy[max(i.y, 0)][tid] = nsi.y; nsz[max(i.z, 0)][tid] = nsi.z; nsw[max(i.w, 0)][tid] = nsi.w; } } results[idx] = solutions * (uint4) 8; results[idx + threads] = solutions; } __kernel void nqueen1_vec(int board_size, int level, int threads, __global uint2* params, __global uint2* results) { int idx = get_global_id(0); int tid = get_local_id(0); uint2 ms; __local uint nsx[12][WORK_ITEMS]; __local uint nsy[12][WORK_ITEMS]; // __local uint nsz[12][WORK_ITEMS]; // __local uint nsw[12][WORK_ITEMS]; uint2 mask = params[idx]; uint2 left_mask = params[idx + threads]; uint2 right_mask = params[idx + threads * 2]; int2 second_row = convert_int2(params[idx + threads * 3]); uint2 board_mask = (uint2) ((1 << board_size) - 1); uint2 left_mask_big = (uint2) 0; uint2 right_mask_big = (uint2) 0; uint2 solutions = (uint2) 0; int2 i = (int2) 0; uint2 nsi, nsi_mask; ms = mask | left_mask | right_mask | (convert_uint2(i < second_row) & (uint2)2); nsi = ((ms + (uint2) 1) & ~ms); nsx[0][tid] = nsi.x; nsy[0][tid] = nsi.y; // nsz[0][tid] = nsi.z; // nsw[0][tid] = nsi.w; while(any(i >= (int2) 0)) { nsi.x = nsx[max(i.x, 0)][tid]; nsi.y = nsy[max(i.y, 0)][tid]; // nsi.z = nsz[max(i.z, 0)][tid]; // nsi.w = nsw[max(i.w, 0)][tid]; nsi_mask = convert_uint2((nsi & board_mask) != (uint2) 0) & convert_uint2(i >= (int2) 0); { // for nsi_mask == true... mask |= (nsi & nsi_mask); left_mask_big = select(left_mask_big, (left_mask_big << (uint2) 1) | (left_mask >> (uint2) 31), nsi_mask); left_mask = select(left_mask, (left_mask | nsi) << (uint2) 1, nsi_mask); right_mask_big = select(right_mask_big, (right_mask_big >> (uint2) 1) | (right_mask << (uint2) 31), nsi_mask); right_mask = select(right_mask, ((right_mask | nsi) >> (uint2) 1), nsi_mask); ms = mask | left_mask | right_mask | (convert_uint2((i + 1) < second_row) & (uint2)2); nsi = select(nsi, ((ms + (uint2) 1) & ~ms), nsi_mask); i = select(i, i + 1, convert_int2(nsi_mask)); nsx[max(i.x, 0)][tid] = nsi.x; nsy[max(i.y, 0)][tid] = nsi.y; // nsz[max(i.z, 0)][tid] = nsi.z; // nsw[max(i.w, 0)][tid] = nsi.w; } { // for nsi_mask == false solutions -= (convert_uint2(i == (int2) level) & ~nsi_mask); i = select(i - 1, i, convert_int2(nsi_mask)); nsi.x = nsx[max(i.x, 0)][tid]; nsi.y = nsy[max(i.y, 0)][tid]; // nsi.z = nsz[max(i.z, 0)][tid]; // nsi.w = nsw[max(i.w, 0)][tid]; nsi_mask = ~nsi_mask & convert_uint2(i >= (int2) 0); // for i >= 0 mask = select(mask, mask & ~nsi, nsi_mask); left_mask = select(left_mask, (((left_mask >> (uint2) 1) | (left_mask_big << (uint2) 31)) & ~nsi), nsi_mask); left_mask_big = select(left_mask_big, (left_mask_big >> (uint2) 1), nsi_mask); right_mask = select(right_mask, (((right_mask << (uint2) 1) | (right_mask_big >> (uint2) 31)) & ~nsi), nsi_mask); right_mask_big = select(right_mask_big, (right_mask_big << (uint2) 1), nsi_mask); ms = mask | left_mask | right_mask | nsi | (convert_uint2(i < second_row) & (uint2)2); nsi = select(nsi, ((ms + nsi) & ~ms), nsi_mask); nsx[max(i.x, 0)][tid] = nsi.x; nsy[max(i.y, 0)][tid] = nsi.y; // nsz[max(i.z, 0)][tid] = nsi.z; // nsw[max(i.w, 0)][tid] = nsi.w; } } results[idx] = solutions * (uint2) 8; results[idx + threads] = solutions; }

MicahVillmow · ‎04-08-2010

Ryta,
This might be an issue with SDK 2.01 only, using an internal compiler the only difference between the two kernels is the uint4 has more computation(119 vs 68), a few more registers(24 vs 17) and more control flow(19 vs 13) because of the any() call on int4. Most likely our optimization infrastructure was storing some information on the stack with the int4 kernel and not on the int2 kernel to keep register pressure down. In the newer release we optimize this better so that the stack accesses are removed in many more cases.

ryta1203 · ‎04-08-2010

Micah,

This is confusing to me since I was under the impression that ~30 GPR was not really that big a deal. Are you saying that essentially you are spilling the registers?

Also, where/when can I get the new release?

ryta1203 · ‎04-08-2010

One more question, is it not possible to use vectors in local memory?

MicahVillmow · ‎04-08-2010

Ryta,
The release is coming, but I can't give out exact dates. As for register spilling, there are multiple stages where we decide when to spill to memory in our compiler chain. The problem is that one stage of our compiler chain was deciding that spilling was necessary, but the code that was being generated was in such a fashion that the SC compiler could not remove the spills in 2.01. In our upcoming release this is fixed and spilling will only occur if register pressure at the ISA level is to high.

LDS on 5XXX series of cards can write up to max 64bits per clock per thread. This is different than 4XXX series which wrote up to 128 bits but required use of the texture unit.

ryta1203 · ‎04-08-2010

Micah,

Thanks, so I'm going to take that as a YES, the stack issue is caused by register spilling.

As far as the LM question, for instance, I get an error when trying to do this:

__local uint4 nsx[12][WORK_ITEMS];

nsi.x = nsx.x[max(i.x, 0)][tid];

It gives me a "structure needed" error!??

MicahVillmow · ‎04-08-2010

Ahh, two dimensional arrays are not supported for local memory.

ryta1203 · ‎04-08-2010

So why does this work when it's just uint?

See the kernels I posted earlier that use local memory.

ryta1203 · ‎04-08-2010

Originally posted by: MicahVillmow Ahh, two dimensional arrays are not supported for local memory.

I converted to 1D and still get the same "structure" error for __local uint4.....

MicahVillmow · ‎04-08-2010

Ryta,
That is a bug in 2.01 that is fixed in the upcoming release.

Archives Discussions

Confusing Stores/Loads with local memory