My runtime code is written in a Python wrapper around Brook (built with Boost.Python) ... I have attached the code here.
The output for the ps kernel is as expected:
... k = kernels.__wftest4() ...
print h_out
array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 1, 1, 2, 1, 3, 1, 0, 0],
[0, 0, 1, 2, 2, 2, 3, 2, 0, 0],
[0, 0, 1, 3, 2, 3, 3, 3, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=uint32)
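The output for the scatter ps kernel is already unexpected: it appears to loop from domainOffset to domainSize-domainOffset, instead of covering domainSize elements starting at domainOffset as the ps kernel above does.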
... k = kernels.__wftest3() ...
print h_out
array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 1, 1, 2, 1, 0, 0, 0, 0],
[0, 0, 1, 2, 2, 2, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=uint32)
For the cs, domainSize and domainOffset have no effect:
... k = kernels.__wftest2() ...
print h_out
array([[0, 0, 1, 0, 2, 0, 3, 0, 4, 0],
[0, 1, 1, 1, 2, 1, 3, 1, 4, 1],
[0, 2, 1, 2, 2, 2, 3, 2, 4, 2],
[0, 3, 1, 3, 2, 3, 3, 3, 4, 3],
[0, 4, 1, 4, 2, 4, 3, 4, 4, 4]], dtype=uint32)
The boost wrapping of domainOffset and domainSize is a direct automated wrapper of the C++ functions, so there's little room for problems there.
import os os.environ['BRT_ADAPTER']='3' import kernels import stream import numpy def fX(dims,x): b = numpy.array(dims)[::-1] b[-1]*=x return tuple(b) def f4(dims): return fX(dims,4) def f2(dims): return fX(dims,2) dims =(5,5) h_in = numpy.zeros(f2(dims),dtype=numpy.uint32) h_in.flat[:] = numpy.arange(len(h_in.flat)).astype(numpy.uint32) h_out = numpy.zeros(f2(dims),dtype=numpy.uint32) d_in = stream.Stream_uint2(dims) d_out= stream.Stream_uint2(dims) d_in.read(h_in) d_out.read(h_out) k = kernels.__wftest4() k.domainSize(kernels.uint4(3,3,1,1)) k.domainOffset(kernels.uint4(1,1,0,0)) k.run(d_in,d_out) if d_out.error()!=stream.BRerror.BR_NO_ERROR: print "Error:", s1.errorLog() d_out.write(h_out) print h_out ************************** end ******************** Kernel code compiled to python module "kernels" imported above Attribute[GroupSize (64,1,1)] kernel void wftest2(uint2 in_s[][], out uint2 out_s[][]) { uint2 global_id = (uint2)instance ().xy; uint local_id = (uint)instanceInGroup().x; //int4 tmp = {1,2,3,4}; //uint tmp = dot(in_s[global_id.y][local_id],in_s[global_id.y][local_id]); out_s[global_id.y][global_id.x]= uint2(global_id.x,global_id.y); } kernel void wftest3(uint2 in_s[][], out uint2 out_s[][]) { uint2 global_id = (uint2)instance ().xy; //uint local_id = (uint)instanceInGroup().x; //int4 tmp = {1,2,3,4}; //uint tmp = dot(in_s[global_id.y][local_id],in_s[global_id.y][local_id]); out_s[global_id.y][global_id.x]= uint2(global_id.x,global_id.y); } kernel void wftest4(uint2 in_s<>, out uint2 out_s<>) { uint2 global_id = (uint2)instance ().xy; //uint local_id = (uint)instanceInGroup().x; //int4 tmp = {1,2,3,4}; //uint tmp = dot(in_s[global_id.y][local_id],in_s[global_id.y][local_id]); out_s= uint2(global_id.x,global_id.y); }