Archives Discussions

nano23 · ‎01-13-2011

Hi,

i've compared 2 versions of a simple 2D convolution. First the "unoptimized" without const and __global isntead of __constant and the second one with const.

The "optimized" has nearly 2x ALUInsts, but with small filter sizes 3-31 its faster and with a filtersize of 101 its slower. I don't understand this behaviour

The output data size is 1024x1024.

The device is a HD 5850.

Optimized (Size 29)

Method	ExecutionOrder	GlobalWorkSize	GroupWorkSize	Time	LDSSize	DataTransferSize	GPRs	ScratchRegs	FCStacks	Wavefronts	ALUInsts	FetchInsts	WriteInsts	LDSFetchInsts	LDSWriteInsts	ALUBusy	ALUFetchRatio	ALUPacking	FetchSize	CacheHit	FetchUnitBusy	FetchUnitStalled	WriteUnitStalled	FastPath	CompletePath	PathUtilization	ALUStalledByLDS	LDSBankConflict
WriteBuffer	105			2,8396		4323,06
WriteBuffer	106			0,19504		3,29
convolve2D_0558A308	107	{ 1024 1024 1}	{ 256 1 1}	101,06767	0		14	0	3	16384	6918	1682	1	0	0	56,08	4,11	37,21	2123265,56	0,31	30,36	0	0	4095	0	100	0	0
convolve2D_0558A308	108	{ 1024 1024 1}	{ 256 1 1}	101,04606	0		14	0	3	16384	6918	1682	1	0	0	56,06	4,11	37,21	2123265,56	0,3	30,35	0	0	4095	0	100	0	0
convolve2D_0558A308	109	{ 1024 1024 1}	{ 256 1 1}	101,09112	0		14	0	3	16384	6918	1682	1	0	0	56,06	4,11	37,21	2123265,56	0,31	30,35	0	0	4095	0	100	0	0
convolve2D_0558A308	110	{ 1024 1024 1}	{ 256 1 1}	101,09242	0		14	0	3	16384	6918	1682	1	0	0	56,07	4,11	37,21	2123265,56	0,31	30,35	0	0	4095	0	100	0	0
convolve2D_0558A308	111	{ 1024 1024 1}	{ 256 1 1}	101,06491	0		14	0	3	16384	6918	1682	1	0	0	56,06	4,11	37,21	2123265,56	0,3	30,35	0	0	4095	0	100	0	0
ReadBuffer	112			2,50243		4096

Unoptimized (Size 29)

Method	ExecutionOrder	GlobalWorkSize	GroupWorkSize	Time	LDSSize	DataTransferSize	GPRs	ScratchRegs	FCStacks	Wavefronts	ALUInsts	FetchInsts	WriteInsts	LDSFetchInsts	LDSWriteInsts	ALUBusy	ALUFetchRatio	ALUPacking	FetchSize	CacheHit	FetchUnitBusy	FetchUnitStalled	WriteUnitStalled	FastPath	CompletePath	PathUtilization	ALUStalledByLDS	LDSBankConflict
WriteBuffer	105			2,82946		4323,06
WriteBuffer	106			0,17178		3,29
convolve2D_0549A690	107	{ 1024 1024 1}	{ 256 1 1}	120,4313	0		13	0	3	16384	3555	1682	1	0	0	23,05	2,11	34,57	2525696	0	32,23	7,96	0	4095	0	100	0	0
convolve2D_0549A690	108	{ 1024 1024 1}	{ 256 1 1}	127,28908	0		13	0	3	16384	3555	1682	1	0	0	23,35	2,11	34,57	2525696	0	32,22	7,62	0	4095	0	100	0	0
convolve2D_0549A690	109	{ 1024 1024 1}	{ 256 1 1}	127,61811	0		13	0	3	16384	3555	1682	1	0	0	23,45	2,11	34,57	2525696	0	32,14	7,44	0	4095	0	100	0	0
convolve2D_0549A690	110	{ 1024 1024 1}	{ 256 1 1}	125,39515	0		13	0	3	16384	3555	1682	1	0	0	22,68	2,11	34,57	2525696	0	32,05	8,15	0	4095	0	100	0	0
convolve2D_0549A690	111	{ 1024 1024 1}	{ 256 1 1}	125,41467	0		13	0	3	16384	3555	1682	1	0	0	23,29	2,11	34,57	2525696	0	32,17	7,63	0	4095	0	100	0	0
ReadBuffer	112			3,31465		4096

Optimized (Size 101)

Method	ExecutionOrder	GlobalWorkSize	GroupWorkSize	Time	LDSSize	DataTransferSize	GPRs	ScratchRegs	FCStacks	Wavefronts	ALUInsts	FetchInsts	WriteInsts	LDSFetchInsts	LDSWriteInsts	ALUBusy	ALUFetchRatio	ALUPacking	FetchSize	CacheHit	FetchUnitBusy	FetchUnitStalled	WriteUnitStalled	FastPath	CompletePath	PathUtilization	ALUStalledByLDS	LDSBankConflict
WriteBuffer	1			4,46239		4935,06
WriteBuffer	2			0,7072		39,85
convolve2D_00F1A808	3	{ 1024 1024 1}	{ 256 1 1}	1221,3376	0		14	0	3	16384	82230	20402	1	0	0	55,13	4,03	37,42	25778451,9	0,79	30,45	0	0	4095	0	100	0	0
convolve2D_00F1A808	4	{ 1024 1024 1}	{ 256 1 1}	1221,22536	0		14	0	3	16384	82230	20402	1	0	0	55,19	4,03	37,42	25778451,9	0,84	30,49	0	0	4095	0	100	0	0
convolve2D_00F1A808	5	{ 1024 1024 1}	{ 256 1 1}	1221,74639	0		14	0	3	16384	82230	20402	1	0	0	55,17	4,03	37,42	25778451,9	0,83	30,48	0	0	4095	0	100	0	0
convolve2D_00F1A808	6	{ 1024 1024 1}	{ 256 1 1}	1220,52834	0		14	0	3	16384	82230	20402	1	0	0	55,18	4,03	37,42	25778451,9	0,85	30,48	0	0	4095	0	100	0	0
convolve2D_00F1A808	7	{ 1024 1024 1}	{ 256 1 1}	1220,82929	0		14	0	3	16384	82230	20402	1	0	0	55,17	4,03	37,42	25778451,9	0,85	30,48	0	0	4095	0	100	0	0
ReadBuffer	8			1,75211		4096

Unoptimized (Size 101)

Method	ExecutionOrder	GlobalWorkSize	GroupWorkSize	Time	LDSSize	DataTransferSize	GPRs	ScratchRegs	FCStacks	Wavefronts	ALUInsts	FetchInsts	WriteInsts	LDSFetchInsts	LDSWriteInsts	ALUBusy	ALUFetchRatio	ALUPacking	FetchSize	CacheHit	FetchUnitBusy	FetchUnitStalled	WriteUnitStalled	FastPath	CompletePath	PathUtilization	ALUStalledByLDS	LDSBankConflict
WriteBuffer	1			4,28947		4935,06
WriteBuffer	2			2,48555		39,85
convolve2D_00F4A810	3	{ 1024 1024 1}	{ 256 1 1}	1020,59145	0		13	0	3	16384	41427	20402	1	0	0	32,17	2,03	34,88	30981376	0	37,66	2,39	0	4095	0	100	0	0
convolve2D_00F4A810	4	{ 1024 1024 1}	{ 256 1 1}	993,80199	0		13	0	3	16384	41427	20402	1	0	0	33,41	2,03	34,88	30981376	0	38,47	1,83	0	4095	0	100	0	0
convolve2D_00F4A810	5	{ 1024 1024 1}	{ 256 1 1}	976,17333	0		13	0	3	16384	41427	20402	1	0	0	34,27	2,03	34,88	30981376	0	39,08	1,51	0	4095	0	100	0	0
convolve2D_00F4A810	6	{ 1024 1024 1}	{ 256 1 1}	998,36419	0		13	0	3	16384	41427	20402	1	0	0	32,24	2,03	34,88	30981376	0	37,83	2,48	0	4095	0	100	0	0
convolve2D_00F4A810	7	{ 1024 1024 1}	{ 256 1 1}	1038,83055	0		13	0	3	16384	41427	20402	1	0	0	33,57	2,03	34,88	30981376	0	38,45	1,64	0	4095	0	100	0	0
ReadBuffer	8			1,99185		4096

thanks,

rgds

__kernel void convolve2D( __global float *src, __global float *filter, int filterWidth, int imageWidth, __global float *dest){ int destWidth = get_global_size(0), xOut = get_global_id(0), yOut = get_global_id(1); float sum = 0; for (int y = 0; y < filterWidth; ++y){ int idxFY = y*filterWidth; int idxInY = (yOut + y) * imageWidth + xOut; for (int x = 0; x < filterWidth; ++x){ int idxF = idxFY + x; int idxIn = idxInY + x; sum += filter[idxF]*src[idxIn]; } } int idxOut = yOut * destWidth + xOut; dest[idxOut] = sum; } // "optimized" __kernel void convolve2D(const __global float *src, __constant float *filter, const int filterWidth, const int imageWidth, __global float *dest){ const int destWidth = get_global_size(0), xOut = get_global_id(0), yOut = get_global_id(1); float sum = 0; for (int y = 0; y < filterWidth; ++y){ const int idxFY = y*filterWidth; const int idxInY = (yOut + y) * imageWidth + xOut; for (int x = 0; x < filterWidth; ++x){ const int idxF = idxFY + x; const int idxIn = idxInY + x; sum += filter[idxF]*src[idxIn]; } } const int idxOut = yOut * destWidth + xOut; dest[idxOut] = sum; }

MicahVillmow · ‎01-13-2011

const global and constant are different. const global is memory that exists on the device but is read only and uncached between clauses. constant uses hardware constant caches to load memory efficiently from either the kcache when the index is known at compile time, or via the texture cache.

At the larger filter sizes, you are probably trashing the cache causing memory fetches to go out to RAM.

nano23 · ‎01-13-2011

ahh ok, but why are there more alu-instructions, if i make the index variables const?

MicahVillmow · ‎01-24-2011

constant memory accesses are ALU instructions and not memory instructions.

Archives Discussions

Strange behaviour with const and __constant