Dear AMD developers,
Here is a small program that uses a simple kernel writing to the global buffer. The program runs the kernel twice. Both runs perform the full sequence of initialization, allocation, and deallocation of GPU resources (except that the device is not closed between the runs). For some reason the second run fails: it looks as if the kernel does not run at all. You can see this from the program output: the second run prints all zeros (tested on an HD4870 X2 under Vista and XP 32-bit with SDK v1.2.1 beta).
The use of the global buffer is the key factor in this case. If I do the same with a generic output, everything works fine.
Please test the program on your new SDK release. I hope the bug is not there.
Best regards,
Alexey Morozov
///////////////////////////////////////////////////////////////////
//! Header files
///////////////////////////////////////////////////////////////////
#include "cal.h"
#include "calcl.h"
#include
// Dummy IL Kernel writing data to the global buffer (a part of my MatMul kernel)
// Dummy IL pixel-shader kernel (a fragment of a MatMul kernel) that writes to
// the global buffer g[].
//
// Constant buffer layout: cb0[0] = [A.width, C.pitch, -, -].
// Per work item the kernel computes the linear index
//     index = (4 * floor(y)) * pitch + floor(x)
// (pitch is taken from cb0[0].y, converted to integer in r3.y), then stores
// cb0[0] at g[index], g[index + pitch], g[index + 2*pitch], g[index + 3*pitch].
std::string kernelIL =
    // ---- declarations ----
    "il_ps_2_0\n"
    "dcl_input_position_interp(linear_noperspective) vWinCoord0.xy__\n"
    "dcl_cb cb0[1]\n"
    "dcl_resource_id(0)_type(2d,unnorm)_fmtx(float)_fmty(float)_fmtz(float)_fmtw(float)\n"
    "dcl_resource_id(1)_type(2d,unnorm)_fmtx(float)_fmty(float)_fmtz(float)_fmtw(float)\n"
    "dcl_literal l0, 4.0f, 4.0f, 4.0f, 4.0f\n"
    // ---- [x, y] position -> linear index (y scaled by 4, pitch-aligned) ----
    "flr r0.xy, vWinCoord0.xy\n"
    "mul r0.y, r0.y, l0.y\n"
    "ftoi r0, r0\n"
    "ftoi r3, cb0[0]\n"
    "imad r0.x, r0.y, r3.y, r0.x\n"
    // ---- four stores, stepping the index by one pitch between them ----
    "mov g[r0.x], cb0[0]\n" "iadd r0.x, r0.x, r3.y\n"
    "mov g[r0.x], cb0[0]\n" "iadd r0.x, r0.x, r3.y\n"
    "mov g[r0.x], cb0[0]\n" "iadd r0.x, r0.x, r3.y\n"
    "mov g[r0.x], cb0[0]\n"
    "end\n";
// Reproduction program: runs the kernelIL kernel twice on device 0.
//
// Each run creates a context, compiles and links the kernel, allocates two
// input resources, one global-buffer output resource and one constant
// resource, executes the kernel over a 64 x 256 domain, prints the top-left
// 8 x 8 values of the output, and then releases everything it created. The
// device is opened once before the first run and closed once after the
// second; only the context and the resources are recreated.
//
// Returns 1 if compiling or linking the kernel fails, 0 otherwise.
//
// NOTE(review): `err` is assigned from nearly every CAL call but is never
// inspected, so any failure other than compile/link goes unnoticed.
// NOTE(review): several statements below (the bare `tmp` lines and the
// `printf("%f ", tmp` lines) are truncated in this copy of the source; the
// function does not compile as shown.
int _tmain(int argc, _TCHAR* argv[])
{
// Holds the result of the most recent CAL call (never checked).
CALresult err;
// Initializing CAL
calInit();
//--------------------------------------------------------------
// Querying and opening device
//--------------------------------------------------------------
// Finding number of devices (the count is not used afterwards)
CALuint numDevices = 0;
err = calDeviceGetCount(&numDevices);
// Opening device with ordinal 0
CALdevice device = 0;
err = calDeviceOpen(&device, 0);
// Querying device info; info.target is passed to the compiler below
CALdeviceinfo info;
err = calDeviceGetInfo(&info, 0);
// Creating context w.r.t. to opened device
CALcontext ctx = 0;
err = calCtxCreate(&ctx, device);
//-----------------------------------------------------------------
// Compiling Device Kernel
//-----------------------------------------------------------------
CALobject obj = NULL;
CALimage image = NULL;
CALlanguage lang = CAL_LANGUAGE_IL;
std::string kernel = kernelIL;
// kernelType is assigned but never read in this function.
std::string kernelType = "IL";
// NOTE(review): both early returns below leave the context, the device and
// CAL itself without their matching destroy/close/shutdown calls.
if (calclCompile(&obj, lang, kernel.c_str(), info.target) !=
CAL_RESULT_OK)
{
fprintf(stdout, "Kernel compilation failed. Exiting.\n");
return 1;
}
if (calclLink(&image, &obj, 1) != CAL_RESULT_OK)
{
fprintf(stdout, "Kernel linking failed. Exiting.\n");
return 1;
}
//-------------------------------------------------------------------------
// Allocating and initializing resources
//-------------------------------------------------------------------------
// Input and output resources: 64 x 256 elements of FLOAT_4 each.
// Only the output resource is allocated with CAL_RESALLOC_GLOBAL_BUFFER.
CALresource inputRes1 = 0;
CALresource inputRes2 = 0;
CALresource outputRes = 0;
err = calResAllocLocal2D(&inputRes1, device, 256/4, 256, CAL_FORMAT_FLOAT_4, 0);
err = calResAllocLocal2D(&inputRes2, device, 256/4, 256, CAL_FORMAT_FLOAT_4, 0);
err = calResAllocLocal2D(&outputRes, device, 256/4, 256, CAL_FORMAT_FLOAT_4, CAL_RESALLOC_GLOBAL_BUFFER);
// Constant resource: a single FLOAT_4 element in remote memory
CALresource constRes = 0;
err = calResAllocRemote1D(&constRes, &device, 1, 1, CAL_FORMAT_FLOAT_4, 0);
// Setup input buffer – map resource to CPU, initialize values, unmap resource
float* fdata = NULL;
CALuint pitch = 0;
CALmem inputMem1 = 0;
CALmem inputMem2 = 0;
// Mapping resource to CPU
err = calResMap((CALvoid**)&fdata, &pitch, inputRes1, 0);
// Scale the pitch by 4 so it can index individual floats
// (presumably calResMap reports it in FLOAT_4 elements - TODO confirm).
pitch *= 4;
for (int i = 0; i < 256; ++i)
{
// Start of row i in the mapped input
float* tmp = &fdata[i * pitch];
for (int j = 0; j < 256; ++j)
{
// NOTE(review): statement truncated in this copy of the source; the
// original presumably assigned a value to element j of the row.
tmp
}
}
calResUnmap(inputRes1);
// Same initialization for the second input
err = calResMap((CALvoid**)&fdata, &pitch, inputRes2, 0);
pitch *= 4;
for (int i = 0; i < 256; ++i)
{
float* tmp = &fdata[i * pitch];
for (int j = 0; j < 256; ++j)
{
// NOTE(review): statement truncated in this copy of the source.
tmp
}
}
calResUnmap(inputRes2);
// Setup const buffer – map resource to CPU, initialize values, unmap resource
float* constPtr = NULL;
CALuint constPitch = 0;
CALmem constMem = 0;
err = calResMap((CALvoid**)&constPtr, &constPitch, constRes, 0);
// cb0[0] = [64, 64, 0, 0]; the kernel's cb0 comment labels the first two
// components [A.width, C.pitch].
constPtr[0] = 256.0f/4.0f, constPtr[1] = 256.0f/4.0f;
constPtr[2] = 0.0f; constPtr[3] = 0.0f;
calResUnmap(constRes);
// Mapping output resource to CPU and initializing values
void* data = NULL;
// Getting memory handle from resources
CALmem gbufMem = 0;
err = calResMap(&data, &pitch, outputRes, 0);
// NOTE(review): unlike the input loops, pitch is not multiplied by 4 here.
// If the pitch is in FLOAT_4 elements, this zeroes only a quarter of the
// mapped rows - confirm the intended size.
memset(data, 0, pitch * 256 * sizeof(float));
calResUnmap(outputRes);
// Get memory handles for various resources
err = calCtxGetMem(&constMem, ctx, constRes);
err = calCtxGetMem(&inputMem1, ctx, inputRes1);
err = calCtxGetMem(&inputMem2, ctx, inputRes2);
err = calCtxGetMem(&gbufMem, ctx, outputRes);
//-----------------------------------------------------------------
// Loading module and setting domain
//-----------------------------------------------------------------
// Creating module using compiled image
CALmodule module = 0;
err = calModuleLoad(&module, ctx, image);
// Defining symbols in module
CALfunc func = 0;
CALname inName1 = 0, inName2 = 0, constName = 0, gbufName = 0;
// Defining entry point for the module
err = calModuleGetEntry(&func, ctx, module, "main");
// Look up the kernel's symbols: inputs "i0"/"i1", global buffer "g[]",
// constant buffer "cb0"
err = calModuleGetName(&inName1, ctx, module, "i0");
err = calModuleGetName(&inName2, ctx, module, "i1");
err = calModuleGetName(&gbufName, ctx, module, "g[]");
err = calModuleGetName(&constName, ctx, module, "cb0");
// Setting input and output buffers
// used in the kernel
err = calCtxSetMem(ctx, inName1, inputMem1);
err = calCtxSetMem(ctx, inName2, inputMem2);
err = calCtxSetMem(ctx, gbufName, gbufMem);
err = calCtxSetMem(ctx, constName, constMem);
// Setting domain: 64 x 256 starting at (0, 0); reused unchanged by the
// second run
CALdomain domain = {0, 0, 256/4, 256};
//-----------------------------------------------------------------
// Executing kernel and waiting for kernel to terminate
//-----------------------------------------------------------------
// Event to check completion of the kernel
CALevent e = 0;
err = calCtxRunProgram(&e, ctx, func, &domain);
// Busy-wait until the event is no longer pending
while (calCtxIsEventDone(ctx, e) == CAL_RESULT_PENDING);
// Reading output from output resources: print the top-left 8 x 8 values
err = calResMap((CALvoid**)&fdata, &pitch, outputRes, 0);
// NOTE(review): pitch is used here without the "*= 4" applied when indexing
// the inputs as floats, so the row stride may be a quarter of what is
// intended - confirm.
for (int i = 0; i < 8; ++i)
{
float* tmp = &fdata[i * pitch];
for(int j = 0; j < 8; ++j)
{
// NOTE(review): statement truncated in this copy of the source; the
// original presumably printed element j of the row.
printf("%f ", tmp
}
printf("\n");
}
calResUnmap(outputRes);
//-----------------------------------------------------------------
// Cleaning up
//-----------------------------------------------------------------
// Unloading the module
calModuleUnload(ctx, module);
// Freeing compiled kernel binary
calclFreeImage(image);
calclFreeObject(obj);
// Releasing resource from context
calCtxReleaseMem(ctx, inputMem1);
calCtxReleaseMem(ctx, inputMem2);
calCtxReleaseMem(ctx, constMem);
calCtxReleaseMem(ctx, gbufMem);
// Deallocating resources
calResFree(outputRes);
calResFree(constRes);
calResFree(inputRes1);
calResFree(inputRes2);
// Destroying context
calCtxDestroy(ctx);
/******************************************
SECOND RETRY WITHOUT CLOSING THE DEVICE!
******************************************/
// Everything from here to the final cleanup repeats the first run using the
// same variables, reset to 0/NULL before reuse. The device handle, `info`
// and `domain` are carried over from the first run.
// Creating context w.r.t. to opened device
ctx = 0;
err = calCtxCreate(&ctx, device);
//-----------------------------------------------------------------
// Compiling Device Kernel
//-----------------------------------------------------------------
obj = NULL;
image = NULL;
lang = CAL_LANGUAGE_IL;
kernel = kernelIL;
kernelType = "IL";
// NOTE(review): as in the first run, these early returns skip all cleanup.
if (calclCompile(&obj, lang, kernel.c_str(), info.target) !=
CAL_RESULT_OK)
{
fprintf(stdout, "Kernel compilation failed. Exiting.\n");
return 1;
}
if (calclLink(&image, &obj, 1) != CAL_RESULT_OK)
{
fprintf(stdout, "Kernel linking failed. Exiting.\n");
return 1;
}
//-------------------------------------------------------------------------
// Allocating and initializing resources
//-------------------------------------------------------------------------
// Input and output resources (same sizes, formats and flags as the first run)
inputRes1 = 0;
inputRes2 = 0;
outputRes = 0;
err = calResAllocLocal2D(&inputRes1, device, 256/4, 256, CAL_FORMAT_FLOAT_4, 0);
err = calResAllocLocal2D(&inputRes2, device, 256/4, 256, CAL_FORMAT_FLOAT_4, 0);
err = calResAllocLocal2D(&outputRes, device, 256/4, 256, CAL_FORMAT_FLOAT_4, CAL_RESALLOC_GLOBAL_BUFFER);
// Constant resource
constRes = 0;
err = calResAllocRemote1D(&constRes, &device, 1, 1, CAL_FORMAT_FLOAT_4, 0);
// Setup input buffer – map resource to CPU, initialize values, unmap resource
fdata = NULL;
pitch = 0;
inputMem1 = 0;
inputMem2 = 0;
// Mapping resource to CPU
err = calResMap((CALvoid**)&fdata, &pitch, inputRes1, 0);
// Scale the pitch by 4 so it can index individual floats
pitch *= 4;
for (int i = 0; i < 256; ++i)
{
float* tmp = &fdata[i * pitch];
for (int j = 0; j < 256; ++j)
{
// NOTE(review): statement truncated in this copy of the source.
tmp
}
}
calResUnmap(inputRes1);
err = calResMap((CALvoid**)&fdata, &pitch, inputRes2, 0);
pitch *= 4;
for (int i = 0; i < 256; ++i)
{
float* tmp = &fdata[i * pitch];
for (int j = 0; j < 256; ++j)
{
// NOTE(review): statement truncated in this copy of the source.
tmp
}
}
calResUnmap(inputRes2);
// Setup const buffer – map resource to CPU, initialize values, unmap resource
constPtr = NULL;
constPitch = 0;
constMem = 0;
err = calResMap((CALvoid**)&constPtr, &constPitch, constRes, 0);
// cb0[0] = [64, 64, 0, 0], same as the first run
constPtr[0] = 256.0f/4.0f, constPtr[1] = 256.0f/4.0f;
constPtr[2] = 0.0f; constPtr[3] = 0.0f;
calResUnmap(constRes);
// Mapping output resource to CPU and initializing values
data = NULL;
// Getting memory handle from resources
gbufMem = 0;
err = calResMap(&data, &pitch, outputRes, 0);
// NOTE(review): pitch is not scaled by 4 here either; see the same memset
// in the first run.
memset(data, 0, pitch * 256 * sizeof(float));
calResUnmap(outputRes);
// Get memory handles for various resources
err = calCtxGetMem(&constMem, ctx, constRes);
err = calCtxGetMem(&inputMem1, ctx, inputRes1);
err = calCtxGetMem(&inputMem2, ctx, inputRes2);
err = calCtxGetMem(&gbufMem, ctx, outputRes);
//-----------------------------------------------------------------
// Loading module and setting domain
//-----------------------------------------------------------------
// Creating module using compiled image
module = 0;
err = calModuleLoad(&module, ctx, image);
// Defining symbols in module
func = 0;
inName1 = 0; inName2 = 0; constName = 0; gbufName = 0;
// Defining entry point for the module
err = calModuleGetEntry(&func, ctx, module, "main");
err = calModuleGetName(&inName1, ctx, module, "i0");
err = calModuleGetName(&inName2, ctx, module, "i1");
err = calModuleGetName(&gbufName, ctx, module, "g[]");
err = calModuleGetName(&constName, ctx, module, "cb0");
// Setting input and output buffers
// used in the kernel
err = calCtxSetMem(ctx, inName1, inputMem1);
err = calCtxSetMem(ctx, inName2, inputMem2);
err = calCtxSetMem(ctx, gbufName, gbufMem);
err = calCtxSetMem(ctx, constName, constMem);
// Setting domain
// kept to be the same {0, 0, 256/4, 256}
//-----------------------------------------------------------------
// Executing kernel and waiting for kernel to terminate
//-----------------------------------------------------------------
// Event to check completion of the kernel
e = 0;
err = calCtxRunProgram(&e, ctx, func, &domain);
// Busy-wait until the event is no longer pending.
// NOTE(review): if calCtxRunProgram failed, `e` is still 0 and `err` is not
// checked, so a kernel that never ran would go undetected here.
while (calCtxIsEventDone(ctx, e) == CAL_RESULT_PENDING);
printf("Second retry:\n");
// Reading output from output resources: print the top-left 8 x 8 values
err = calResMap((CALvoid**)&fdata, &pitch, outputRes, 0);
for (int i = 0; i < 8; ++i)
{
float* tmp = &fdata[i * pitch];
for(int j = 0; j < 8; ++j)
{
// NOTE(review): statement truncated in this copy of the source.
printf("%f ", tmp
}
printf("\n");
}
calResUnmap(outputRes);
//-----------------------------------------------------------------
// Cleaning up
//-----------------------------------------------------------------
// Unloading the module
calModuleUnload(ctx, module);
// Freeing compiled kernel binary
calclFreeImage(image);
calclFreeObject(obj);
// Releasing resource from context
calCtxReleaseMem(ctx, inputMem1);
calCtxReleaseMem(ctx, inputMem2);
calCtxReleaseMem(ctx, constMem);
calCtxReleaseMem(ctx, gbufMem);
// Deallocating resources
calResFree(outputRes);
calResFree(constRes);
calResFree(inputRes1);
calResFree(inputRes2);
// Destroying context
calCtxDestroy(ctx);
// Closing device (opened once, before the first run)
calDeviceClose(device);
// Shutting down CAL
calShutdown();
// Wait for a key press so the console output stays visible
getchar();
return 0;
}