alexey.morozov

CAL bug report

Discussion created by alexey.morozov on Dec 8, 2008
Kernel execution failure with sequential run using global buffer

Dear AMD developers,

here is a small program which uses a simple kernel that writes to the global buffer. The program runs the kernel twice. Both runs use the full sequence of initialization, allocation and deallocation of GPU resources (except for closing the device between the runs). For some reason the second run fails: it looks like the kernel does not run at all. You can see this from the program output, where the second run gives all zeros (tested on an HD4870 X2 under Vista and XP 32-bit with SDK v1.2.1 beta).


The use of the global buffer is the key in this case: doing the same with a generic output works fine.

Please test the program on your new SDK release. I hope the bug is not there.

Best regards,

Alexey Morozov


///////////////////////////////////////////////////////////////////
//! Header files
///////////////////////////////////////////////////////////////////
#include <cstdio>
#include <cstring>
#include <string>

#include "cal.h"
#include "calcl.h"
// NOTE(review): _tmain/_TCHAR additionally need <tchar.h> (or a precompiled header) on MSVC — confirm.

// Minimal IL pixel shader used to reproduce the problem (an excerpt of a MatMul kernel).
// Each invocation turns its window coordinate into a linear global-buffer index,
// (4 * y) * cb0[0].y + x, and then stores cb0[0] to four elements that lie
// cb0[0].y elements apart from one another.
std::string kernelIL(
    // --- Declarations -------------------------------------------------------
    "il_ps_2_0\n"
    "dcl_input_position_interp(linear_noperspective) vWinCoord0.xy__\n"
    "dcl_cb cb0[1]\n"    // cb0[0] = [A.width, C.pitch]
    "dcl_resource_id(0)_type(2d,unnorm)_fmtx(float)_fmty(float)_fmtz(float)_fmtw(float)\n"
    "dcl_resource_id(1)_type(2d,unnorm)_fmtx(float)_fmty(float)_fmtz(float)_fmtw(float)\n"
    "dcl_literal l0, 4.0f, 4.0f, 4.0f, 4.0f\n"

    // --- Linear index of the first element written by this invocation ------
    "flr r0.xy, vWinCoord0.xy\n"
    "mul r0.y, r0.y, l0.y\n"               // y is scaled by 4.0
    "ftoi r0, r0\n"
    "ftoi r3, cb0[0]\n"
    "imad r0.x, r0.y, r3.y, r0.x\n"        // r0.x = y * pitch + x (pitch taken from cb0[0].y)

    // --- Four stores, advancing the index by one pitch between them --------
    "mov g[r0.x], cb0[0]\n"
    "iadd r0.x, r0.x, r3.y\n"

    "mov g[r0.x], cb0[0]\n"
    "iadd r0.x, r0.x, r3.y\n"

    "mov g[r0.x], cb0[0]\n"
    "iadd r0.x, r0.x, r3.y\n"

    "mov g[r0.x], cb0[0]\n"

    "end\n");


int _tmain(int argc, _TCHAR* argv[])
{
    CALresult err;

    // Initializing CAL
    calInit();

    //--------------------------------------------------------------
    // Querying and opening device
    //--------------------------------------------------------------
    // Finding number of devices
    CALuint numDevices = 0;
    err = calDeviceGetCount(&numDevices);
    // Opening device
    CALdevice device = 0;
    err = calDeviceOpen(&device, 0);
    // Querying device info
    CALdeviceinfo info;
    err = calDeviceGetInfo(&info, 0);

    // Creating context w.r.t. to opened device
    CALcontext ctx = 0;
    err = calCtxCreate(&ctx, device);

    //-----------------------------------------------------------------
    // Compiling Device Kernel
    //-----------------------------------------------------------------
    CALobject obj = NULL;
    CALimage image = NULL;
    CALlanguage lang = CAL_LANGUAGE_IL;
    std::string kernel = kernelIL;
    std::string kernelType = "IL";
    if (calclCompile(&obj, lang, kernel.c_str(), info.target) !=
        CAL_RESULT_OK)
    {
        fprintf(stdout, "Kernel compilation failed. Exiting.\n");
        return 1;
    }
    if (calclLink(&image, &obj, 1) != CAL_RESULT_OK)
    {
        fprintf(stdout, "Kernel linking failed. Exiting.\n");
        return 1;
    }

    //-------------------------------------------------------------------------
    // Allocating and initializing resources
    //-------------------------------------------------------------------------
    // Input and output resources
    CALresource inputRes1 = 0;
    CALresource inputRes2 = 0;
    CALresource outputRes = 0;
    err = calResAllocLocal2D(&inputRes1, device, 256/4, 256, CAL_FORMAT_FLOAT_4, 0);
    err = calResAllocLocal2D(&inputRes2, device, 256/4, 256, CAL_FORMAT_FLOAT_4, 0);
    err = calResAllocLocal2D(&outputRes, device, 256/4, 256, CAL_FORMAT_FLOAT_4, CAL_RESALLOC_GLOBAL_BUFFER);
    // Constant resource
    CALresource constRes = 0;
    err = calResAllocRemote1D(&constRes, &device, 1, 1, CAL_FORMAT_FLOAT_4, 0);
    // Setup input buffer – map resource to CPU, initialize values, unmap resource
    float* fdata = NULL;
    CALuint pitch = 0;
    CALmem inputMem1 = 0;
    CALmem inputMem2 = 0;

    // Mapping resource to CPU
    err = calResMap((CALvoid**)&fdata, &pitch, inputRes1, 0);
    pitch *= 4;
    for (int i = 0; i < 256; ++i)
    {
        float* tmp = &fdata[i * pitch];
        for (int j = 0; j < 256; ++j)
        {
            tmp[j] = (float)(i * pitch + j);
        }
    }
    calResUnmap(inputRes1);

    err = calResMap((CALvoid**)&fdata, &pitch, inputRes2, 0);
    pitch *= 4;
    for (int i = 0; i < 256; ++i)
    {
        float* tmp = &fdata[i * pitch];
        for (int j = 0; j < 256; ++j)
        {
            tmp[j] = (float)(i * pitch + j);
        }
    }
    calResUnmap(inputRes2);


    // Setup const buffer – map resource to CPU, initialize values, unmap resource
    float* constPtr = NULL;
    CALuint constPitch = 0;
    CALmem constMem = 0;

    err = calResMap((CALvoid**)&constPtr, &constPitch, constRes, 0);
    constPtr[0] = 256.0f/4.0f, constPtr[1] = 256.0f/4.0f;
    constPtr[2] = 0.0f; constPtr[3] = 0.0f;
    calResUnmap(constRes);

    // Mapping output resource to CPU and initializing values
    void* data = NULL;
    // Getting memory handle from resources
    CALmem gbufMem = 0;
    err = calResMap(&data, &pitch, outputRes, 0);
    memset(data, 0, pitch * 256 * sizeof(float));
    calResUnmap(outputRes);

    // Get memory handles for various resources
    err = calCtxGetMem(&constMem, ctx, constRes);  
    err = calCtxGetMem(&inputMem1, ctx, inputRes1);
    err = calCtxGetMem(&inputMem2, ctx, inputRes2);
    err = calCtxGetMem(&gbufMem, ctx, outputRes);

    //-----------------------------------------------------------------
    // Loading module and setting domain
    //-----------------------------------------------------------------
    // Creating module using compiled image
    CALmodule module = 0;
    err = calModuleLoad(&module, ctx, image);
    // Defining symbols in module
    CALfunc func = 0;
    CALname inName1 = 0, inName2 = 0, constName = 0, gbufName = 0;
    // Defining entry point for the module
    err = calModuleGetEntry(&func, ctx, module, "main");
    err = calModuleGetName(&inName1, ctx, module, "i0");
    err = calModuleGetName(&inName2, ctx, module, "i1");
    err = calModuleGetName(&gbufName, ctx, module, "g[]");
    err = calModuleGetName(&constName, ctx, module, "cb0");  
    // Setting input and output buffers
    // used in the kernel
    err = calCtxSetMem(ctx, inName1, inputMem1);
    err = calCtxSetMem(ctx, inName2, inputMem2);
    err = calCtxSetMem(ctx, gbufName, gbufMem);
    err = calCtxSetMem(ctx, constName, constMem);
    // Setting domain
    CALdomain domain = {0, 0, 256/4, 256};

    //-----------------------------------------------------------------
    // Executing kernel and waiting for kernel to terminate
    //-----------------------------------------------------------------
    // Event to check completion of the kernel
    CALevent e = 0;
    err = calCtxRunProgram(&e, ctx, func, &domain);
    // Checking whether the execution of the kernel is complete or not
    while (calCtxIsEventDone(ctx, e) == CAL_RESULT_PENDING);

    // Reading output from output resources
    err = calResMap((CALvoid**)&fdata, &pitch, outputRes, 0);
    for (int i = 0; i < 8; ++i)
    {
        float* tmp = &fdata[i * pitch];
        for(int j = 0; j < 8; ++j)
        {
            printf("%f ", tmp[j]);
        }
        printf("\n");
    }
    calResUnmap(outputRes);

    //-----------------------------------------------------------------
    // Cleaning up
    //-----------------------------------------------------------------
    // Unloading the module
    calModuleUnload(ctx, module);
    // Freeing compiled kernel binary
    calclFreeImage(image);
    calclFreeObject(obj);
    // Releasing resource from context
    calCtxReleaseMem(ctx, inputMem1);
    calCtxReleaseMem(ctx, inputMem2);
    calCtxReleaseMem(ctx, constMem);
    calCtxReleaseMem(ctx, gbufMem);
    // Deallocating resources
    calResFree(outputRes);
    calResFree(constRes);
    calResFree(inputRes1);
    calResFree(inputRes2);
    // Destroying context
    calCtxDestroy(ctx);

/******************************************
 SECOND RETRY WITHOUT CLOSING THE DEVICE!
******************************************/

    // Creating context w.r.t. to opened device
    ctx = 0;
    err = calCtxCreate(&ctx, device);

    //-----------------------------------------------------------------
    // Compiling Device Kernel
    //-----------------------------------------------------------------
    obj = NULL;
    image = NULL;
    lang = CAL_LANGUAGE_IL;
    kernel = kernelIL;
    kernelType = "IL";
    if (calclCompile(&obj, lang, kernel.c_str(), info.target) !=
        CAL_RESULT_OK)
    {
        fprintf(stdout, "Kernel compilation failed. Exiting.\n");
        return 1;
    }
    if (calclLink(&image, &obj, 1) != CAL_RESULT_OK)
    {
        fprintf(stdout, "Kernel linking failed. Exiting.\n");
        return 1;
    }

    //-------------------------------------------------------------------------
    // Allocating and initializing resources
    //-------------------------------------------------------------------------
    // Input and output resources
    inputRes1 = 0;
    inputRes2 = 0;
    outputRes = 0;
    err = calResAllocLocal2D(&inputRes1, device, 256/4, 256, CAL_FORMAT_FLOAT_4, 0);
    err = calResAllocLocal2D(&inputRes2, device, 256/4, 256, CAL_FORMAT_FLOAT_4, 0);
    err = calResAllocLocal2D(&outputRes, device, 256/4, 256, CAL_FORMAT_FLOAT_4, CAL_RESALLOC_GLOBAL_BUFFER);
    // Constant resource
    constRes = 0;
    err = calResAllocRemote1D(&constRes, &device, 1, 1, CAL_FORMAT_FLOAT_4, 0);
    // Setup input buffer – map resource to CPU, initialize values, unmap resource
    fdata = NULL;
    pitch = 0;
    inputMem1 = 0;
    inputMem2 = 0;

    // Mapping resource to CPU
    err = calResMap((CALvoid**)&fdata, &pitch, inputRes1, 0);
    pitch *= 4;
    for (int i = 0; i < 256; ++i)
    {
        float* tmp = &fdata[i * pitch];
        for (int j = 0; j < 256; ++j)
        {
            tmp[j] = (float)(i * pitch + j);
        }
    }
    calResUnmap(inputRes1);

    err = calResMap((CALvoid**)&fdata, &pitch, inputRes2, 0);
    pitch *= 4;
    for (int i = 0; i < 256; ++i)
    {
        float* tmp = &fdata[i * pitch];
        for (int j = 0; j < 256; ++j)
        {
            tmp[j] = (float)(i * pitch + j);
        }
    }
    calResUnmap(inputRes2);


    // Setup const buffer – map resource to CPU, initialize values, unmap resource
    constPtr = NULL;
    constPitch = 0;
    constMem = 0;

    err = calResMap((CALvoid**)&constPtr, &constPitch, constRes, 0);
    constPtr[0] = 256.0f/4.0f, constPtr[1] = 256.0f/4.0f;
    constPtr[2] = 0.0f; constPtr[3] = 0.0f;
    calResUnmap(constRes);

    // Mapping output resource to CPU and initializing values
    data = NULL;
    // Getting memory handle from resources
    gbufMem = 0;
    err = calResMap(&data, &pitch, outputRes, 0);
    memset(data, 0, pitch * 256 * sizeof(float));
    calResUnmap(outputRes);

    // Get memory handles for various resources
    err = calCtxGetMem(&constMem, ctx, constRes);  
    err = calCtxGetMem(&inputMem1, ctx, inputRes1);
    err = calCtxGetMem(&inputMem2, ctx, inputRes2);
    err = calCtxGetMem(&gbufMem, ctx, outputRes);

    //-----------------------------------------------------------------
    // Loading module and setting domain
    //-----------------------------------------------------------------
    // Creating module using compiled image
    module = 0;
    err = calModuleLoad(&module, ctx, image);
    // Defining symbols in module
    func = 0;
    inName1 = 0; inName2 = 0; constName = 0; gbufName = 0;
    // Defining entry point for the module
    err = calModuleGetEntry(&func, ctx, module, "main");
    err = calModuleGetName(&inName1, ctx, module, "i0");
    err = calModuleGetName(&inName2, ctx, module, "i1");
    err = calModuleGetName(&gbufName, ctx, module, "g[]");
    err = calModuleGetName(&constName, ctx, module, "cb0");  
    // Setting input and output buffers
    // used in the kernel
    err = calCtxSetMem(ctx, inName1, inputMem1);
    err = calCtxSetMem(ctx, inName2, inputMem2);
    err = calCtxSetMem(ctx, gbufName, gbufMem);
    err = calCtxSetMem(ctx, constName, constMem);

    // Setting domain
    // kept to be the same {0, 0, 256/4, 256}

    //-----------------------------------------------------------------
    // Executing kernel and waiting for kernel to terminate
    //-----------------------------------------------------------------
    // Event to check completion of the kernel
    e = 0;
    err = calCtxRunProgram(&e, ctx, func, &domain);
    // Checking whether the execution of the kernel is complete or not
    while (calCtxIsEventDone(ctx, e) == CAL_RESULT_PENDING);

    printf("Second retry:\n");

    // Reading output from output resources
    err = calResMap((CALvoid**)&fdata, &pitch, outputRes, 0);
    for (int i = 0; i < 8; ++i)
    {
        float* tmp = &fdata[i * pitch];
        for(int j = 0; j < 8; ++j)
        {
            printf("%f ", tmp[j]);
        }
        printf("\n");
    }
    calResUnmap(outputRes);

    //-----------------------------------------------------------------
    // Cleaning up
    //-----------------------------------------------------------------
    // Unloading the module
    calModuleUnload(ctx, module);
    // Freeing compiled kernel binary
    calclFreeImage(image);
    calclFreeObject(obj);
    // Releasing resource from context
    calCtxReleaseMem(ctx, inputMem1);
    calCtxReleaseMem(ctx, inputMem2);
    calCtxReleaseMem(ctx, constMem);
    calCtxReleaseMem(ctx, gbufMem);
    // Deallocating resources
    calResFree(outputRes);
    calResFree(constRes);
    calResFree(inputRes1);
    calResFree(inputRes2);
    // Destroying context
    calCtxDestroy(ctx);


    // Closing device
    calDeviceClose(device);
    // Shutting down CAL
    calShutdown();

    getchar();

    return 0;

}

Outcomes