0 Replies Latest reply on Dec 8, 2008 6:16 AM by alexey.morozov

    CAL bug report

    alexey.morozov
      Kernel execution failure with sequential run using global buffer

      Dear AMD developers,

      here is a small program that uses a simple kernel writing to the global buffer. The program runs the kernel twice. Both runs go through the full sequence of initialization, allocation, and deallocation of GPU resources (except that the device is not closed between the runs). For some reason the second run fails: it looks as if the kernel does not run at all. You can see this from the program output: the second run gives all zeros (tested on an HD4870 X2 under Vista and XP 32-bit with SDK v1.2.1 beta).


      The use of the global buffer is the key in this case. If the same is done with a generic output, everything works fine.

      Please test the program on your new SDK release. I hope the bug is not there.

      Best regards,

      Alexey Morozov


      ///////////////////////////////////////////////////////////////////
      //! Header files
      ///////////////////////////////////////////////////////////////////
      #include "cal.h"
      #include "calcl.h"
      #include <cstdio>
      #include <cstring>
      #include <string>

      // IL pixel-shader source for the reproduction kernel (a cut-down part of a
      // MatMul kernel). Each invocation converts its window coordinate into a
      // linear global-buffer index and then stores cb0[0] into g[] four times,
      // advancing the index by the pitch (cb0[0].y) between stores.
      std::string kernelIL(
          // Declarations. cb0[0] carries [A.width, C.pitch].
          "il_ps_2_0\n"
          "dcl_input_position_interp(linear_noperspective) vWinCoord0.xy__\n"
          "dcl_cb cb0[1]\n"
          "dcl_resource_id(0)_type(2d,unnorm)_fmtx(float)_fmty(float)_fmtz(float)_fmtw(float)\n"
          "dcl_resource_id(1)_type(2d,unnorm)_fmtx(float)_fmty(float)_fmtz(float)_fmtw(float)\n"
          "dcl_literal l0, 4.0f, 4.0f, 4.0f, 4.0f\n"

          // Linearise the [x, y] position: y is scaled by 4, then
          // index := y * pitch + x, where pitch is the alignment pitch in cb0[0].y.
          "flr r0.xy, vWinCoord0.xy\n"
          "mul r0.y, r0.y, l0.y\n"
          "ftoi r0, r0\n"
          "ftoi r3, cb0[0]\n"
          "imad r0.x, r0.y, r3.y, r0.x\n"

          // Store 1, then index := index + pitch.
          "mov g[r0.x], cb0[0]\n"
          "iadd r0.x, r0.x, r3.y\n"

          // Store 2, then index := index + pitch.
          "mov g[r0.x], cb0[0]\n"
          "iadd r0.x, r0.x, r3.y\n"

          // Store 3, then index := index + pitch.
          "mov g[r0.x], cb0[0]\n"
          "iadd r0.x, r0.x, r3.y\n"

          // Store 4 (last one, no further index update).
          "mov g[r0.x], cb0[0]\n"

          "end\n");


      int _tmain(int argc, _TCHAR* argv[])
      {
          CALresult err;

          // Initializing CAL
          calInit();

          //--------------------------------------------------------------
          // Querying and opening device
          //--------------------------------------------------------------
          // Finding number of devices
          CALuint numDevices = 0;
          err = calDeviceGetCount(&numDevices);
          // Opening device
          CALdevice device = 0;
          err = calDeviceOpen(&device, 0);
          // Querying device info
          CALdeviceinfo info;
          err = calDeviceGetInfo(&info, 0);

          // Creating context w.r.t. to opened device
          CALcontext ctx = 0;
          err = calCtxCreate(&ctx, device);

          //-----------------------------------------------------------------
          // Compiling Device Kernel
          //-----------------------------------------------------------------
          CALobject obj = NULL;
          CALimage image = NULL;
          CALlanguage lang = CAL_LANGUAGE_IL;
          std::string kernel = kernelIL;
          std::string kernelType = "IL";
          if (calclCompile(&obj, lang, kernel.c_str(), info.target) !=
              CAL_RESULT_OK)
          {
              fprintf(stdout, "Kernel compilation failed. Exiting.\n");
              return 1;
          }
          if (calclLink(&image, &obj, 1) != CAL_RESULT_OK)
          {
              fprintf(stdout, "Kernel linking failed. Exiting.\n");
              return 1;
          }

          //-------------------------------------------------------------------------
          // Allocating and initializing resources
          //-------------------------------------------------------------------------
          // Input and output resources
          CALresource inputRes1 = 0;
          CALresource inputRes2 = 0;
          CALresource outputRes = 0;
          err = calResAllocLocal2D(&inputRes1, device, 256/4, 256, CAL_FORMAT_FLOAT_4, 0);
          err = calResAllocLocal2D(&inputRes2, device, 256/4, 256, CAL_FORMAT_FLOAT_4, 0);
          err = calResAllocLocal2D(&outputRes, device, 256/4, 256, CAL_FORMAT_FLOAT_4, CAL_RESALLOC_GLOBAL_BUFFER);
          // Constant resource
          CALresource constRes = 0;
          err = calResAllocRemote1D(&constRes, &device, 1, 1, CAL_FORMAT_FLOAT_4, 0);
          // Setup input buffer – map resource to CPU, initialize values, unmap resource
          float* fdata = NULL;
          CALuint pitch = 0;
          CALmem inputMem1 = 0;
          CALmem inputMem2 = 0;

          // Mapping resource to CPU
          err = calResMap((CALvoid**)&fdata, &pitch, inputRes1, 0);
          pitch *= 4;
          for (int i = 0; i < 256; ++i)
          {
              float* tmp = &fdata[i * pitch];
              for (int j = 0; j < 256; ++j)
              {
                  tmp[j] = (float)(i * pitch + j);
              }
          }
          calResUnmap(inputRes1);

          err = calResMap((CALvoid**)&fdata, &pitch, inputRes2, 0);
          pitch *= 4;
          for (int i = 0; i < 256; ++i)
          {
              float* tmp = &fdata[i * pitch];
              for (int j = 0; j < 256; ++j)
              {
                  tmp[j] = (float)(i * pitch + j);
              }
          }
          calResUnmap(inputRes2);


          // Setup const buffer – map resource to CPU, initialize values, unmap resource
          float* constPtr = NULL;
          CALuint constPitch = 0;
          CALmem constMem = 0;

          err = calResMap((CALvoid**)&constPtr, &constPitch, constRes, 0);
          constPtr[0] = 256.0f/4.0f, constPtr[1] = 256.0f/4.0f;
          constPtr[2] = 0.0f; constPtr[3] = 0.0f;
          calResUnmap(constRes);

          // Mapping output resource to CPU and initializing values
          void* data = NULL;
          // Getting memory handle from resources
          CALmem gbufMem = 0;
          err = calResMap(&data, &pitch, outputRes, 0);
          memset(data, 0, pitch * 256 * sizeof(float));
          calResUnmap(outputRes);

          // Get memory handles for various resources
          err = calCtxGetMem(&constMem, ctx, constRes);  
          err = calCtxGetMem(&inputMem1, ctx, inputRes1);
          err = calCtxGetMem(&inputMem2, ctx, inputRes2);
          err = calCtxGetMem(&gbufMem, ctx, outputRes);

          //-----------------------------------------------------------------
          // Loading module and setting domain
          //-----------------------------------------------------------------
          // Creating module using compiled image
          CALmodule module = 0;
          err = calModuleLoad(&module, ctx, image);
          // Defining symbols in module
          CALfunc func = 0;
          CALname inName1 = 0, inName2 = 0, constName = 0, gbufName = 0;
          // Defining entry point for the module
          err = calModuleGetEntry(&func, ctx, module, "main");
          err = calModuleGetName(&inName1, ctx, module, "i0");
          err = calModuleGetName(&inName2, ctx, module, "i1");
          err = calModuleGetName(&gbufName, ctx, module, "g[]");
          err = calModuleGetName(&constName, ctx, module, "cb0");  
          // Setting input and output buffers
          // used in the kernel
          err = calCtxSetMem(ctx, inName1, inputMem1);
          err = calCtxSetMem(ctx, inName2, inputMem2);
          err = calCtxSetMem(ctx, gbufName, gbufMem);
          err = calCtxSetMem(ctx, constName, constMem);
          // Setting domain
          CALdomain domain = {0, 0, 256/4, 256};

          //-----------------------------------------------------------------
          // Executing kernel and waiting for kernel to terminate
          //-----------------------------------------------------------------
          // Event to check completion of the kernel
          CALevent e = 0;
          err = calCtxRunProgram(&e, ctx, func, &domain);
          // Checking whether the execution of the kernel is complete or not
          while (calCtxIsEventDone(ctx, e) == CAL_RESULT_PENDING);

          // Reading output from output resources
          err = calResMap((CALvoid**)&fdata, &pitch, outputRes, 0);
          for (int i = 0; i < 8; ++i)
          {
              float* tmp = &fdata[i * pitch];
              for(int j = 0; j < 8; ++j)
              {
                  printf("%f ", tmp[j]);
              }
              printf("\n");
          }
          calResUnmap(outputRes);

          //-----------------------------------------------------------------
          // Cleaning up
          //-----------------------------------------------------------------
          // Unloading the module
          calModuleUnload(ctx, module);
          // Freeing compiled kernel binary
          calclFreeImage(image);
          calclFreeObject(obj);
          // Releasing resource from context
          calCtxReleaseMem(ctx, inputMem1);
          calCtxReleaseMem(ctx, inputMem2);
          calCtxReleaseMem(ctx, constMem);
          calCtxReleaseMem(ctx, gbufMem);
          // Deallocating resources
          calResFree(outputRes);
          calResFree(constRes);
          calResFree(inputRes1);
          calResFree(inputRes2);
          // Destroying context
          calCtxDestroy(ctx);

      /******************************************
       SECOND RETRY WITHOUT CLOSING THE DEVICE!
      ******************************************/

          // Creating context w.r.t. to opened device
          ctx = 0;
          err = calCtxCreate(&ctx, device);

          //-----------------------------------------------------------------
          // Compiling Device Kernel
          //-----------------------------------------------------------------
          obj = NULL;
          image = NULL;
          lang = CAL_LANGUAGE_IL;
          kernel = kernelIL;
          kernelType = "IL";
          if (calclCompile(&obj, lang, kernel.c_str(), info.target) !=
              CAL_RESULT_OK)
          {
              fprintf(stdout, "Kernel compilation failed. Exiting.\n");
              return 1;
          }
          if (calclLink(&image, &obj, 1) != CAL_RESULT_OK)
          {
              fprintf(stdout, "Kernel linking failed. Exiting.\n");
              return 1;
          }

          //-------------------------------------------------------------------------
          // Allocating and initializing resources
          //-------------------------------------------------------------------------
          // Input and output resources
          inputRes1 = 0;
          inputRes2 = 0;
          outputRes = 0;
          err = calResAllocLocal2D(&inputRes1, device, 256/4, 256, CAL_FORMAT_FLOAT_4, 0);
          err = calResAllocLocal2D(&inputRes2, device, 256/4, 256, CAL_FORMAT_FLOAT_4, 0);
          err = calResAllocLocal2D(&outputRes, device, 256/4, 256, CAL_FORMAT_FLOAT_4, CAL_RESALLOC_GLOBAL_BUFFER);
          // Constant resource
          constRes = 0;
          err = calResAllocRemote1D(&constRes, &device, 1, 1, CAL_FORMAT_FLOAT_4, 0);
          // Setup input buffer – map resource to CPU, initialize values, unmap resource
          fdata = NULL;
          pitch = 0;
          inputMem1 = 0;
          inputMem2 = 0;

          // Mapping resource to CPU
          err = calResMap((CALvoid**)&fdata, &pitch, inputRes1, 0);
          pitch *= 4;
          for (int i = 0; i < 256; ++i)
          {
              float* tmp = &fdata[i * pitch];
              for (int j = 0; j < 256; ++j)
              {
                  tmp[j] = (float)(i * pitch + j);
              }
          }
          calResUnmap(inputRes1);

          err = calResMap((CALvoid**)&fdata, &pitch, inputRes2, 0);
          pitch *= 4;
          for (int i = 0; i < 256; ++i)
          {
              float* tmp = &fdata[i * pitch];
              for (int j = 0; j < 256; ++j)
              {
                  tmp[j] = (float)(i * pitch + j);
              }
          }
          calResUnmap(inputRes2);


          // Setup const buffer – map resource to CPU, initialize values, unmap resource
          constPtr = NULL;
          constPitch = 0;
          constMem = 0;

          err = calResMap((CALvoid**)&constPtr, &constPitch, constRes, 0);
          constPtr[0] = 256.0f/4.0f, constPtr[1] = 256.0f/4.0f;
          constPtr[2] = 0.0f; constPtr[3] = 0.0f;
          calResUnmap(constRes);

          // Mapping output resource to CPU and initializing values
          data = NULL;
          // Getting memory handle from resources
          gbufMem = 0;
          err = calResMap(&data, &pitch, outputRes, 0);
          memset(data, 0, pitch * 256 * sizeof(float));
          calResUnmap(outputRes);

          // Get memory handles for various resources
          err = calCtxGetMem(&constMem, ctx, constRes);  
          err = calCtxGetMem(&inputMem1, ctx, inputRes1);
          err = calCtxGetMem(&inputMem2, ctx, inputRes2);
          err = calCtxGetMem(&gbufMem, ctx, outputRes);

          //-----------------------------------------------------------------
          // Loading module and setting domain
          //-----------------------------------------------------------------
          // Creating module using compiled image
          module = 0;
          err = calModuleLoad(&module, ctx, image);
          // Defining symbols in module
          func = 0;
          inName1 = 0; inName2 = 0; constName = 0; gbufName = 0;
          // Defining entry point for the module
          err = calModuleGetEntry(&func, ctx, module, "main");
          err = calModuleGetName(&inName1, ctx, module, "i0");
          err = calModuleGetName(&inName2, ctx, module, "i1");
          err = calModuleGetName(&gbufName, ctx, module, "g[]");
          err = calModuleGetName(&constName, ctx, module, "cb0");  
          // Setting input and output buffers
          // used in the kernel
          err = calCtxSetMem(ctx, inName1, inputMem1);
          err = calCtxSetMem(ctx, inName2, inputMem2);
          err = calCtxSetMem(ctx, gbufName, gbufMem);
          err = calCtxSetMem(ctx, constName, constMem);

          // Setting domain
          // kept to be the same {0, 0, 256/4, 256}

          //-----------------------------------------------------------------
          // Executing kernel and waiting for kernel to terminate
          //-----------------------------------------------------------------
          // Event to check completion of the kernel
          e = 0;
          err = calCtxRunProgram(&e, ctx, func, &domain);
          // Checking whether the execution of the kernel is complete or not
          while (calCtxIsEventDone(ctx, e) == CAL_RESULT_PENDING);

          printf("Second retry:\n");

          // Reading output from output resources
          err = calResMap((CALvoid**)&fdata, &pitch, outputRes, 0);
          for (int i = 0; i < 8; ++i)
          {
              float* tmp = &fdata[i * pitch];
              for(int j = 0; j < 8; ++j)
              {
                  printf("%f ", tmp[j]);
              }
              printf("\n");
          }
          calResUnmap(outputRes);

          //-----------------------------------------------------------------
          // Cleaning up
          //-----------------------------------------------------------------
          // Unloading the module
          calModuleUnload(ctx, module);
          // Freeing compiled kernel binary
          calclFreeImage(image);
          calclFreeObject(obj);
          // Releasing resource from context
          calCtxReleaseMem(ctx, inputMem1);
          calCtxReleaseMem(ctx, inputMem2);
          calCtxReleaseMem(ctx, constMem);
          calCtxReleaseMem(ctx, gbufMem);
          // Deallocating resources
          calResFree(outputRes);
          calResFree(constRes);
          calResFree(inputRes1);
          calResFree(inputRes2);
          // Destroying context
          calCtxDestroy(ctx);


          // Closing device
          calDeviceClose(device);
          // Shutting down CAL
          calShutdown();

          getchar();

          return 0;

      }