10 Replies Latest reply on Feb 20, 2010 4:35 AM by ankurdh

    CL_INVALID_KERNEL_ARGS error

    ankurdh

      I'm getting the CL_INVALID_KERNEL_ARGS error, and I don't understand what is happening.

      I've created the context, command queue, kernel object, program object, etc. correctly.

      Here is the code. 

      #include<cstdio> #include<cstdlib> #include<iostream> #define __NO_STD_STRING #define __NO_STD_VECTOR #include<SDKUtil/SDKCommon.hpp> #include<SDKUtil/SDKFile.hpp> #include<CL/cl.hpp> using namespace cl; inline void check(cl_int err, char * errMsg,char * successMsg){ if(err != CL_SUCCESS){ std::cout<<errMsg<<": "<<err; std::getchar(); std::exit(-1); }else std::cout<<successMsg<<std::endl; } int main(){ streamsdk::SDKCommon * toolkit = new streamsdk::SDKCommon(); cl::vector<cl::Platform> platform; cl_int err = cl::Platform::get(&platform); check(err,"Platforms could not be queried","Platforms queried successfully"); /* cl::vector<cl::Platform>::iterator platformIterator; for(platformIterator = platform.begin() ; platformIterator != platform.end() ; ++platformIterator){ std::cout<<"Platform Name: "<<(*platformIterator).getInfo<CL_PLATFORM_NAME>().c_str()<<std::endl; std::cout<<"Platform Vendor: "<<(*platformIterator).getInfo<CL_PLATFORM_VENDOR>().c_str()<<std::endl; } */ cl_context_properties cps [3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(*platform.begin())(), 0 }; cl::Context context(CL_DEVICE_TYPE_GPU,cps,NULL,NULL,&err); check(err,"Context could not be created","Context created successfully\n"); cl::vector<cl::Device> device; device = context.getInfo<CL_CONTEXT_DEVICES>(); /*cl::vector<cl::Device>::iterator deviceIterator; std::cout<<"\nNo of identified devices: "<<device.size()<<std::endl; if(device.size() == 1){ std::cout<<"\nDevice Name: "<<(*device.begin()).getInfo<CL_DEVICE_NAME>().c_str()<<std::endl; std::cout<<"Device Vendor: "<<(*device.begin()).getInfo<CL_DEVICE_VENDOR>().c_str()<<std::endl; }else{ for(deviceIterator = device.begin() ; deviceIterator != device.end() ; deviceIterator++){ std::cout<<"\nDevice Name: "<<(*deviceIterator).getInfo<CL_DEVICE_NAME>().c_str()<<std::endl; std::cout<<"Device Vendor: "<<(*deviceIterator).getInfo<CL_DEVICE_VENDOR>().c_str()<<std::endl; } } */ streamsdk::SDKFile kernelFile; 
if(!kernelFile.open("matMulKernel.txt")){ std::cout<<"Could not open the kernel file."<<std::endl; std::getchar(); std::exit(-1); } std::cout<<"Kernel file opened."<<std::endl; cl::Program::Sources kernelSrc(1,std::make_pair(kernelFile.source().data(), kernelFile.source().size())); cl::Program program(context,kernelSrc,&err); check(err,"Could not create program","program created successfully"); err = program.build(device); check(err,"Program build failed.","Program build Successful."); Kernel kernel(program,"matrixMultiplication",&err); if(err != CL_SUCCESS){ std::cout<<"Could not create a kernel object.\n"; std::getchar(); std::exit(-4); } std::cout<<"Kernel object created successfully.\n"; CommandQueue queue(context,*(device.begin()),0,&err); if(err != CL_SUCCESS){ std::cout<<"Could not create the command queue.\n"; std::getchar(); std::exit(-4); } std::cout<<"Command queue created successfully.\n"; cl_int * inputA = (int *) malloc (9 * sizeof(cl_int)); toolkit->fillRandom<cl_int>(inputA,3,3,1,1); toolkit->printArray<cl_int>("Array 1",inputA,3,3); cl_int * inputB = (int *) malloc (9 * sizeof(cl_int)); toolkit->fillRandom<cl_int>(inputB,3,3,1,1); toolkit->printArray<cl_int>("Array 2:",inputA,3,3); cl_int * op = (int *) malloc( 9 * sizeof(cl_int)); Buffer input1(context,CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,9*sizeof(cl_int),(void *)inputA,&err); check(err,"Could not create a buffer 1","Buffer 1 created Successfully."); Buffer input2(context,CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,9*sizeof(cl_int),(void *)inputB,&err); check(err,"Could not create a buffer 2","Buffer 2 created Successfully."); Buffer output(context,CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,9*sizeof(cl_int),(void *)op,&err); check(err,"Could not create a buffer 3 ","Buffer 3 created Successfully."); err = kernel.setArg<Buffer>(0,input1); check(err,"Could not set arg 1.","Arg 1 set successfully."); err = kernel.setArg<Buffer>(1,input2); check(err,"Could not set arg 2.","Arg 2 set successfully."); err = 
kernel.setArg<Buffer>(2,output); check(err,"Could not set arg 3.","Arg 3 set successfully."); err = queue.enqueueNDRangeKernel(kernel,NullRange,NDRange(9),NDRange(1),NULL,NULL); if(err != CL_SUCCESS){ std::cout<<"Could not run the kernel : "<<err<<std::endl; std::getchar(); std::exit(-5); } err = queue.enqueueReadBuffer(output,CL_TRUE,0,9*sizeof(int),(void *)op,NULL,NULL); toolkit->printArray<cl_int>("Output Array: ",op,3,3); std::cout<<"Kernel executed successfully.\n"; std::getchar(); }

        • CL_INVALID_KERNEL_ARGS error
          nou

          what is your kernel?

            • CL_INVALID_KERNEL_ARGS error
              ankurdh

              @nou: I got it. I made some changes to the code, and it worked.

              Another problem: when I use the above code to multiply large matrices, the CPU performs better than the GPU. My project is to demonstrate how performance is improved by the GPU, and I'm getting contradictory results. Please help. I'm attaching the program I ran, along with the kernel.

              #include<cstdio> #include<cstdlib> #include<iostream> #define __NO_STD_STRING #define __NO_STD_VECTOR #include<SDKUtil/SDKCommon.hpp> #include<SDKUtil/SDKFile.hpp> #include<CL/cl.hpp> using namespace cl; const int n = 200; inline void check(cl_int err, char * errMsg,char * successMsg){ if(err != CL_SUCCESS){ std::cout<<errMsg<<": "<<err; std::getchar(); std::exit(-1); }else std::cout<<successMsg<<std::endl; } int main(){ streamsdk::SDKCommon * toolkit = new streamsdk::SDKCommon(); int timer = toolkit->createTimer(); toolkit->resetTimer(timer); cl::vector<cl::Platform> platform; cl_int err = cl::Platform::get(&platform); check(err,"Platforms could not be queried","Platforms queried successfully"); cl_context_properties cps [3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(*platform.begin())(), 0 }; cl::Context context(CL_DEVICE_TYPE_GPU,cps,NULL,NULL,&err); check(err,"Context could not be created","Context created successfully\n"); cl::vector<cl::Device> device; device = context.getInfo<CL_CONTEXT_DEVICES>(); streamsdk::SDKFile kernelFile; if(!kernelFile.open("matMulKernel.txt")){ std::cout<<"Could not open the kernel file."<<std::endl; std::getchar(); std::exit(-1); } std::cout<<"Kernel file opened."<<std::endl; cl::Program::Sources kernelSrc(1,std::make_pair(kernelFile.source().data(), kernelFile.source().size())); cl::Program program(context,kernelSrc,&err); check(err,"Could not create program","program created successfully"); err = program.build(device); check(err,"Program build failed.","Program build Successful."); /* 5. Create a kernel object. */ Kernel kernel(program,"matrixMultiplication",&err); if(err != CL_SUCCESS){ std::cout<<"Could not create a kernel object.\n"; std::getchar(); std::exit(-4); } std::cout<<"Kernel object created successfully.\n"; /* 6. Create a queue for the kernels to run. 
*/ CommandQueue queue(context,*(device.begin()),0,&err); if(err != CL_SUCCESS){ std::cout<<"Could not create the command queue.\n"; std::getchar(); std::exit(-4); } std::cout<<"Command queue created successfully.\n"; cl_int * inputA = (int *) malloc (n*n * sizeof(cl_int)); toolkit->fillRandom<cl_int>(inputA,n,n,1,1,0); //toolkit->printArray<cl_int>("Array 1",inputA,n,n); cl_int * inputB = (int *) malloc (n*n * sizeof(cl_int)); toolkit->fillRandom<cl_int>(inputB,n,n,1,1,0); //toolkit->printArray<cl_int>("Array 2:",inputB,n,n); cl_int * op = (int *) malloc( n*n * sizeof(cl_int)); Buffer input1(context,CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,n*n*sizeof(cl_int),(void *)inputA,&err); check(err,"Could not create a buffer 1","Buffer 1 created Successfully."); Buffer input2(context,CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,n*n*sizeof(cl_int),(void *)inputB,&err); check(err,"Could not create a buffer 2","Buffer 2 created Successfully."); Buffer output(context,CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,n*n*sizeof(cl_int),(void *)op,&err); check(err,"Could not create a buffer 3 ","Buffer 3 created Successfully."); Buffer arrWidth(context,CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,sizeof(n),(void *)&n,&err); check(err,"Could not create a buffer 4. ", "Buffer 4 created successfully."); err = queue.enqueueWriteBuffer(arrWidth,CL_TRUE,NULL,sizeof(n),(void *)&n,NULL,NULL); check(err,"Array width could not be written into the buffer.","Array width written successfully."); err = kernel.setArg(0,sizeof(input1),(void *)&input1); check(err,"Could not set arg 1.","Arg 1 set successfully."); err = kernel.setArg(1,sizeof(input2),(void *)&input2); check(err,"Could not set arg 2.","Arg 2 set successfully."); err = kernel.setArg(2,sizeof(output),(void *)&output); check(err,"Could not set arg 3.","Arg 3 set successfully."); err = kernel.setArg(3,sizeof(&arrWidth),(void *)&arrWidth); check(err,"Could not set arg 4.","Arg 4 set successfully."); /* 7. Run the kernel. 
*/ toolkit->startTimer(timer); err = queue.enqueueNDRangeKernel(kernel,NullRange,NDRange(n,n),NDRange(1,1),NULL,NULL); if(err != CL_SUCCESS){ std::cout<<"Could not run the kernel : "<<err<<std::endl; std::getchar(); std::exit(-5); } err = queue.enqueueReadBuffer(output,CL_TRUE,0,n*n*sizeof(int),(void *)op,NULL,NULL); //toolkit->printArray<cl_int>("Output Array: ",op,n,n); toolkit->stopTimer(timer); std::cout<<"Kernel execution time: "<<toolkit->readTimer(timer)<<std::endl; std::cout<<"Kernel executed successfully.\n"; //the following code is for the CPU. cl_int cpuArrA[n][n]; cl_int cpuArrB[n][n]; cl_int cpuOpArr[n][n]; for(int i = 0 ; i < n ; i++){ for(int j = 0 ; j < n; j ++){ cpuArrA[i][j] = 1; cpuArrB[i][j] = 1; cpuOpArr[i][i] = 0; } } std::cout<<"Running on the cpu:"<<std::endl; /*for(int i = 0 ; i < n ; i++){ for(int j = 0 ; j < n; j ++){ std::cout<<" "<<cpuArrA[i][j]; } std::cout<<std::endl; } std::cout<<"\n"; for(int i = 0 ; i < n ; i++){ for(int j = 0 ; j < n; j ++){ std::cout<<" "<<cpuArrB[i][j]; } std::cout<<std::endl; }*/ toolkit->resetTimer(timer); toolkit->startTimer(timer); for(int i = 0 ; i < n ; i ++){ for(int j = 0 ; j < n ; j ++){ cpuOpArr[i][j] = 0; for(int k = 0 ; k < n ; k ++){ cpuOpArr[i][j] += cpuArrA[i][k] * cpuArrB[k][j]; } } } toolkit->stopTimer(timer); /*for(int i = 0 ; i < n ; i++){ for(int j = 0 ; j < n; j ++){ std::cout<<" "<<cpuOpArr[i][j]; } std::cout<<std::endl; }*/ std::cout<<"Time taken on the cpu: "<<toolkit->readTimer(timer)<<std::endl; std::getchar(); } /* kernel file> > > > _kernel void matrixMultiplication(__global int * array1, __global int * array2, __global int * outputArr, __global int * arrayWidth){ int globalX = get_global_id(0); int globalY = get_global_id(1); int width = *arrayWidth; int sum = 0; //int j = globalY; for(int i = 0; i < width ; i ++) sum += array1[globalY*width+i]*array2[i*width+globalX]; outputArr[globalY*width+globalX] = sum; } */

                • CL_INVALID_KERNEL_ARGS error
                  ankurdh

                  Here is the output for n = 250:

                   

                   

                   

                  Platforms queried successfully

                  Context created successfully

                   

                  Kernel file opened.

                  program created successfully

                  Program build Successful.

                  Kernel object created successfully.

                  Command queue created successfully.

                  Buffer 1 created Successfully.

                  Buffer 2 created Successfully.

                  Buffer 3 created Successfully.

                  Buffer 4 created successfully.

                  Array width written successfully.

                  Arg 1 set successfully.

                  Arg 2 set successfully.

                  Arg 3 set successfully.

                  Arg 4 set successfully.

                  Kernel execution time: 2.96607

                  Kernel executed successfully.

                  Running on the cpu:

                  Time taken on the cpu: 0.191811