I'm not able to invoke the kernel from this code. Can anybody find my mistake? The code produces no errors. Thanks in advance.
//Kernel #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable #pragma OPENCL EXTENSION cl_amd_printf:enable __kernel void mkernel(__global float * values,__global int * row_idx, __global int * col_idx, __global float * vec_out,const int num_nonzeros) { printf("Here "); const int __idx = get_global_id(0); printf("Here "); if (__idx >= num_nonzeros) return; printf("Here "); int i = __idx; vec_out=values+1; } //Host #include "stdafx.h" #include<stdio.h> #include<stdlib.h> #include<malloc.h> #include<CL/cl.h> int main(){ int col = 8; int size=col*sizeof(int); float values[] = {2,5,1,8,9,5,3,4};// int row_idx[] = {0,1,2,2,1,2,3,2};// int col_idx[] = {1,0,0,1,3,2,1,3};// //float in[] = {1,1,1,1}; float out[] = {0,0,0,0,0,0,0,0}; int n_nz = 8; /* 0 5 1 0 2 0 8 3 0 0 5 0 0 9 4 0 */ cl_int error = 0; cl_uint numPlatforms; cl_platform_id* clSelectedPlatformID = NULL; clGetPlatformIDs(0, NULL, &numPlatforms); clSelectedPlatformID = (cl_platform_id*)malloc(sizeof(cl_platform_id)*numPlatforms); error = clGetPlatformIDs(numPlatforms, clSelectedPlatformID, NULL); if(error != CL_SUCCESS) return 0; //Device info cl_uint ciDeviceCount; cl_device_id* clDevices = NULL; error = clGetDeviceIDs(clSelectedPlatformID[0], CL_DEVICE_TYPE_GPU, 0, NULL, &ciDeviceCount); if(error != CL_SUCCESS) return 0; clDevices = (cl_device_id*) malloc(sizeof(cl_device_id) * ciDeviceCount); error = clGetDeviceIDs(clSelectedPlatformID[0], CL_DEVICE_TYPE_GPU, ciDeviceCount, clDevices, &ciDeviceCount); if(error != CL_SUCCESS) return 0; printf("Available Devices: %d.\n",ciDeviceCount); cl_uint device_value = 0; cl_context_properties props[3]; props[0] = (cl_context_properties)CL_CONTEXT_PLATFORM; // indicates that next element is platform props[1] = (cl_context_properties)clSelectedPlatformID[0]; // platform is of type cl_platform_id props[2] = (cl_context_properties)0; // last element must be 0 cl_context GPUContext = clCreateContextFromType(props, CL_DEVICE_TYPE_GPU,NULL, NULL, &error); size_t 
ParmDataBytes; clGetContextInfo(GPUContext, CL_CONTEXT_DEVICES, 0, NULL, &ParmDataBytes); cl_device_id* GPUDevices = (cl_device_id*)malloc(ParmDataBytes); clGetContextInfo(GPUContext, CL_CONTEXT_DEVICES, ParmDataBytes, GPUDevices, NULL); cl_command_queue GPUCommandQueue = clCreateCommandQueue(GPUContext, GPUDevices[0], 0, NULL); cl_mem GPUVal = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float)*8, values, NULL); cl_mem GPUrow = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int)*8, row_idx, NULL); cl_mem GPUcol = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int)*8, col_idx, NULL); //cl_mem GPUin = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float)*4, values, NULL); cl_mem GPUOut= clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY, sizeof(float)*8, NULL, NULL); cl_mem GPUnz = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int), &n_nz, NULL); //cl_mem GPUOutputVector = clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY, sizeof(float) * 4, NULL, NULL); FILE *fp; char fileName[] = "spr_ker.cl"; char *source_str; size_t source_size; fp = fopen(fileName, "r"); if (!fp) { fprintf(stderr, "Failed to load kernel.\n"); exit(1); } source_str = (char*)malloc(1000); source_size = fread( source_str, 1, 1000, fp); fclose( fp ); puts(source_str); cl_program OpenCLProgram = clCreateProgramWithSource(GPUContext, 1,(const char **)&source_str,(const size_t *)& source_size, &error); error = clBuildProgram(OpenCLProgram,0, NULL, NULL, NULL, NULL); cl_kernel SparseM = clCreateKernel(OpenCLProgram, "mkernel", NULL); clSetKernelArg(SparseM, 0, sizeof(cl_mem),(void*)&GPUVal); clSetKernelArg(SparseM, 1, sizeof(cl_mem),(void*)&GPUrow); clSetKernelArg(SparseM, 2, sizeof(cl_mem),(void*)&GPUcol); //clSetKernelArg(SparseM, 3, sizeof(cl_mem),(void*)&GPUin); clSetKernelArg(SparseM, 3, sizeof(cl_mem),(void*)&GPUOut); clSetKernelArg(SparseM, 4, 
sizeof(cl_mem),(void*)&GPUnz); size_t WorkSize[1] = {8}; size_t lWorkSize[1]={1}; cl_int temp = clEnqueueNDRangeKernel(GPUCommandQueue, SparseM, 1, NULL, WorkSize, lWorkSize, 0, NULL, NULL); float HostOutputVector[4]; clEnqueueReadBuffer(GPUCommandQueue, GPUOut, CL_TRUE, 0, 4 * sizeof(float), HostOutputVector, 0, NULL, NULL); printf("Status----->%d\n\n",temp); for(int i=0;i<4;i++){ printf("%f",HostOutputVector); } free(clSelectedPlatformID); free(clDevices); free(GPUDevices); clReleaseKernel(SparseM); clReleaseProgram(OpenCLProgram); clReleaseCommandQueue(GPUCommandQueue); clReleaseContext(GPUContext); clReleaseMemObject(GPUrow); clReleaseMemObject(GPUVal); clReleaseMemObject(GPUOut); clReleaseMemObject(GPUcol); //clReleaseMemObject(GPUin); clReleaseMemObject(GPUnz); for(int i=0;i<4;i++){ printf("%f",HostOutputVector); } return 0; }
jaidotsh,
Does this problem still exist?
I think you are able to track down your problem by properly collecting status from all OpenCL functions and checking them to pinpoint the issue.
Himanshu,
I figured out the problem; it's fairly simple. It was my fault: I allocated only 4 * sizeof(float) for the read buffer, when I should have allocated 8 * sizeof(float).