Archives Discussions

jaidotsh · ‎06-30-2011

I'm not able to invoke the kernel from this code, can anybody find my mistake?? The code has no errors. Thanks in advance

I'm not able to invoke the kernel from this code, can anybody find my mistake?? The code has no errors. Thanks in advance

//Kernel #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable #pragma OPENCL EXTENSION cl_amd_printf:enable __kernel void mkernel(__global float * values,__global int * row_idx, __global int * col_idx, __global float * vec_out,const int num_nonzeros) { printf("Here "); const int __idx = get_global_id(0); printf("Here "); if (__idx >= num_nonzeros) return; printf("Here "); int i = __idx; vec_out=values+1; } //Host #include "stdafx.h" #include<stdio.h> #include<stdlib.h> #include<malloc.h> #include<CL/cl.h> int main(){ int col = 8; int size=col*sizeof(int); float values[] = {2,5,1,8,9,5,3,4};// int row_idx[] = {0,1,2,2,1,2,3,2};// int col_idx[] = {1,0,0,1,3,2,1,3};// //float in[] = {1,1,1,1}; float out[] = {0,0,0,0,0,0,0,0}; int n_nz = 8; /* 0 5 1 0 2 0 8 3 0 0 5 0 0 9 4 0 */ cl_int error = 0; cl_uint numPlatforms; cl_platform_id* clSelectedPlatformID = NULL; clGetPlatformIDs(0, NULL, &numPlatforms); clSelectedPlatformID = (cl_platform_id*)malloc(sizeof(cl_platform_id)*numPlatforms); error = clGetPlatformIDs(numPlatforms, clSelectedPlatformID, NULL); if(error != CL_SUCCESS) return 0; //Device info cl_uint ciDeviceCount; cl_device_id* clDevices = NULL; error = clGetDeviceIDs(clSelectedPlatformID[0], CL_DEVICE_TYPE_GPU, 0, NULL, &ciDeviceCount); if(error != CL_SUCCESS) return 0; clDevices = (cl_device_id*) malloc(sizeof(cl_device_id) * ciDeviceCount); error = clGetDeviceIDs(clSelectedPlatformID[0], CL_DEVICE_TYPE_GPU, ciDeviceCount, clDevices, &ciDeviceCount); if(error != CL_SUCCESS) return 0; printf("Available Devices: %d.\n",ciDeviceCount); cl_uint device_value = 0; cl_context_properties props[3]; props[0] = (cl_context_properties)CL_CONTEXT_PLATFORM; // indicates that next element is platform props[1] = (cl_context_properties)clSelectedPlatformID[0]; // platform is of type cl_platform_id props[2] = (cl_context_properties)0; // last element must be 0 cl_context GPUContext = clCreateContextFromType(props, CL_DEVICE_TYPE_GPU,NULL, NULL, &error); size_t 
ParmDataBytes; clGetContextInfo(GPUContext, CL_CONTEXT_DEVICES, 0, NULL, &ParmDataBytes); cl_device_id* GPUDevices = (cl_device_id*)malloc(ParmDataBytes); clGetContextInfo(GPUContext, CL_CONTEXT_DEVICES, ParmDataBytes, GPUDevices, NULL); cl_command_queue GPUCommandQueue = clCreateCommandQueue(GPUContext, GPUDevices[0], 0, NULL); cl_mem GPUVal = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float)*8, values, NULL); cl_mem GPUrow = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int)*8, row_idx, NULL); cl_mem GPUcol = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int)*8, col_idx, NULL); //cl_mem GPUin = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float)*4, values, NULL); cl_mem GPUOut= clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY, sizeof(float)*8, NULL, NULL); cl_mem GPUnz = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int), &n_nz, NULL); //cl_mem GPUOutputVector = clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY, sizeof(float) * 4, NULL, NULL); FILE *fp; char fileName[] = "spr_ker.cl"; char *source_str; size_t source_size; fp = fopen(fileName, "r"); if (!fp) { fprintf(stderr, "Failed to load kernel.\n"); exit(1); } source_str = (char*)malloc(1000); source_size = fread( source_str, 1, 1000, fp); fclose( fp ); puts(source_str); cl_program OpenCLProgram = clCreateProgramWithSource(GPUContext, 1,(const char **)&source_str,(const size_t *)& source_size, &error); error = clBuildProgram(OpenCLProgram,0, NULL, NULL, NULL, NULL); cl_kernel SparseM = clCreateKernel(OpenCLProgram, "mkernel", NULL); clSetKernelArg(SparseM, 0, sizeof(cl_mem),(void*)&GPUVal); clSetKernelArg(SparseM, 1, sizeof(cl_mem),(void*)&GPUrow); clSetKernelArg(SparseM, 2, sizeof(cl_mem),(void*)&GPUcol); //clSetKernelArg(SparseM, 3, sizeof(cl_mem),(void*)&GPUin); clSetKernelArg(SparseM, 3, sizeof(cl_mem),(void*)&GPUOut); clSetKernelArg(SparseM, 4, 
sizeof(cl_mem),(void*)&GPUnz); size_t WorkSize[1] = {8}; size_t lWorkSize[1]={1}; cl_int temp = clEnqueueNDRangeKernel(GPUCommandQueue, SparseM, 1, NULL, WorkSize, lWorkSize, 0, NULL, NULL); float HostOutputVector[4]; clEnqueueReadBuffer(GPUCommandQueue, GPUOut, CL_TRUE, 0, 4 * sizeof(float), HostOutputVector, 0, NULL, NULL); printf("Status----->%d\n\n",temp); for(int i=0;i<4;i++){ printf("%f",HostOutputVector); } free(clSelectedPlatformID); free(clDevices); free(GPUDevices); clReleaseKernel(SparseM); clReleaseProgram(OpenCLProgram); clReleaseCommandQueue(GPUCommandQueue); clReleaseContext(GPUContext); clReleaseMemObject(GPUrow); clReleaseMemObject(GPUVal); clReleaseMemObject(GPUOut); clReleaseMemObject(GPUcol); //clReleaseMemObject(GPUin); clReleaseMemObject(GPUnz); for(int i=0;i<4;i++){ printf("%f",HostOutputVector); } return 0; }

himanshu_gautam · ‎07-20-2011

jaidotsh,

Does this problem still exist?

I think you should be able to track down your problem by collecting the status returned by every OpenCL function and checking each one to pinpoint the issue.

jaidotsh · ‎07-20-2011

Himanshu,

I figured out the problem; it's fairly simple. It was my fault: I had allocated only 4 * sizeof(float) for the read buffer, when I should have allocated 8 * sizeof(float).

Archives Discussions

Help with code- Sparse Matrix COO