8 Replies Latest reply on Dec 15, 2010 2:22 AM by jski

    Need SIMPLE OpenCL example

    jski

      I downloaded the OpenCL SDK but failed to get a "samples" directory/folder for whatever reason(?).

      I'm trying to get  the SDK to work using Dev-C++ (MinGW).  I've used the reimp.exe utility to build libopencl.a and now want to test the environment.

      Can someone post a simple example (in C preferably) so that I may test it?

      ---John

        • Need SIMPLE OpenCL example
          jski

          #include <fcntl.h>
          #include <stdio.h>
          #include <stdlib.h>
          #include <string.h>
          #include <math.h>
          #include <unistd.h>
          #include <sys/types.h>
          #include <sys/stat.h>
          #include <CL/opencl.h>
          #include <CL/cl.h>

          ////////////////////////////////////////////////////////////////////////////////

          // Use a static data size for simplicity
          //
          #define DATA_SIZE (1024)

          ////////////////////////////////////////////////////////////////////////////////

          // OpenCL C source for the "square" kernel, compiled at run time.
          //
          // Each work-item squares one element: output[i] = input[i] * input[i].
          // The "i < count" guard lets the host launch more work-items than there
          // are elements without writing past the end of the buffers.
          //
          const char *KernelSource =
              "\n"
              "__kernel void square(\n"
              "    __global float* input,\n"
              "    __global float* output,\n"
              "    const unsigned int count)\n"
              "{\n"
              "    unsigned int i = get_global_id(0);\n"
              "    if (i < count)\n"
              "        output[i] = input[i] * input[i];\n"
              "}\n"
              "\n";

          ////////////////////////////////////////////////////////////////////////////////

          int main(int argc, char** argv)
          {
          int err; // error code returned from api calls

          float data[DATA_SIZE]; // original data set given to device
          float results[DATA_SIZE]; // results returned from device
          unsigned int correct; // number of correct results returned

          size_t global; // global domain size for our calculation
          size_t local; // local domain size for our calculation

          cl_uint num_devices;
          cl_device_id devices[1];

          cl_device_id device_id; // compute device id
          cl_context context; // compute context
          cl_command_queue commands; // compute command queue
          cl_program program; // compute program
          cl_kernel kernel; // compute kernel

          cl_mem input; // device memory used for the input array
          cl_mem output; // device memory used for the output array

          // Fill our data set with random float values
          //
          int i = 0;
          unsigned int count = DATA_SIZE;
          for(i = 0; i < count; i++)
          data
          = rand() / (float)RAND_MAX;

          // Connect to a compute device
          //
          int gpu = 0;
          //err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
          err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_CPU, 1, &devices[0], &num_devices);
          if (err != CL_SUCCESS)
          {
          printf("Error: Failed to create a device group: %d\n", err);
          return EXIT_FAILURE;
          }

          // Create a compute context
          //
          context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &err);
          if (!context)
          {
          printf("Error: Failed to create a compute context!\n");
          return EXIT_FAILURE;
          }

          // Create a command commands
          //
          commands = clCreateCommandQueue(context, device_id, 0, &err);
          if (!commands)
          {
          printf("Error: Failed to create a command commands!\n");
          return EXIT_FAILURE;
          }

          // Create the compute program from the source buffer
          //
          program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err);
          if (!program)
          {
          printf("Error: Failed to create compute program!\n");
          return EXIT_FAILURE;
          }

          // Build the program executable
          //
          err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
          if (err != CL_SUCCESS)
          {
          size_t len;
          char buffer[2048];

          printf("Error: Failed to build program executable!\n");
          clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
          printf("%s\n", buffer);
          exit(1);
          }

          // Create the compute kernel in the program we wish to run
          //
          kernel = clCreateKernel(program, "square", &err);
          if (!kernel || err != CL_SUCCESS)
          {
          printf("Error: Failed to create compute kernel!\n");
          exit(1);
          }

          // Create the input and output arrays in device memory for our calculation
          //
          input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * count, NULL, NULL);
          output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL);
          if (!input || !output)
          {
          printf("Error: Failed to allocate device memory!\n");
          exit(1);
          }

          // Write our data set into the input array in device memory
          //
          err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL);
          if (err != CL_SUCCESS)
          {
          printf("Error: Failed to write to source array!\n");
          exit(1);
          }

          // Set the arguments to our compute kernel
          //
          err = 0;
          err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
          err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
          err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &count);
          if (err != CL_SUCCESS)
          {
          printf("Error: Failed to set kernel arguments! %d\n", err);
          exit(1);
          }

          // Get the maximum work group size for executing the kernel on the device
          //
          err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
          if (err != CL_SUCCESS)
          {
          printf("Error: Failed to retrieve kernel work group info! %d\n", err);
          exit(1);
          }

          // Execute the kernel over the entire range of our 1d input data set
          // using the maximum number of work group items for this device
          //
          global = count;
          err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
          if (err)
          {
          printf("Error: Failed to execute kernel!\n");
          return EXIT_FAILURE;
          }

          // Wait for the command commands to get serviced before reading back results
          //
          clFinish(commands);

          // Read back the results from the device to verify the output
          //
          err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL );
          if (err != CL_SUCCESS)
          {
          printf("Error: Failed to read output array! %d\n", err);
          exit(1);
          }

          // Validate our results
          //
          correct = 0;
          for(i = 0; i < count; i++)
          {
          if(results == data * data)
          correct++;
          }

          // Print a brief summary detailing the results
          //
          printf("Computed '%d/%d' correct values!\n", correct, count);

          // Shutdown and cleanup
          //
          clReleaseMemObject(input);
          clReleaseMemObject(output);
          clReleaseProgram(program);
          clReleaseKernel(kernel);
          clReleaseCommandQueue(commands);
          clReleaseContext(context);

          return 0;
          }

            • Need SIMPLE OpenCL example
              jski

              I tried this but got clGetDeviceIDs(...) returning -32 ?

              I'm trying to run this on an AMD CPU (the GPU is a pukey nVidia).

              Does -32 tell me something specific?

                • Need SIMPLE OpenCL example
                  eklund.n

                  -32 = INVALID PLATFORM. You need to get a platform before clGetDeviceIDs, and add that as the first argument to clGetDeviceIDs. If it's NULL its behavior is implementation-defined.

                  The kernel is also wrong, as are a few other places in the code: the [ i ] is interpreted as BB-code and removed. Use the "Attach Code" button.

                  output[i] = input[i]*input[i]; data[i] = rand()/(float)RAND_MAX;

                  • Need SIMPLE OpenCL example
                    himanshu.gautam

                    jski,

                    Did you do express installation or custom installation?

                    Anyhow you can find a separate link for installing samples package at

                    C:\ATI\SUPPORT\streamsdk_2-2\Packages\Apps.

                    I think eklund.n explained the problem in your code well. You can refer to cl.h file in

                    C:\Program Files (x86)\ATI Stream\include\CL to find the meaning of any error code you get from any openCL API.

                      • Need SIMPLE OpenCL example
                        jski

                        I did the express install.

                          • Need SIMPLE OpenCL example
                            nou

                            Passing a NULL pointer to clCreateContext is deprecated, and clGetPlatformIDs should be declared in cl.h: http://www.khronos.org/registry/cl/sdk/1.1/docs/man/xhtml/clGetPlatformIDs.html

                              • Need SIMPLE OpenCL example
                                jski

                                Got it:

                                #include <fcntl.h>
                                #include <stdio.h>
                                #include <stdlib.h>
                                #include <string.h>
                                #include <math.h>
                                #include <unistd.h>
                                #include <sys/types.h>
                                #include <sys/stat.h>
                                #include <CL/opencl.h>
                                #include <CL/cl.h>

                                ////////////////////////////////////////////////////////////////////////////////

                                // Use a static data size for simplicity
                                //
                                #define DATA_SIZE (1024)

                                ////////////////////////////////////////////////////////////////////////////////

                                // OpenCL C source for the "square" kernel, compiled at run time.
                                //
                                // Each work-item squares one element: output[i] = input[i] * input[i].
                                // The "i < count" guard lets the host launch more work-items than there
                                // are elements without writing past the end of the buffers.
                                //
                                const char *KernelSource =
                                    "\n"
                                    "__kernel void square(\n"
                                    "    __global float* input,\n"
                                    "    __global float* output,\n"
                                    "    const unsigned int count)\n"
                                    "{\n"
                                    "    unsigned int i = get_global_id(0);\n"
                                    "    if (i < count)\n"
                                    "        output[i] = input[i] * input[i];\n"
                                    "}\n"
                                    "\n";

                                ////////////////////////////////////////////////////////////////////////////////

                                int main(int argc, char* argv[])
                                {
                                cl_int err; // error code returned from api calls

                                float data[DATA_SIZE]; // original data set given to device
                                float results[DATA_SIZE]; // results returned from device
                                unsigned int correct; // number of correct results returned

                                size_t global; // global domain size for our calculation
                                size_t local; // local domain size for our calculation

                                cl_uint num_devices;
                                cl_device_id devices[1];

                                cl_context context; // compute context
                                cl_command_queue commands; // compute command queue
                                cl_program program; // compute program
                                cl_kernel kernel; // compute kernel

                                cl_mem input; // device memory used for the input array
                                cl_mem output; // device memory used for the output array

                                // Fill our data set with random float values
                                //
                                int i = 0;
                                unsigned int count = DATA_SIZE;
                                for(i = 0; i < count; i++)
                                data
                                = rand() / (float)RAND_MAX;

                                // Connect to a compute device
                                //
                                int gpu = 0;

                                cl_platform_id platforms;
                                cl_uint num_platforms;

                                err = clGetPlatformIDs(1, &platforms, &num_platforms);
                                if (err != CL_SUCCESS)
                                {
                                fprintf(stderr,"Error: Failed to create platform id: %d\n", err);
                                //return EXIT_FAILURE;
                                }

                                err = clGetDeviceIDs(platforms, CL_DEVICE_TYPE_CPU, 1, &devices[0], &num_devices);
                                if (err != CL_SUCCESS)
                                {
                                fprintf(stderr,"Error: Failed to create a device group: %d\n", err);
                                //return EXIT_FAILURE;
                                }

                                // Create a compute context
                                //
                                context = clCreateContext(NULL, 1, devices, NULL, NULL, &err);
                                if (!context)
                                {
                                printf("Error: Failed to create a compute context: %d\n", err);
                                return EXIT_FAILURE;
                                }

                                // Create a command commands
                                //
                                commands = clCreateCommandQueue(context, devices[0], 0, &err);
                                if (!commands)
                                {
                                printf("Error: Failed to create a command commands: %d\n", err);
                                return EXIT_FAILURE;
                                }

                                // Create the compute program from the source buffer
                                //
                                program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err);
                                if (!program)
                                {
                                printf("Error: Failed to create compute program: %d\n", err);
                                return EXIT_FAILURE;
                                }

                                // Build the program executable
                                //
                                err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
                                if (err != CL_SUCCESS)
                                {
                                size_t len;
                                char buffer[2048];

                                printf("Error: Failed to build program executable: %d\n", err);
                                clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
                                printf("%s\n", buffer);
                                exit(1);
                                }

                                // Create the compute kernel in the program we wish to run
                                //
                                kernel = clCreateKernel(program, "square", &err);
                                if (!kernel || err != CL_SUCCESS)
                                {
                                printf("Error: Failed to create compute kernel: %d\n", err);
                                exit(1);
                                }

                                // Create the input and output arrays in device memory for our calculation
                                //
                                input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * count, NULL, NULL);
                                output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL);
                                if (!input || !output)
                                {
                                printf("Error: Failed to allocate device memory!\n");
                                exit(1);
                                }

                                // Write our data set into the input array in device memory
                                //
                                err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL);
                                if (err != CL_SUCCESS)
                                {
                                printf("Error: Failed to write to source array!\n");
                                exit(1);
                                }

                                // Set the arguments to our compute kernel
                                //
                                err = 0;
                                err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
                                err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
                                err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &count);
                                if (err != CL_SUCCESS)
                                {
                                printf("Error: Failed to set kernel arguments! %d\n", err);
                                exit(1);
                                }

                                // Get the maximum work group size for executing the kernel on the device
                                //
                                err = clGetKernelWorkGroupInfo(kernel, devices[0], CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
                                if (err != CL_SUCCESS)
                                {
                                printf("Error: Failed to retrieve kernel work group info! %d\n", err);
                                exit(1);
                                }

                                // Execute the kernel over the entire range of our 1d input data set
                                // using the maximum number of work group items for this device
                                //
                                global = count;
                                err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
                                if (err)
                                {
                                printf("Error: Failed to execute kernel!\n");
                                return EXIT_FAILURE;
                                }

                                // Wait for the command commands to get serviced before reading back results
                                //
                                clFinish(commands);

                                // Read back the results from the device to verify the output
                                //
                                err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL );
                                if (err != CL_SUCCESS)
                                {
                                printf("Error: Failed to read output array! %d\n", err);
                                exit(1);
                                }

                                // Validate our results
                                //
                                correct = 0;
                                for(i = 0; i < count; i++)
                                { float temp;

                                temp = data * data;
                                //if ( i < 12 ) fprintf( stderr, "results[%d]: %g & temp: %g\n", i, results, temp);
                                if(results
                                == temp)
                                correct++;
                                }

                                // Print a brief summary detailing the results
                                //
                                fprintf(stderr,"Computed '%d/%d' correct values!\n", correct, count);

                                // Shutdown and cleanup
                                //
                                clReleaseMemObject(input);
                                clReleaseMemObject(output);
                                clReleaseProgram(program);
                                clReleaseKernel(kernel);
                                clReleaseCommandQueue(commands);
                                clReleaseContext(context);

                                return 0;
                                }