12 Replies Latest reply on Feb 1, 2011 2:15 PM by ibird

    clEnqueueCopyBuffer event_wait_list

    ibird
      memory leaking, working ?

       

      Has been implemented event_wait_list on clEnqueueCopyBuffer ?

       

      I has problem with memory leaking, crashes (on GPU), and in general do not waiting, when i set num_event_wait_list != 0 with a valid event list

       

        • clEnqueueCopyBuffer event_wait_list
          himanshu.gautam

          Its better to specify your system configuration and provide more details of what you were doing.

          Providing a testcase can enable us to quickly identify the issue and fix it.

            • clEnqueueCopyBuffer event_wait_list
              ibird

               

              Originally posted by: himanshu.gautam Its better to specify your system configuration and provide more details of what you were doing.

               

              Providing a testcase can enable us to quickly identify the issue and fix it.

               

               

              Ok i will do a test case

              • clEnqueueCopyBuffer event_wait_list
                ibird

                Ok this is a test case

                 

                I am on Ubuntu 9.04 with SDK 2.3 Catalyst 10.12 and an ATI 5870

                 

                Now this is a test case where using clEnqueueCopyBuffer generate memory leak, (event wait list must be uncommented).

                 

                Another question, why commenting one of the clWaitForEvents(NSTEP,(cl_event *)&sync_step), without the use of event wait list i has memory leak ?

                i has also leaking with clEnqueueWaitForEvents(commandQueue[0],1,&sync_step); so i start to think , there is something i do not understand on events and waiting

                 

                #define RESULT_OK 1 #define RESULT_FAIL -1 #include <CL/cl.h> #include <iostream> #include <string.h> #define ATI_PLATFORM 0 #define ATI_DEVICE_1 0 #define ATI_DEVICE_2 1 #define BUFFER_SIZE 512 #define NSTEP 128 char kernel1[] = "__kernel void addone(__global float *buf)\ {\ buf += 1;\ }"; char kernel2[] = "__kernel void addtwo(__global float *buf)\ {\ buf += 2;\ }"; int main() { int id_dev = 0; cl_int status = 0; cl_uint nplat = 0; /* Get number of platforms */\ status = clGetPlatformIDs(0,NULL,&nplat);\ if(status != CL_SUCCESS)\ { std::cerr << "Error: clGetPlatformIDs error \n"; return -1; } /* Get all platforms ID */ cl_platform_id * platformst = new cl_platform_id [nplat]; status = clGetPlatformIDs(nplat,platformst,NULL); if(status != CL_SUCCESS) { std::cerr << "Error: clGetPlatformIDs error \n"; return -1; } /* Get devices */ unsigned int num_devices; status = clGetDeviceIDs(platformst[ATI_PLATFORM],CL_DEVICE_TYPE_ALL,0,NULL,&num_devices); if (status != CL_SUCCESS) { std::cerr << "Error: clGetDeviceIDs error \n"; return -1; } /* Get all devices ID in a platform */ cl_device_id * devicest = new cl_device_id [num_devices]; status = clGetDeviceIDs(platformst[ATI_PLATFORM],CL_DEVICE_TYPE_ALL,num_devices,devicest,NULL); if (status != CL_SUCCESS) { std::cerr << "Error: clGetDeviceIDs error \n"; return -1; } /* create a context */ cl_context_properties prop[3]; prop[0] = CL_CONTEXT_PLATFORM; prop[1] = (cl_context_properties)platformst[ATI_PLATFORM]; prop[2] = 0; cl_device_id * devid = new cl_device_id [num_devices]; for (int j = 0 ; j < (int)num_devices ; j++) { devid[j] = devicest[j]; } cl_context context = clCreateContext(prop, num_devices, devid, NULL, NULL ,&status); if(status != CL_SUCCESS) { std::cerr << "Error: clCreateContext error \n"; return -1; } /* Load and Build Kernel */ cl_program prg1; cl_program prg2; size_t sourceSize1 = strnlen(kernel1,512); size_t sourceSize2 = strnlen(kernel2,512); char * source = kernel1; prg1 = clCreateProgramWithSource(context, 1, (const char **)&source, &sourceSize1, &status); if(status != CL_SUCCESS) { std::cerr << "Error: clCreateProgramWithSource error \n"; return -1; } status = clBuildProgram(prg1, 1, &devicest[ATI_DEVICE_1], NULL, NULL, NULL); if(status != CL_SUCCESS) { std::cerr << "Error: clBuildProgram error \n"; return -1; } source = kernel2; prg2 = clCreateProgramWithSource(context, 1, (const char **)&source, &sourceSize2, &status); if(status != CL_SUCCESS) { std::cerr << "Error: clCreateProgramWithSource error \n"; return -1; } status = clBuildProgram(prg2, 1, &devicest[ATI_DEVICE_2], NULL, NULL, NULL); if(status != CL_SUCCESS) { std::cerr << "Error: clBuildProgram error \n"; return -1; } /* for one device in a platform we create 2 commandQueue */ cl_command_queue * commandQueue = new cl_command_queue [2]; commandQueue[0] = clCreateCommandQueue(context, devicest[ATI_DEVICE_1], 0, &status); if(status != CL_SUCCESS) { std::cerr << "Error: clCreateCommandQueue error \n"; return -1; } commandQueue[1] = clCreateCommandQueue(context, devicest[ATI_DEVICE_2], 0, &status); if(status != CL_SUCCESS) { std::cerr << "Error: clCreateCommandQueue error \n"; return -1; } /* Create a buffer 1 queue 1*/ float * buffer1 = new float [BUFFER_SIZE]; cl_mem buffer1d = clCreateBuffer(context, CL_MEM_READ_WRITE, BUFFER_SIZE*sizeof(cl_float), NULL, &status); /* Create a buffer 2 queue 2 */ float * buffer2 = new float [BUFFER_SIZE]; cl_mem buffer2d = clCreateBuffer(context, CL_MEM_READ_WRITE, BUFFER_SIZE*sizeof(cl_float), NULL, &status); /* Create a buffer 3 host */ float * buffer3 = new float [BUFFER_SIZE]; cl_mem buffer3d = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, BUFFER_SIZE*sizeof(cl_float), NULL, &status); /* buffer 1 and 2 is on the same device, but is not important, we emulate a multidevice */ cl_event sync_step[NSTEP]; cl_event event_list[NSTEP]; /* We do a (fake) multi device comunication */ /* * we start to copy the buffer from device1 to host then from host to device2, we must wait device 1 finish transfert on buffer 3, so * we use event wait list. then we redo the step (now device 1 now must wait device 2 to complete the transfert, so we use again an event wait * list ) */ while (1) { for (unsigned int i = 0 ; i < NSTEP ; i++) { if (i == 0) status = clEnqueueCopyBuffer(commandQueue[0],buffer1d,buffer3d,0,0,BUFFER_SIZE*sizeof(cl_float),0,NULL,&sync_step[i]); else status = clEnqueueCopyBuffer(commandQueue[0],buffer1d,buffer3d,0,0,BUFFER_SIZE*sizeof(cl_float),0/*1*/,NULL/*&event_list[i-1]*/,&sync_step[i]); if(status != CL_SUCCESS) { std::cerr << "Error: clCreateCommandQueue error \n"; return -1; } status = clEnqueueCopyBuffer(commandQueue[1],buffer3d,buffer2d,0,0,BUFFER_SIZE*sizeof(cl_float),0/*1*/,NULL/*&sync_step[i]*/,&event_list[i]); if(status != CL_SUCCESS) { std::cerr << "Error: clCreateCommandQueue error \n"; return -1; } } /* Flush */ clFlush(commandQueue[0]); clFlush(commandQueue[1]); /* clFinish(commandQueue[0]); clFinish(commandQueue[1]);*/ clWaitForEvents(NSTEP,(cl_event *)&event_list); clWaitForEvents(NSTEP,(cl_event *)&sync_step); /* Release Sync and Step event */ for (unsigned int i = 0 ; i < NSTEP ; i++) { clReleaseEvent(sync_step[i]); clReleaseEvent(event_list[i]); } } /* Release All */ clReleaseMemObject(buffer1d); clReleaseMemObject(buffer2d); clReleaseMemObject(buffer3d); clReleaseCommandQueue(commandQueue[0]); clReleaseCommandQueue(commandQueue[1]); clReleaseProgram(prg1); clReleaseProgram(prg2); delete [] devid; delete [] devicest; delete [] platformst; return 0; }

                  • clEnqueueCopyBuffer event_wait_list
                    bubu

                    clWaitForEvents(NSTEP,(cl_event *)&event_list);
                    clWaitForEvents(NSTEP,(cl_event *)&sync_step);

                     

                    Nope! It should be:

                     

                    clWaitForEvents(NSTEP,event_list);

                    clWaitForEvents(NSTEP,sync_step);

                     

                    as "event_list" and "sync_step" are arrays...

                     

                    And, btw, I see you use C++ ( #include <iostream> )... so why you're using C arrays and not a std::vector? that can help you to detect buffer overruns...

                    And btw, there's an official C++ OpenCL wrapper with RIIA.

                      • clEnqueueCopyBuffer event_wait_list
                        ibird

                         

                        Originally posted by: bubu clWaitForEvents(NSTEP,(cl_event *)&event_list); clWaitForEvents(NSTEP,(cl_event *)&sync_step);

                         

                         

                         

                        Nope! It should be:

                         

                         

                         

                        clWaitForEvents(NSTEP,event_list);

                         

                        clWaitForEvents(NSTEP,sync_step);

                         

                        You are in theory right, but in practice no. Infact In this situation &event_list  and event_list has the same value, change the code does not change the behaviour, and is unusefull.

                         

                         

                         

                         

                         

                         

                        And, btw, I see you use C++ ( #include  )... so why you're using C arrays and not a std::vector? that can help you to detect buffer overruns...

                         

                        And btw, there's an official C++ OpenCL wrapper with RIIA.

                         



                         

                        This is only a "simple" test case, for me "simple" mean avoid wrapper, vector or any unusefull things.

                         

                  • clEnqueueCopyBuffer event_wait_list
                    Illusio

                    Depending on the version of the SDK, the asynchronous event may hold its reference to your buffer until the event is released with clReleaseEvent. It might be worth checking that you release the events properly if you're getting memory leaks.

                     

                      • clEnqueueCopyBuffer event_wait_list
                        ibird

                         

                        Originally posted by: Illusio Depending on the version of the SDK, the asynchronous event may hold its reference to your buffer until the event is released with clReleaseEvent. It might be worth checking that you release the events properly if you're getting memory leaks.

                         

                         

                         

                         

                        I am releasing all events, infact avoiding the wait list the program generate bad result but the has no memory leaks, considering that the use of event wait list do not generate new events is not possible i am not releasing events

                         

                      • clEnqueueCopyBuffer event_wait_list
                        MicahVillmow
                        ibird,
                        That is not correct, event_list and &event_list[0] would have the same value, &event_list is taking the address of the pointer, not the value of the pointer itself.
                        So &event_list is of type cl_event*[NSTEP], not cl_event*.

                        So you are passing the wrong value to the clWaitForEvents.
                          • clEnqueueCopyBuffer event_wait_list
                            ibird


                            Originally posted by: MicahVillmow ibird, That is not correct, event_list and &event_list[0] would have the same value, &event_list is taking the address of the pointer, not the value of the pointer itself. So &event_list is of type cl_event*[NSTEP], not cl_event*. So you are passing the wrong value to the clWaitForEvents.


                            event_list is an array of cl_event , so an array of pointers to struct __cl_event, event_list  just give the adress of the array, but there in not the adress of the pointer event_list, so in this situation does not change. ( I has NOT allocated event list with cl_event * event_list = new cl_event[NSTEP], where we have a pointer object that use memory space , so an address of the pointer , but with cl_event event_list[NSTEP] where we have only the array event_list, and only the address of the array, there is not an object pointer "in memory" that store the address where i can take the address of the pointer with & ). So again you are formaly right, but not in practice in this situation

                            I can see this at runtime and on assembly code

                            Unassembling

                                    clWaitForEvents(NSTEP,event_list);

                            i get this

                                     0x08049251 : lea    0x38(%esp),%eax

                                     0x08049255 : mov    %eax,0x4(%esp)

                                      0x08049259 : movl   $0x80,(%esp)

                                      0x08049260 : call   0x8048888


                            from


                                        clWaitForEvents(NSTEP,(cl_event *)&event_list);     


                             i get  this:


                                           0x08049265 : lea    0x38(%esp),%eax

                                           0x08049269 : mov    %eax,0x4(%esp)

                                           0x0804926d : movl   $0x80,(%esp)      

                                           0x08049274 : call   0x8048888


                            There is no difference, and running the code i am passing the same address,  i am sure because i am seeing this from the debugger also at runtime in the two cases i am passing the same address.


                            But i do not want talk about this.

                            So i has definitively changed my code into this ( i has attached the code)

                                    clWaitForEvents(NSTEP,event_list);
                                    clWaitForEvents(NSTEP,sync_step);

                            and redone all test. I still has memory leaking when i try to use num event wait, i has memory leaking when i use clEnqueueWaitForEvents, i has memory leak when i comment clWaitForEvents(NSTEP,sync_step), where is with the modified code my mistake ?

                            #define RESULT_OK 1 #define RESULT_FAIL -1 #include <CL/cl.h> #include <iostream> #include <string.h> #define ATI_PLATFORM 0 #define ATI_DEVICE_1 0 #define ATI_DEVICE_2 1 #define BUFFER_SIZE 512 #define NSTEP 128 struct test { unsigned int x; unsigned int y; }; char kernel1[] = "__kernel void addone(__global float *buf)\ {\ buf += 1;\ }"; char kernel2[] = "__kernel void addtwo(__global float *buf)\ {\ buf += 2;\ }"; int main() { int id_dev = 0; cl_int status = 0; cl_uint nplat = 0; /* Get number of platforms */\ status = clGetPlatformIDs(0,NULL,&nplat);\ if(status != CL_SUCCESS)\ { std::cerr << "Error: clGetPlatformIDs error \n"; return -1; } /* Get all platforms ID */ cl_platform_id * platformst = new cl_platform_id [nplat]; status = clGetPlatformIDs(nplat,platformst,NULL); if(status != CL_SUCCESS) { std::cerr << "Error: clGetPlatformIDs error \n"; return -1; } /* Get devices */ unsigned int num_devices; status = clGetDeviceIDs(platformst[ATI_PLATFORM],CL_DEVICE_TYPE_ALL,0,NULL,&num_devices); if (status != CL_SUCCESS) { std::cerr << "Error: clGetDeviceIDs error \n"; return -1; } /* Get all devices ID in a platform */ cl_device_id * devicest = new cl_device_id [num_devices]; status = clGetDeviceIDs(platformst[ATI_PLATFORM],CL_DEVICE_TYPE_ALL,num_devices,devicest,NULL); if (status != CL_SUCCESS) { std::cerr << "Error: clGetDeviceIDs error \n"; return -1; } /* create a context */ cl_context_properties prop[3]; prop[0] = CL_CONTEXT_PLATFORM; prop[1] = (cl_context_properties)platformst[ATI_PLATFORM]; prop[2] = 0; cl_device_id * devid = new cl_device_id [num_devices]; for (int j = 0 ; j < (int)num_devices ; j++) { devid[j] = devicest[j]; } cl_context context = clCreateContext(prop, num_devices, devid, NULL, NULL ,&status); if(status != CL_SUCCESS) { std::cerr << "Error: clCreateContext error \n"; return -1; } /* Load and Build Kernel */ cl_program prg1; cl_program prg2; size_t sourceSize1 = strnlen(kernel1,512); size_t sourceSize2 = strnlen(kernel2,512); char * source = kernel1; prg1 = clCreateProgramWithSource(context, 1, (const char **)&source, &sourceSize1, &status); if(status != CL_SUCCESS) { std::cerr << "Error: clCreateProgramWithSource error \n"; return -1; } status = clBuildProgram(prg1, 1, &devicest[ATI_DEVICE_1], NULL, NULL, NULL); if(status != CL_SUCCESS) { std::cerr << "Error: clBuildProgram error \n"; return -1; } source = kernel2; prg2 = clCreateProgramWithSource(context, 1, (const char **)&source, &sourceSize2, &status); if(status != CL_SUCCESS) { std::cerr << "Error: clCreateProgramWithSource error \n"; return -1; } status = clBuildProgram(prg2, 1, &devicest[ATI_DEVICE_2], NULL, NULL, NULL); if(status != CL_SUCCESS) { std::cerr << "Error: clBuildProgram error \n"; return -1; } /* for one device in a platform we create 2 commandQueue */ cl_command_queue * commandQueue = new cl_command_queue [2]; commandQueue[0] = clCreateCommandQueue(context, devicest[ATI_DEVICE_1], 0, &status); if(status != CL_SUCCESS) { std::cerr << "Error: clCreateCommandQueue error \n"; return -1; } commandQueue[1] = clCreateCommandQueue(context, devicest[ATI_DEVICE_2], 0, &status); if(status != CL_SUCCESS) { std::cerr << "Error: clCreateCommandQueue error \n"; return -1; } /* Create a buffer 1 queue 1*/ float * buffer1 = new float [BUFFER_SIZE]; cl_mem buffer1d = clCreateBuffer(context, CL_MEM_READ_WRITE, BUFFER_SIZE*sizeof(cl_float), NULL, &status); /* Create a buffer 2 queue 2 */ float * buffer2 = new float [BUFFER_SIZE]; cl_mem buffer2d = clCreateBuffer(context, CL_MEM_READ_WRITE, BUFFER_SIZE*sizeof(cl_float), NULL, &status); /* Create a buffer 3 host */ float * buffer3 = new float [BUFFER_SIZE]; cl_mem buffer3d = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, BUFFER_SIZE*sizeof(cl_float), NULL, &status); /* buffer 1 and 2 is on the same device, but is not important, we emulate a multidevice */ cl_event sync_step[NSTEP]; cl_event event_list[NSTEP]; /* We do a (fake) multi device comunication */ /* * we start to copy the buffer from device1 to host then from host to device2, we must wait device 1 finish transfert on buffer 3, so * we use event wait list. then we redo the step (now device 1 now must wait device 2 to complete the transfert, so we use again an event wait * list ) */ while (1) { for (unsigned int i = 0 ; i < NSTEP ; i++) { if (i == 0) { status = clEnqueueCopyBuffer(commandQueue[0],buffer1d,buffer3d,0,0,BUFFER_SIZE*sizeof(cl_float),0,NULL,&sync_step[i]); } else { // clEnqueueWaitForEvents(commandQueue[0],1,&event_list[i-1]); status = clEnqueueCopyBuffer(commandQueue[0],buffer1d,buffer3d,0,0,BUFFER_SIZE*sizeof(cl_float),0/*1*/,NULL/*&event_list[i-1]*/,&sync_step[i]); } if(status != CL_SUCCESS) { std::cerr << "Error: clCreateCommandQueue error \n"; return -1; } // clEnqueueWaitForEvents(commandQueue[0],1,&sync_step[i]); status = clEnqueueCopyBuffer(commandQueue[1],buffer3d,buffer2d,0,0,BUFFER_SIZE*sizeof(cl_float),0/*1*/,NULL/*&sync_step[i]*/,&event_list[i]); if(status != CL_SUCCESS) { std::cerr << "Error: clCreateCommandQueue error \n"; return -1; } } /* Flush */ clFlush(commandQueue[0]); clFlush(commandQueue[1]); /* clFinish(commandQueue[0]); clFinish(commandQueue[1]);*/ clWaitForEvents(NSTEP,event_list); clWaitForEvents(NSTEP,sync_step); /* Release Sync and Step event */ for (unsigned int i = 0 ; i < NSTEP ; i++) { clReleaseEvent(sync_step[i]); clReleaseEvent(event_list[i]); } } /* Release All */ clReleaseMemObject(buffer1d); clReleaseMemObject(buffer2d); clReleaseMemObject(buffer3d); clReleaseCommandQueue(commandQueue[0]); clReleaseCommandQueue(commandQueue[1]); clReleaseProgram(prg1); clReleaseProgram(prg2); delete [] devid; delete [] devicest; delete [] platformst; return 0; }

                              • clEnqueueCopyBuffer event_wait_list
                                himanshu.gautam

                                I dont find anything strange in the code except it being looping in an infinite loop and hanging forever.

                                  • clEnqueueCopyBuffer event_wait_list
                                    ibird

                                    I has done other tests.

                                     

                                    The second posted code (also the first) has clEnqueueWait commented so the code do not leak until the clEnqueueWait is uncommented. This code has memory leak also on Nvidia OpenCL, but for unknown reason changing from

                                     

                                          clWaitForEvents(NSTEP,event_list);
                                          clWaitForEvents(NSTEP,sync_step);

                                    to

                                          for (unsigned int i = 0 ; i < NSTEP ; i++)
                                          {
                                        collect_list[2*i]   = sync_step;
                                        collect_list[2*i+1] = event_list
                                    ;
                                          }

                                          clWaitForEvents(2*NSTEP,collect_list);

                                    so collecting all events into one event wait call, the memory leak disapper on Nvidia OpenCL, testing this ( fix ? ) on ATI doesn't  work, and continue to leak, the infinite loop is done in order so see memory growing up

                                    If someone has an idea or hint on how to fix this simple code on ATI is welcome


                                    #define RESULT_OK 1 #define RESULT_FAIL -1 #include <CL/cl.h> #include <iostream> #include <string.h> #define ATI_PLATFORM 0 #define ATI_DEVICE_1 0 #define ATI_DEVICE_2 0 #define BUFFER_SIZE 512 #define NSTEP 128 struct test { unsigned int x; unsigned int y; }; char kernel1[] = "__kernel void addone(__global float *buf)\ {\ buf += 1;\ }"; char kernel2[] = "__kernel void addtwo(__global float *buf)\ {\ buf += 2;\ }"; int main() { int id_dev = 0; cl_int status = 0; cl_uint nplat = 0; /* Get number of platforms */\ status = clGetPlatformIDs(0,NULL,&nplat);\ if(status != CL_SUCCESS)\ { std::cerr << "Error: clGetPlatformIDs error \n"; return -1; } /* Get all platforms ID */ cl_platform_id * platformst = new cl_platform_id [nplat]; status = clGetPlatformIDs(nplat,platformst,NULL); if(status != CL_SUCCESS) { std::cerr << "Error: clGetPlatformIDs error \n"; return -1; } /* Get devices */ unsigned int num_devices; status = clGetDeviceIDs(platformst[ATI_PLATFORM],CL_DEVICE_TYPE_ALL,0,NULL,&num_devices); if (status != CL_SUCCESS) { std::cerr << "Error: clGetDeviceIDs error \n"; return -1; } /* Get all devices ID in a platform */ cl_device_id * devicest = new cl_device_id [num_devices]; status = clGetDeviceIDs(platformst[ATI_PLATFORM],CL_DEVICE_TYPE_ALL,num_devices,devicest,NULL); if (status != CL_SUCCESS) { std::cerr << "Error: clGetDeviceIDs error \n"; return -1; } /* create a context */ cl_context_properties prop[3]; prop[0] = CL_CONTEXT_PLATFORM; prop[1] = (cl_context_properties)platformst[ATI_PLATFORM]; prop[2] = 0; cl_device_id * devid = new cl_device_id [num_devices]; for (int j = 0 ; j < (int)num_devices ; j++) { devid[j] = devicest[j]; } cl_context context = clCreateContext(prop, num_devices, devid, NULL, NULL ,&status); if(status != CL_SUCCESS) { std::cerr << "Error: clCreateContext error \n"; return -1; } /* Load and Build Kernel */ cl_program prg1; cl_program prg2; size_t sourceSize1 = strnlen(kernel1,512); size_t sourceSize2 = strnlen(kernel2,512); char * source = kernel1; prg1 = clCreateProgramWithSource(context, 1, (const char **)&source, &sourceSize1, &status); if(status != CL_SUCCESS) { std::cerr << "Error: clCreateProgramWithSource error \n"; return -1; } status = clBuildProgram(prg1, 1, &devicest[ATI_DEVICE_1], NULL, NULL, NULL); if(status != CL_SUCCESS) { std::cerr << "Error: clBuildProgram error \n"; return -1; } source = kernel2; prg2 = clCreateProgramWithSource(context, 1, (const char **)&source, &sourceSize2, &status); if(status != CL_SUCCESS) { std::cerr << "Error: clCreateProgramWithSource error \n"; return -1; } status = clBuildProgram(prg2, 1, &devicest[ATI_DEVICE_2], NULL, NULL, NULL); if(status != CL_SUCCESS) { std::cerr << "Error: clBuildProgram error \n"; return -1; } /* for one device in a platform we create 2 commandQueue */ cl_command_queue * commandQueue = new cl_command_queue [2]; commandQueue[0] = clCreateCommandQueue(context, devicest[ATI_DEVICE_1], 0, &status); if(status != CL_SUCCESS) { std::cerr << "Error: clCreateCommandQueue error \n"; return -1; } commandQueue[1] = clCreateCommandQueue(context, devicest[ATI_DEVICE_2], 0, &status); if(status != CL_SUCCESS) { std::cerr << "Error: clCreateCommandQueue error \n"; return -1; } /* Create a buffer 1 queue 1*/ float * buffer1 = new float [BUFFER_SIZE]; cl_mem buffer1d = clCreateBuffer(context, CL_MEM_READ_WRITE, BUFFER_SIZE*sizeof(cl_float), NULL, &status); /* Create a buffer 2 queue 2 */ float * buffer2 = new float [BUFFER_SIZE]; cl_mem buffer2d = clCreateBuffer(context, CL_MEM_READ_WRITE, BUFFER_SIZE*sizeof(cl_float), NULL, &status); /* Create a buffer 3 host */ float * buffer3 = new float [BUFFER_SIZE]; cl_mem buffer3d = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, BUFFER_SIZE*sizeof(cl_float), NULL, &status); /* buffer 1 and 2 is on the same device, but is not important, we emulate a multidevice */ cl_event sync_step[NSTEP]; cl_event event_list[NSTEP]; cl_event collect_list[2*NSTEP]; /* We do a (fake) multi device comunication */ /* * we start to copy the buffer from device1 to host then from host to device2, we must wait device 1 finish transfert on buffer 3, so * we use event wait list. then we redo the step (now device 1 now must wait device 2 to complete the transfert, so we use again an event wait * list ) */ while (1) { for (unsigned int i = 0 ; i < NSTEP ; i++) { if (i == 0) { status = clEnqueueCopyBuffer(commandQueue[0],buffer1d,buffer3d,0,0,BUFFER_SIZE*sizeof(cl_float),0,NULL,&sync_step[i]); } else { clEnqueueWaitForEvents(commandQueue[0],1,&event_list[i-1]); status = clEnqueueCopyBuffer(commandQueue[0],buffer1d,buffer3d,0,0,BUFFER_SIZE*sizeof(cl_float),0/*1*/,NULL/*&event_list[i-1]*/,&sync_step[i]); } if(status != CL_SUCCESS) { std::cerr << "Error: clCreateCommandQueue error \n"; return -1; } clEnqueueWaitForEvents(commandQueue[0],1,&sync_step[i]); status = clEnqueueCopyBuffer(commandQueue[1],buffer3d,buffer2d,0,0,BUFFER_SIZE*sizeof(cl_float),0/*1*/,NULL/*&sync_step[i]*/,&event_list[i]); if(status != CL_SUCCESS) { std::cerr << "Error: clCreateCommandQueue error \n"; return -1; } } /* Flush */ clFlush(commandQueue[0]); clFlush(commandQueue[1]); /* clFinish(commandQueue[0]); clFinish(commandQueue[1]);*/ for (unsigned int i = 0 ; i < NSTEP ; i++) { collect_list[2*i] = sync_step[i]; collect_list[2*i+1] = event_list[i]; } /* clWaitForEvents(NSTEP,event_list); clWaitForEvents(NSTEP,sync_step);*/ clWaitForEvents(2*NSTEP,collect_list); /* Release Sync and Step event */ for (unsigned int i = 0 ; i < NSTEP ; i++) { clReleaseEvent(sync_step[i]); clReleaseEvent(event_list[i]); } usleep(100000); } /* Release All */ clReleaseMemObject(buffer1d); clReleaseMemObject(buffer2d); clReleaseMemObject(buffer3d); clReleaseCommandQueue(commandQueue[0]); clReleaseCommandQueue(commandQueue[1]); clReleaseProgram(prg1); clReleaseProgram(prg2); delete [] devid; delete [] devicest; delete [] platformst; return 0; }