1 Reply Latest reply on Feb 12, 2016 3:56 AM by amdmatt

    Does AMD A10 APU support concurrent kernel execution?

    johnspaul92

      I have an AMD A10 APU with a Radeon R7 GPU. I believe this device supports concurrent kernel execution, but when I wrote the following code and obtained profiling information, the kernels do not appear to execute concurrently. My OpenCL code is:

       

      // Enqueue the four-stage pipeline (filter 1 -> filter 2 -> filter 3 -> aggregate)
      // on each of queue[4] .. queue[11]. The completion events are stored in
      // eventList[4 + 4*j] .. eventList[7 + 4*j] for pipeline j.
      for(j = 0; j < 8; j++){
        // Kernels of the pipeline, in the order they are enqueued.
        cl_kernel stage_kernels[4] = {
          kernel[Q6_PROGRAM_ID][FILTER1_KERNEL],    // first filter kernel
          kernel[Q6_PROGRAM_ID][FILTER2_KERNEL],    // second filter kernel
          kernel[Q6_PROGRAM_ID][FILTER3_KERNEL],    // third filter kernel
          kernel[Q6_PROGRAM_ID][AGGREGATE_KERNEL]   // aggregate kernel
        };

        // Slot in eventList that receives the event of this pipeline's first stage.
        const int first_event = 4 + j * 4;

        for (int stage = 0; stage < 4; stage++) {
          // The first stage waits on eventList[0..3]; every later stage waits
          // only on the event produced by the stage immediately before it.
          const cl_uint num_wait = (stage == 0) ? 4 : 1;
          const cl_event *wait_list = (stage == 0) ? eventList
                                                   : eventList + first_event + stage - 1;

          cl_err = clEnqueueNDRangeKernel(queue[4 + j], stage_kernels[stage], 1, NULL,
                                          &globalSize, &localSize,
                                          num_wait, wait_list,
                                          &eventList[first_event + stage]);

          // A failed enqueue does not produce a valid event, so the next stage
          // must not be enqueued with that event as its dependency.
          if (cl_err != CL_SUCCESS) {
            std::cerr << "clEnqueueNDRangeKernel failed on queue " << (4 + j)
                      << ", stage " << (stage + 1)
                      << " (error " << cl_err << ")\n";
            break;
          }
        }

        // Stop enqueuing further pipelines once one has failed; cl_err keeps
        // the failing status for the caller to inspect.
        if (cl_err != CL_SUCCESS) {
          break;
        }
      }

       

      The code I used for profiling is:

       

      // Gather profiling data for the four kernels of each of the 8 pipelines.
      // For every kernel event this accumulates
      //   final_time        += (END - START)  / 1e6
      //   final_time_queued += (END - QUEUED) / 1e6
      // and prints the START and END counters. OpenCL reports these counters in
      // nanoseconds, so the accumulated totals are in milliseconds.
      for(j = 0; j < 8; j++){
        // Per-stage output prefixes, kept as whole literals so the printed
        // text is unchanged.
        static const char *const stage_labels[4] = {"\n1 : ", "\n2 : ", "\n3 : ", "\n4 : "};

        // Slot in eventList holding the event of this pipeline's first kernel.
        const int first_event = 4 + j * 4;

        // Block until all events from eventList[4] up to and including this
        // pipeline's last event have completed.
        cl_int wait_status = clWaitForEvents(4 + 4*j, eventList+4);
        if (wait_status != CL_SUCCESS) {
          std::cerr << "clWaitForEvents failed for pipeline " << j
                    << " (error " << wait_status << ")\n";
          break;
        }

        for (int stage = 0; stage < 4; stage++) {
          const cl_event stage_event = eventList[first_event + stage];

          // Query the three timestamps; stop at the first query that fails.
          cl_int status = clGetEventProfilingInfo(stage_event, CL_PROFILING_COMMAND_QUEUED,
                                                  sizeof(time_start_queued), &time_start_queued, NULL);
          if (status == CL_SUCCESS) {
            status = clGetEventProfilingInfo(stage_event, CL_PROFILING_COMMAND_START,
                                             sizeof(time_start), &time_start, NULL);
          }
          if (status == CL_SUCCESS) {
            status = clGetEventProfilingInfo(stage_event, CL_PROFILING_COMMAND_END,
                                             sizeof(time_end), &time_end, NULL);
          }

          // Do not fold stale or unset timestamps into the totals.
          if (status != CL_SUCCESS) {
            std::cerr << "clGetEventProfilingInfo failed for pipeline " << j
                      << ", stage " << (stage + 1)
                      << " (error " << status << ")\n";
            continue;
          }

          // Subtract in the counters' own type before converting to double, so
          // large absolute timestamps do not lose precision in the difference.
          total_time = (double)(time_end - time_start) / 1000000;
          total_time_queued = (double)(time_end - time_start_queued) / 1000000;

          final_time += total_time;
          final_time_queued += total_time_queued;

          cout<<stage_labels[stage]<<time_start<<" "<<time_end;
        }
      }

       

      The output of my profiling code is :

       

      Time Spent: 0.266

      1 : 3989633359630 3989657015190

      2 : 3989657016860 3989683273450

      3 : 3989683275090 3989708840030

      4 : 3989708841760 3989734915610

      1 : 3989800219990 3989824648510

      2 : 3989824650240 3989850888860

      3 : 3989850890610 3989876392210

      4 : 3989876393890 3989902432920

      1 : 3989954275546 3989978865766

      2 : 3989978867476 3990005037296

      3 : 3990005038976 3990030592876

      4 : 3990030594566 3990056566896

      1 : 3990113144067 3990137315217

      2 : 3990137316937 3990163458337

      3 : 3990163460057 3990189007267

      4 : 3990189008967 3990215129227

      1 : 3990274589700 3990299102730

      2 : 3990299104430 3990325570980

      3 : 3990325572730 3990351050810

      4 : 3990351052550 3990377255070

      1 : 3990424871514 3990448828874

      2 : 3990448830524 3990475309034

      3 : 3990475310744 3990500849914

      4 : 3990500851664 3990526839444

      1 : 3990584574567 3990608802017

      2 : 3990608803727 3990635102497

      3 : 3990635104427 3990660647987

      4 : 3990660649697 3990686716887

      1 : 3990733269328 3990757174868

      2 : 3990757176588 3990783429448

      3 : 3990783431118 3990809003598

      4 : 3990809005298 3990835207128

       

       

      Final Time : 816.718

      Final Time Queued: 26274.4

      PROGRAM EXECUTION OVER