CUDA Performance for Large Problem Sizes

This article contains:

    1) Parallel program

    2) Output

Problem statement: Add a number X to every element of a one-dimensional array A. Both X and the elements of A are single-precision floating-point numbers.

Program output: For each problem size, the program reports three timings: the time taken to copy the data from the CPU to the GPU, the time taken by the kernel to compute the result, and the time taken to copy the data from the GPU back to the CPU.



NVIDIA GPU logo

  1. Parallel program with comments:
    • #include<stdio.h>

      // Kernel: adds the scalar `size` to one element of dev_a per thread.
      // Despite its name, `size` is the value that is added, not an element count.
      //
      // Launch layout: 1D grid of 1D blocks. There is no bounds check, so the
      // caller must launch exactly as many threads as dev_a has elements
      // (gridDim.x * blockDim.x == element count).
      __global__ void addition(float *dev_a , float size)
      {
          // Widen to 64 bits BEFORE multiplying: blockIdx.x * blockDim.x is
          // otherwise evaluated in 32-bit unsigned arithmetic and can wrap
          // around for very large arrays.
          long long id = (long long)blockIdx.x * blockDim.x + threadIdx.x;
          dev_a[id] = dev_a[id] + size ;
      }

      // Forward declaration of the host-side array initializer; the definition
      // follows main. Note that the element count is passed as a float.
      void intializeVariable(float *a,float size);

      // Aborts the program with a readable message when a CUDA runtime call fails.
      static void checkCuda(cudaError_t status, const char *what){
          if(status != cudaSuccess){
              fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
              exit(EXIT_FAILURE);
          }
      }

      // Benchmark driver.
      // For every power-of-two problem size from 2^24 up to (but excluding) 2^30
      // it measures and prints three timings on one line:
      //   1. host -> device copy,
      //   2. kernel execution,
      //   3. device -> host copy.
      // Timings are taken with clock(), which reports processor time used by the
      // host process. Transfer times are printed in seconds, the kernel time in
      // milliseconds (as its label says).
      int main(){
          // define min and max problem size (exact integer powers of two)
          const long long min = 1LL << 24;
          const long long max = 1LL << 30;

          // The kernel has no bounds check, so every problem size must be an
          // exact multiple of the block size. All sizes here are powers of two
          // >= 2^24, so 256 always divides them.
          const int threadsPerBlock = 256;

          // Value added to every element. The particular value is arbitrary and
          // does not influence the timings.
          const float x = 2.0f;

          float *h_a;
          float *dev_a;

          clock_t start , end;
          double walltime ;

          for(long long size = min ; size < max ; size = size * 2){
              const size_t bytes = (size_t)size * sizeof(float);

              // allocate and initialize the host array
              h_a = (float*)malloc(bytes);
              if(h_a == NULL){
                  fprintf(stderr, "malloc of %zu bytes failed\n", bytes);
                  return 1;
              }
              intializeVariable(h_a, (float)size);

              // allocate device memory (kept outside the timed regions)
              checkCuda(cudaMalloc((void**)&dev_a, bytes), "cudaMalloc");

              printf("Problem size : %lld  \t",size);

              // transfer memory from cpu to gpu
              start = clock();
              checkCuda(cudaMemcpy(dev_a, h_a, bytes, cudaMemcpyHostToDevice),
                        "cudaMemcpy (host to device)");
              end = clock();
              walltime =(end-start)/(double)CLOCKS_PER_SEC;
              printf("CPU TO GPU %lf  \t",walltime);

              // kernel function
              const unsigned int blocks = (unsigned int)(size / threadsPerBlock);
              start = clock();
              addition<<<blocks, threadsPerBlock>>>(dev_a, x);
              checkCuda(cudaGetLastError(), "kernel launch");
              // A kernel launch returns immediately; wait for completion so the
              // measured interval covers the actual execution.
              checkCuda(cudaDeviceSynchronize(), "kernel execution");
              end = clock();
              // the label announces milliseconds, so convert from seconds
              walltime =(end-start)/(double)CLOCKS_PER_SEC * 1000.0;
              printf("Kernal(ms) %lf  \t",walltime);

              // transfer gpu to cpu
              start = clock();
              checkCuda(cudaMemcpy(h_a, dev_a, bytes, cudaMemcpyDeviceToHost),
                        "cudaMemcpy (device to host)");
              end = clock();
              walltime =(end-start)/(double)CLOCKS_PER_SEC;
              printf("GPU TO CPU %lf  \n",walltime);

              // release both buffers before the next (twice as large) iteration
              checkCuda(cudaFree(dev_a), "cudaFree");
              free(h_a);
          }

          return 0;
      }
      // Fills the first `size` elements of the host array `a` with 1.0f.
      //
      // a    : destination array; must hold at least `size` floats.
      // size : number of elements to set, passed as a float. It is converted
      //        to an integer count once, up front.
      void intializeVariable(float *a,float size){
          // Nothing to do for a missing array or a count below one
          // (the negated comparison also rejects NaN).
          if(a == NULL || !(size >= 1.0f)){
              return;
          }
          // Compare integers, not floats: in `i < size` the index would be
          // converted to float, and above 2^24 it rounds up to `size` before
          // actually reaching it, leaving the tail of the array unset.
          const long long count = (long long)size;
          for(long long i = 0; i < count; i++){
              a[i] = 1.0f;
          }
      }
  2.  Output (run in Google Colab):
      NVIDIA GPU output in Google Colab

