CUDA Performance for Large Problem Sizes



This article contains:

    1) Parallel program

    2) Output

Problem statement: Add a number X to all elements of a one-dimensional array A. The elements of A and the value X must be single-precision floating-point numbers.
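
For reference, the serial version of this operation is a single loop that touches every element once; the CUDA kernel in section 1 parallelizes exactly this loop by assigning one element to each thread. A minimal serial sketch (the names add_serial, a, n and x are illustrative and not part of the program below):

    // Serial sketch of the operation: add x to every element of a.
    // Not part of the CUDA program in section 1.
    void add_serial(float *a, long long n, float x)
    {
        for (long long i = 0; i < n; i++)
            a[i] = a[i] + x;
    }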

The program's output reports the time for three different stages: the time taken to copy data from the CPU to the GPU, the time taken by the kernel computation, and the time taken to copy the data back from the GPU to the CPU.
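
The program below takes these measurements with the host-side clock() function. Because kernel launches are asynchronous, the host calls cudaDeviceSynchronize() before stopping the clock; an alternative, shown here only as a sketch, is to time the kernel with CUDA events, which record timestamps on the GPU stream itself (addition, dev_a, blocks, threads and size are assumed to be defined as in the program below):

    // Sketch: timing the kernel with CUDA events instead of clock().
    cudaEvent_t t0, t1;
    float ms = 0.0f;
    cudaEventCreate(&t0);
    cudaEventCreate(&t1);
    cudaEventRecord(t0);                               // timestamp before the kernel
    addition<<<blocks, threads>>>(dev_a, (float)size, size);
    cudaEventRecord(t1);                               // timestamp after the kernel
    cudaEventSynchronize(t1);                          // wait until the kernel and t1 have completed
    cudaEventElapsedTime(&ms, t0, t1);                 // elapsed time in milliseconds
    printf("Kernel (ms) %f\n", ms);
    cudaEventDestroy(t0);
    cudaEventDestroy(t1);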

 

 


  1. Parallel program with comments:
    • #include<stdio.h>
      #include<math.h>
      #include<time.h>
      #include<cuda.h>


      __global__ void addition(float *dev_a, float x, long long n)
      {
          // global thread index; cast keeps the index in 64 bits for large arrays
          long long id = (long long)blockIdx.x * blockDim.x + threadIdx.x;

          if (id < n)                 // guard against extra threads past the end of the array
              dev_a[id] = dev_a[id] + x;

      }
      void initializeVariable(float *a, long long size);

      int main(){
          //define min and max 
          
          long long min = pow(2,24);
          long long max = pow(2,30);
          
          //declare variable 
          float *h_a;
          float *dev_a;    

          clock_t start , end;
          double walltime ;

          for(long long  size= min ;  size < max ; size= size *2){
              
            //initialize a memory of host variable
            h_a = (float*)malloc(size*sizeof(float));

            //initialize values of the host array
            initializeVariable(h_a,size);

            //allocate device memory
            cudaMalloc((void**)&dev_a,size*sizeof(float));
            
            
            printf("Problem size : %lld  \t",size);
             
            //transfer memory from cpu to gpu
              start = clock(); // measure cpu to gpu copy time
                cudaMemcpy(dev_a,h_a,size*sizeof(float),cudaMemcpyHostToDevice);  // copy size is in bytes, not elements
              end = clock();
              //calculate cpu to gpu transfer time
                 walltime =(end-start)/(double)CLOCKS_PER_SEC;
                 printf("CPU to GPU (s) %lf  \t",walltime);
          
            //kernel function
                int threads = 256;                                   // threads per block (hardware limit is 1024)
                int blocks = (size + threads - 1) / threads;         // enough blocks to cover every element
                start = clock();
                addition<<<blocks,threads>>>(dev_a,(float)size,size);  // (array, value X to add, number of elements)
                cudaDeviceSynchronize();                             // kernel launches are asynchronous; wait before stopping the clock
                end = clock();
                //calculate kernel execution time
                walltime =(end-start)/(double)CLOCKS_PER_SEC;
                printf("Kernel (s) %lf  \t",walltime);
          
            //transfer memory from gpu to cpu
              start = clock();
              cudaMemcpy(h_a,dev_a,size*sizeof(float),cudaMemcpyDeviceToHost);  // copy size is in bytes
              end = clock();
              //calculate gpu to cpu transfer time
                walltime =(end-start)/(double)CLOCKS_PER_SEC;
                printf("GPU to CPU (s) %lf  \n",walltime);
          
          
            
            cudaFree(dev_a);
            free(h_a);

        }
          return 0;
      }
      void initializeVariable(float *a, long long size){
          long long i = 0;
          for(i = 0; i < size; i++){
              a[i] = 1.0f;      // fill the array with ones
          }
      }
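
The program above does not check the return values of the CUDA API calls. For the larger problem sizes (2^29 single-precision floats is 2 GB), a failed cudaMalloc would otherwise go unnoticed and the kernel would write through a bad pointer. A minimal check, shown as a sketch with dev_a and size as in the program above:

    // Sketch: check whether the device allocation succeeded.
    cudaError_t err = cudaMalloc((void**)&dev_a, size*sizeof(float));
    if (err != cudaSuccess) {
        printf("cudaMalloc failed for %lld elements: %s\n", size, cudaGetErrorString(err));
        // skip this problem size instead of launching the kernel
    }

The program can be compiled with nvcc (for example, nvcc addition.cu -o addition, assuming the source is saved as addition.cu) and run either locally or from a Google Colab cell, as in the output below.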
       
  2. Output (run in Google Colab):
    •  [Screenshot: NVIDIA GPU output of the program in Google Colab]

       
