CUDA Performance for Large Problem Sizes



This article contains:

    1) Parallel program

    2) Output

Problem statement: Add a number X to all elements of a one-dimensional array A. The elements of A and the value X must be single-precision floating-point numbers.
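
For reference, the serial version of this operation is a single loop that touches every element once; the CUDA kernel in section 1 parallelizes exactly this loop by assigning one element to each thread. A minimal serial sketch (the names add_serial, a, n and x are illustrative and not part of the program below):

    // Serial sketch of the operation: add x to every element of a.
    // Not part of the CUDA program in section 1.
    void add_serial(float *a, long long n, float x)
    {
        for (long long i = 0; i < n; i++)
            a[i] = a[i] + x;
    }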

The program's output reports the time for three different stages: the time taken to copy data from the CPU to the GPU, the time taken by the kernel computation, and the time taken to copy the data back from the GPU to the CPU.
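
The program below takes these measurements with the host-side clock() function. Because kernel launches are asynchronous, the host calls cudaDeviceSynchronize() before stopping the clock; an alternative, shown here only as a sketch, is to time the kernel with CUDA events, which record timestamps on the GPU stream itself (addition, dev_a, blocks, threads and size are assumed to be defined as in the program below):

    // Sketch: timing the kernel with CUDA events instead of clock().
    cudaEvent_t t0, t1;
    float ms = 0.0f;
    cudaEventCreate(&t0);
    cudaEventCreate(&t1);
    cudaEventRecord(t0);                               // timestamp before the kernel
    addition<<<blocks, threads>>>(dev_a, (float)size, size);
    cudaEventRecord(t1);                               // timestamp after the kernel
    cudaEventSynchronize(t1);                          // wait until the kernel and t1 have completed
    cudaEventElapsedTime(&ms, t0, t1);                 // elapsed time in milliseconds
    printf("Kernel (ms) %f\n", ms);
    cudaEventDestroy(t0);
    cudaEventDestroy(t1);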

 

 


  1. Parallel program with comments:
    • #include<stdio.h>
      #include<math.h>
      #include<time.h>
      #include<cuda.h>


      __global__ void addition(float *dev_a, float x, long long n)
      {
          // global thread index; cast keeps the index in 64 bits for large arrays
          long long id = (long long)blockIdx.x * blockDim.x + threadIdx.x;

          if (id < n)                 // guard against extra threads past the end of the array
              dev_a[id] = dev_a[id] + x;

      }
      void initializeVariable(float *a, long long size);

      int main(){
          //define min and max 
          
          long long min = pow(2,24);
          long long max = pow(2,30);
          
          //declare variable 
          float *h_a;
          float *dev_a;    

          clock_t start , end;
          double walltime ;

          for(long long  size= min ;  size < max ; size= size *2){
              
            //initialize a memory of host variable
            h_a = (float*)malloc(size*sizeof(float));

            //initialize values of the host array
            initializeVariable(h_a,size);

            //allocate device memory
            cudaMalloc((void**)&dev_a,size*sizeof(float));
            
            
            printf("Problem size : %lld  \t",size);
             
            //transfer memory from cpu to gpu
              start = clock(); // measure cpu to gpu copy time
                cudaMemcpy(dev_a,h_a,size*sizeof(float),cudaMemcpyHostToDevice);  // copy size is in bytes, not elements
              end = clock();
              //calculate cpu to gpu transfer time
                 walltime =(end-start)/(double)CLOCKS_PER_SEC;
                 printf("CPU to GPU (s) %lf  \t",walltime);
          
            //kernel function
                int threads = 256;                                   // threads per block (hardware limit is 1024)
                int blocks = (size + threads - 1) / threads;         // enough blocks to cover every element
                start = clock();
                addition<<<blocks,threads>>>(dev_a,(float)size,size);  // (array, value X to add, number of elements)
                cudaDeviceSynchronize();                             // kernel launches are asynchronous; wait before stopping the clock
                end = clock();
                //calculate kernel execution time
                walltime =(end-start)/(double)CLOCKS_PER_SEC;
                printf("Kernel (s) %lf  \t",walltime);
          
            //transfer memory from gpu to cpu
              start = clock();
              cudaMemcpy(h_a,dev_a,size*sizeof(float),cudaMemcpyDeviceToHost);  // copy size is in bytes
              end = clock();
              //calculate gpu to cpu transfer time
                walltime =(end-start)/(double)CLOCKS_PER_SEC;
                printf("GPU to CPU (s) %lf  \n",walltime);
          
          
            
            cudaFree(dev_a);
            free(h_a);

        }
          return 0;
      }
      void initializeVariable(float *a, long long size){
          long long i = 0;
          for(i = 0; i < size; i++){
              a[i] = 1.0f;      // fill the array with ones
          }
      }
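
The program above does not check the return values of the CUDA API calls. For the larger problem sizes (2^29 single-precision floats is 2 GB), a failed cudaMalloc would otherwise go unnoticed and the kernel would write through a bad pointer. A minimal check, shown as a sketch with dev_a and size as in the program above:

    // Sketch: check whether the device allocation succeeded.
    cudaError_t err = cudaMalloc((void**)&dev_a, size*sizeof(float));
    if (err != cudaSuccess) {
        printf("cudaMalloc failed for %lld elements: %s\n", size, cudaGetErrorString(err));
        // skip this problem size instead of launching the kernel
    }

The program can be compiled with nvcc (for example, nvcc addition.cu -o addition, assuming the source is saved as addition.cu) and run either locally or from a Google Colab cell, as in the output below.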
       
  2. Output (run in Google Colab):
    •  [Screenshot: NVIDIA GPU output of the program in Google Colab]

       
