CIS 451 Week 12

Data-Level parallelism

GPUs

  // Scalar loop form of DAXPY: one element of y is updated per iteration.
  for (int i = 0; i < n; ++i) {
    y[i] = a*x[i] + y[i];
  }
   __host__
    // Ceiling division: the number of 512-thread blocks needed to cover all
    // n elements. (n / 512) + 1 also covers them, but it launches one extra,
    // completely idle block whenever n is an exact multiple of 512.
    int nblocks = (n + 511) / 512;
    // Kernel launches use triple angle brackets: <<<blocks, threads per block>>>.
    daxpy<<<nblocks, 512>>>(n, 2.0, x, y);


  // DAXPY kernel: each thread updates at most one element, y[i] = a*x[i] + y[i].
  // Indexing uses only the .x components, so a 1-D grid of 1-D blocks is
  // expected, with at least n threads in total.
  //   n    - number of elements to process
  //   a    - scalar multiplier
  //   x, y - input array and input/output array (y is overwritten in place)
  __global__
  void daxpy(int n, double a, double *x, double *y) {
     // Global element index: offset of this block (blockIdx.x * blockDim.x)
     // plus this thread's position inside the block.
     int i = blockIdx.x * blockDim.x + threadIdx.x;
     // Guard: the last block may contain threads whose index is past n.
     if (i < n) y[i] = a*x[i] + y[i];
  }

GPU Instruction set

GPU Memory: