Facebook
From Crippled Armadillo, 4 Years ago, written in CUDA C++.
Embed
Download Paste or View Raw
Hits: 227
  1. template <int BLOCK_SIZE> __global__ void
  2. matrixMulCUDA_3_1w2w(float* C, float* A, float* B, int wA, int wB)
  3. {
  4.         // Block index
  5.         int bx = blockIdx.x;
  6.         int by = blockIdx.y;
  7.         // Thread index
  8.         int tx = threadIdx.x;
  9.         int ty = threadIdx.y;
  10.         // Index of the first sub-matrix of A processed by the block
  11.         int aBegin = wA * BLOCK_SIZE * by;
  12.         // Index of the last sub-matrix of A processed by the block
  13.         int aEnd = aBegin + wA - 1;
  14.         // Step size used to iterate through the sub-matrices of A
  15.         int aStep = BLOCK_SIZE;
  16.         // Index of the first sub-matrix of B processed by the block
  17.         int bBegin = BLOCK_SIZE * bx;
  18.         // Step size used to iterate through the sub-matrices of B
  19.         int bStep = BLOCK_SIZE * wB;
  20.         // Csub is used to store the element of the block sub-matrix
  21.         // that is computed by the thread
  22.         float CSub[2] = { 0,0 };
  23.         // Loop over all the sub-matrices of A and B
  24.         // required to compute the block sub-matrix
  25.         for (int a = aBegin, b = bBegin;
  26.                 a <= aEnd;
  27.                 a += aStep, b += bStep)
  28.         {
  29.                 if (bx % 2 != 0)
  30.                 {
  31.                         break;
  32.                 }
  33.                 // printf("bx: %d, by: %d\n", bx, by);
  34.                 // Declaration of the shared memory array As used to
  35.                 // store the sub-matrix of A
  36.                 __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
  37.                 // Declaration of the shared memory array Bs used to
  38.                 // store the sub-matrix of B
  39.                 __shared__ float Bs[BLOCK_SIZE][2 * BLOCK_SIZE];
  40.                 // Load the matrices from device memory
  41.                 // to shared memory; each thread loads
  42.                 // 2 elements of each matrix
  43.                 As[ty][tx] = A[a + wA * ty + tx];
  44.                 Bs[ty][tx] = B[b + wB * ty + tx];
  45.                 Bs[ty][tx + BLOCK_SIZE] = B[b + wB * ty + tx + BLOCK_SIZE];
  46.                 // Synchronize to make sure the matrices are loaded
  47.                 __syncthreads();
  48.                 // Multiply the two matrices together;
  49.                 // each thread computes one element
  50.                 // of the block sub-matrix
  51. #pragma unroll
  52.                 for (int k = 0; k < BLOCK_SIZE; ++k)
  53.                 {
  54.                         CSub[0] += As[ty][k] * Bs[k][tx];
  55.                         CSub[1] += As[ty][k] * Bs[k][tx + BLOCK_SIZE];
  56.                 }
  57.  
  58.                 // Synchronize to make sure that the preceding
  59.                 // computation is done before loading two new
  60.                 // sub-matrices of A and B in the next iteration
  61.                 __syncthreads();
  62.         }
  63.         // Write the block sub-matrix to device memory;
  64.         // each thread writes one element
  65.         int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
  66.         C[c + wB * ty + tx] = CSub[0];
  67.         C[c + wB * ty + tx + BLOCK_SIZE] = CSub[1];
  68. }

Replies to Untitled rss

Title Name Language When
Re: Untitled Emerald Peafowl c 4 Years ago.