Facebook
From Cream Iguana, 2 Years ago, written in Plain Text.
Embed
Download Paste or View Raw
Hits: 107
  1. /**
  2.  * Copyright (c) 2019 MIT License by 6.172 Staff
  3.  *
  4.  * Permission is hereby granted, free of charge, to any person obtaining a copy
  5.  * of this software and associated documentation files (the "Software"), to
  6.  * deal in the Software without restriction, including without limitation the
  7.  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  8.  * sell copies of the Software, and to permit persons to whom the Software is
  9.  * furnished to do so, subject to the following conditions:
  10.  *
  11.  * The above copyright notice and this permission notice shall be included in
  12.  * all copies or substantial portions of the Software.
  13.  *
  14.  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15.  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16.  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  17.  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  18.  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  19.  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  20.  * IN THE SOFTWARE.
  21.  **/
  22.  
  23. #include <stdbool.h>
  24. #include <stdint.h>
  25. #include <stdlib.h>
  26. #include <stdio.h>
  27. #include <math.h>
  28.  
  29. #include "./fasttime.h"
  30.  
  31. // Definition of the floating-point element type of the matrices.
  32. typedef double el_t;
  33. typedef float vfloat_t __attribute__((__vector_size__(32)));
  34.  
  35. // Base case of the matrix multiplication.  This base case computes a
  36. // product between two size x size tiles of elements in A and B, and
  37. // stores the result in C.  This code assumes that matrices A, B, and
  38. // C are all in row-major order, and the variable row_length stores
  39. // the length of a row in the input matrices A, B, or C.
  40. void matmul_base(el_t *restrict C, const el_t *restrict A,
  41.                  const el_t *restrict B, int row_length, int size) {
  42.   // TODO: Modify this routine to implement the outer-product base
  43.   // case, using the GCC vector extension.
  44.   int s = sizeof(vfloat_t)/sizeof(float);
  45.   vfloat_t a_vec, b_vec, c_vec;
  46.   #pragma clang loop vectorize(enable) interleave(enable)
  47.   for (int k = 0; k < size; k+=s) {
  48.     for (int i = 0; i < size; ++i) {
  49.         for (int j = 0; j < size; ++j) {
  50.      
  51.         for (int e = 0; e < s; ++e){
  52.             a_vec[e] = A[i*row_length + k+e];
  53.             b_vec[e] = B[(k+e)*row_length + j];
  54.             c_vec[e]=0;
  55.         }
  56.         c_vec += a_vec * b_vec;
  57.         for (int e = 0; e < s; ++e){
  58.             C[i*row_length + j] += c_vec[e];
  59.         }
  60.       }
  61.     }
  62.   }
  63. }
  64.  
  65. // Helper method to check if two given floating-point values are close
  66. // enough.
  67. static bool close_enough(el_t x, el_t y) {
  68.   // Canonicalize the input
  69.   x = (x < 0) ? -x : x;
  70.   y = (y < 0) ? -y : y;
  71.   if (x < y) {
  72.     el_t tmp = x;
  73.     x = y;
  74.     y = tmp;
  75.   }
  76.  
  77.   el_t diff = x - y;
  78.   assert(diff >= 0 && "Invalid difference");
  79.   return (diff < 0.01) || (diff / x < 0.00001);
  80. }
  81.  
  82. // Check that the given n-by-n matrix C is the matrix product of the
  83. // n-by-n matrices A and B.  We prevent inlining on this function,
  84. // because it is not performance critical and preventing inlining
  85. // makes it easier to interpret perf results.
  86. __attribute__((noinline))
  87. static bool check_correctness(const el_t *restrict C, const el_t *restrict A,
  88.                               const el_t *restrict B, int n) {
  89.   bool passed = true;
  90.   fprintf(stderr, "Checking correctness.n");
  91.   // Multiply the matrices in the naive way.
  92.   el_t *Ctmp = (el_t *)malloc(n * n * sizeof(el_t));
  93.   for (int i = 0; i < n; ++i) {
  94.     for (int j = 0; j < n; ++j) {
  95.       Ctmp[i*n + j] = 0;
  96.     }
  97.   }
  98.   for (int i = 0; i < n; ++i) {
  99.     for (int j = 0; j < n; ++j) {
  100.       for (int k = 0; k < n; ++k) {
  101.         Ctmp[i*n + j] += A[i*n + k] * B[k*n + j];
  102.       }
  103.     }
  104.   }
  105.   for (int i = 0; i < n; ++i) {
  106.     for (int j = 0; j < n; ++j) {
  107.       if (!close_enough(C[i*n + j], Ctmp[i*n + j])) {
  108.         passed = false;
  109.         fprintf(stderr, "Unexpected value found in matrix product: "
  110.                 "  C[%d,%d] = %f, expected %fn",
  111.                 i, j, C[i*n + j], Ctmp[i*n + j]);
  112.       }
  113.     }
  114.   }
  115.   if (!passed)
  116.     fprintf(stderr, "FAILED correctness check.n");
  117.   else
  118.     fprintf(stderr, "PASSED correctness check.n");
  119.  
  120.   free(Ctmp);
  121.   return passed;
  122. }
  123.  
  124. int main(int argc, char *argv[]) {
  125.   // By default, operate on 1024x1024 matrices, but allow an argument
  126.   // to specify the log_2 of another square matrix size.
  127.   int lg_n = 10;
  128.   if (argc > 1)
  129.     lg_n = atoi(argv[1]);
  130.  
  131.   // Dimensions of the matrices.
  132.   const int n = 1 << lg_n;
  133.   // Size of the base case.
  134.   const int size = (n > 32) ? 32 : n;
  135.   assert(n % size == 0 &&
  136.          "Matrix size is not a multiple of the base-case size.");
  137.  
  138.   // Allocate the matrices.
  139.   el_t *A = (el_t *)malloc(n * n * sizeof(el_t));
  140.   el_t *B = (el_t *)malloc(n * n * sizeof(el_t));
  141.   el_t *C = (el_t *)malloc(n * n * sizeof(el_t));
  142.  
  143.   // Initialize the matrices.
  144.   for (int i = 0; i < n; ++i) {
  145.     for (int j = 0; j < n; ++j) {
  146.       A[i*n + j] = (el_t)rand() / (el_t)RAND_MAX;
  147.       B[i*n + j] = (el_t)rand() / (el_t)RAND_MAX;
  148.       C[i*n + j] = 0;
  149.     }
  150.   }
  151.  
  152.   // Multiply the matrices.
  153.   fasttime_t start = gettime();
  154.   for (int i = 0; i < n; i += size) {
  155.     for (int k = 0; k < n; k += size) {
  156.       for (int j = 0; j < n; j += size) {
  157.         matmul_base(&C[i*n + j], &A[i*n + k], &B[k*n + j], n, size);
  158.       }
  159.     }
  160.   }
  161.   fasttime_t end = gettime();
  162.  
  163.   // Report the running time.
  164.   double elapsed = tdiff(start, end);
  165.   printf("Running time: %f secn", elapsed);
  166.  
  167.   // Check the result.
  168.   bool passed = check_correctness(C, A, B, n);
  169.  
  170.   // Free the memory of the matrices.
  171.   free(A);
  172.   free(B);
  173.   free(C);
  174.  
  175.   // Return 0 if the matrix multiplcation succeeded, 1 otherwise.
  176.   return !passed;
  177. }