/** * Copyright (c) 2019 MIT License by 6.172 Staff * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. **/ #include #include #include #include #include #include "./fasttime.h" // Definition of the floating-point element type of the matrices. typedef double el_t; typedef float vfloat_t __attribute__((__vector_size__(32))); // Base case of the matrix multiplication. This base case computes a // product between two size x size tiles of elements in A and B, and // stores the result in C. This code assumes that matrices A, B, and // C are all in row-major order, and the variable row_length stores // the length of a row in the input matrices A, B, or C. void matmul_base(el_t *restrict C, const el_t *restrict A, const el_t *restrict B, int row_length, int size) { // TODO: Modify this routine to implement the outer-product base // case, using the GCC vector extension. int s = sizeof(vfloat_t)/sizeof(float); vfloat_t a_vec, b_vec, c_vec; #pragma clang loop vectorize(enable) interleave(enable) for (int k = 0; k < size; k+=s) { for (int i = 0; i < size; ++i) { for (int j = 0; j < size; ++j) { for (int e = 0; e < s; ++e){ a_vec[e] = A[i*row_length + k+e]; b_vec[e] = B[(k+e)*row_length + j]; c_vec[e]=0; } c_vec += a_vec * b_vec; for (int e = 0; e < s; ++e){ C[i*row_length + j] += c_vec[e]; } } } } } // Helper method to check if two given floating-point values are close // enough. static bool close_enough(el_t x, el_t y) { // Canonicalize the input x = (x < 0) ? -x : x; y = (y < 0) ? -y : y; if (x < y) { el_t tmp = x; x = y; y = tmp; } el_t diff = x - y; assert(diff >= 0 && "Invalid difference"); return (diff < 0.01) || (diff / x < 0.00001); } // Check that the given n-by-n matrix C is the matrix product of the // n-by-n matrices A and B. We prevent inlining on this function, // because it is not performance critical and preventing inlining // makes it easier to interpret perf results. __attribute__((noinline)) static bool check_correctness(const el_t *restrict C, const el_t *restrict A, const el_t *restrict B, int n) { bool passed = true; fprintf(stderr, "Checking correctness.n"); // Multiply the matrices in the naive way. el_t *Ctmp = (el_t *)malloc(n * n * sizeof(el_t)); for (int i = 0; i < n; ++i) { for (int j = 0; j < n; ++j) { Ctmp[i*n + j] = 0; } } for (int i = 0; i < n; ++i) { for (int j = 0; j < n; ++j) { for (int k = 0; k < n; ++k) { Ctmp[i*n + j] += A[i*n + k] * B[k*n + j]; } } } for (int i = 0; i < n; ++i) { for (int j = 0; j < n; ++j) { if (!close_enough(C[i*n + j], Ctmp[i*n + j])) { passed = false; fprintf(stderr, "Unexpected value found in matrix product: " " C[%d,%d] = %f, expected %fn", i, j, C[i*n + j], Ctmp[i*n + j]); } } } if (!passed) fprintf(stderr, "FAILED correctness check.n"); else fprintf(stderr, "PASSED correctness check.n"); free(Ctmp); return passed; } int main(int argc, char *argv[]) { // By default, operate on 1024x1024 matrices, but allow an argument // to specify the log_2 of another square matrix size. int lg_n = 10; if (argc > 1) lg_n = atoi(argv[1]); // Dimensions of the matrices. const int n = 1 << lg_n; // Size of the base case. const int size = (n > 32) ? 32 : n; assert(n % size == 0 && "Matrix size is not a multiple of the base-case size."); // Allocate the matrices. el_t *A = (el_t *)malloc(n * n * sizeof(el_t)); el_t *B = (el_t *)malloc(n * n * sizeof(el_t)); el_t *C = (el_t *)malloc(n * n * sizeof(el_t)); // Initialize the matrices. for (int i = 0; i < n; ++i) { for (int j = 0; j < n; ++j) { A[i*n + j] = (el_t)rand() / (el_t)RAND_MAX; B[i*n + j] = (el_t)rand() / (el_t)RAND_MAX; C[i*n + j] = 0; } } // Multiply the matrices. fasttime_t start = gettime(); for (int i = 0; i < n; i += size) { for (int k = 0; k < n; k += size) { for (int j = 0; j < n; j += size) { matmul_base(&C[i*n + j], &A[i*n + k], &B[k*n + j], n, size); } } } fasttime_t end = gettime(); // Report the running time. double elapsed = tdiff(start, end); printf("Running time: %f secn", elapsed); // Check the result. bool passed = check_correctness(C, A, B, n); // Free the memory of the matrices. free(A); free(B); free(C); // Return 0 if the matrix multiplcation succeeded, 1 otherwise. return !passed; }