- /**
- * Copyright (c) 2019 MIT License by 6.172 Staff
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- **/
- #include <stdbool.h>
- #include <stdint.h>
- #include <stdlib.h>
- #include <stdio.h>
- #include <math.h>
- #include "./fasttime.h"
- // Definition of the floating-point element type of the matrices.
- typedef double el_t;
- typedef float vfloat_t __attribute__((__vector_size__(32)));
- // Base case of the matrix multiplication. This base case computes a
- // product between two size x size tiles of elements in A and B, and
- // stores the result in C. This code assumes that matrices A, B, and
- // C are all in row-major order, and the variable row_length stores
- // the length of a row in the input matrices A, B, or C.
- void matmul_base(el_t *restrict C, const el_t *restrict A,
- const el_t *restrict B, int row_length, int size) {
- // TODO: Modify this routine to implement the outer-product base
- // case, using the GCC vector extension.
- int s = sizeof(vfloat_t)/sizeof(float);
- vfloat_t a_vec, b_vec, c_vec;
- #pragma clang loop vectorize(enable) interleave(enable)
- for (int k = 0; k < size; k+=s) {
- for (int i = 0; i < size; ++i) {
- for (int j = 0; j < size; ++j) {
- for (int e = 0; e < s; ++e){
- a_vec[e] = A[i*row_length + k+e];
- b_vec[e] = B[(k+e)*row_length + j];
- c_vec[e]=0;
- }
- c_vec += a_vec * b_vec;
- for (int e = 0; e < s; ++e){
- C[i*row_length + j] += c_vec[e];
- }
- }
- }
- }
- }
- // Helper method to check if two given floating-point values are close
- // enough.
- static bool close_enough(el_t x, el_t y) {
- // Canonicalize the input
- x = (x < 0) ? -x : x;
- y = (y < 0) ? -y : y;
- if (x < y) {
- el_t tmp = x;
- x = y;
- y = tmp;
- }
- el_t diff = x - y;
- assert(diff >= 0 && "Invalid difference");
- return (diff < 0.01) || (diff / x < 0.00001);
- }
- // Check that the given n-by-n matrix C is the matrix product of the
- // n-by-n matrices A and B. We prevent inlining on this function,
- // because it is not performance critical and preventing inlining
- // makes it easier to interpret perf results.
- __attribute__((noinline))
- static bool check_correctness(const el_t *restrict C, const el_t *restrict A,
- const el_t *restrict B, int n) {
- bool passed = true;
- fprintf(stderr, "Checking correctness.n");
- // Multiply the matrices in the naive way.
- el_t *Ctmp = (el_t *)malloc(n * n * sizeof(el_t));
- for (int i = 0; i < n; ++i) {
- for (int j = 0; j < n; ++j) {
- Ctmp[i*n + j] = 0;
- }
- }
- for (int i = 0; i < n; ++i) {
- for (int j = 0; j < n; ++j) {
- for (int k = 0; k < n; ++k) {
- Ctmp[i*n + j] += A[i*n + k] * B[k*n + j];
- }
- }
- }
- for (int i = 0; i < n; ++i) {
- for (int j = 0; j < n; ++j) {
- if (!close_enough(C[i*n + j], Ctmp[i*n + j])) {
- passed = false;
- fprintf(stderr, "Unexpected value found in matrix product: "
- " C[%d,%d] = %f, expected %fn",
- i, j, C[i*n + j], Ctmp[i*n + j]);
- }
- }
- }
- if (!passed)
- fprintf(stderr, "FAILED correctness check.n");
- else
- fprintf(stderr, "PASSED correctness check.n");
- free(Ctmp);
- return passed;
- }
- int main(int argc, char *argv[]) {
- // By default, operate on 1024x1024 matrices, but allow an argument
- // to specify the log_2 of another square matrix size.
- int lg_n = 10;
- if (argc > 1)
- lg_n = atoi(argv[1]);
- // Dimensions of the matrices.
- const int n = 1 << lg_n;
- // Size of the base case.
- const int size = (n > 32) ? 32 : n;
- assert(n % size == 0 &&
- "Matrix size is not a multiple of the base-case size.");
- // Allocate the matrices.
- el_t *A = (el_t *)malloc(n * n * sizeof(el_t));
- el_t *B = (el_t *)malloc(n * n * sizeof(el_t));
- el_t *C = (el_t *)malloc(n * n * sizeof(el_t));
- // Initialize the matrices.
- for (int i = 0; i < n; ++i) {
- for (int j = 0; j < n; ++j) {
- A[i*n + j] = (el_t)rand() / (el_t)RAND_MAX;
- B[i*n + j] = (el_t)rand() / (el_t)RAND_MAX;
- C[i*n + j] = 0;
- }
- }
- // Multiply the matrices.
- fasttime_t start = gettime();
- for (int i = 0; i < n; i += size) {
- for (int k = 0; k < n; k += size) {
- for (int j = 0; j < n; j += size) {
- matmul_base(&C[i*n + j], &A[i*n + k], &B[k*n + j], n, size);
- }
- }
- }
- fasttime_t end = gettime();
- // Report the running time.
- double elapsed = tdiff(start, end);
- printf("Running time: %f secn", elapsed);
- // Check the result.
- bool passed = check_correctness(C, A, B, n);
- // Free the memory of the matrices.
- free(A);
- free(B);
- free(C);
- // Return 0 if the matrix multiplcation succeeded, 1 otherwise.
- return !passed;
- }