/**
 * Copyright (c) 2019 MIT License by 6.172 Staff
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 **/

#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <math.h>

#include "./fasttime.h"

// Definition of the floating-point element type of the matrices.
typedef double el_t;
typedef float vfloat_t __attribute__((__vector_size__(32)));

// Base case of the matrix multiplication.  This base case computes a
// product between two size x size tiles of elements in A and B, and
// stores the result in C.  This code assumes that matrices A, B, and
// C are all in row-major order, and the variable row_length stores
// the length of a row in the input matrices A, B, or C.
void matmul_base(el_t *restrict C, const el_t *restrict A,
                 const el_t *restrict B, int row_length, int size) {
  // TODO: Modify this routine to implement the outer-product base
  // case, using the GCC vector extension.
  int s = sizeof(vfloat_t)/sizeof(float);
  vfloat_t a_vec, b_vec, c_vec;
  #pragma clang loop vectorize(enable) interleave(enable)
  for (int k = 0; k < size; k+=s) {
    for (int i = 0; i < size; ++i) {
        for (int j = 0; j < size; ++j) {
      
        for (int e = 0; e < s; ++e){
            a_vec[e] = A[i*row_length + k+e];
            b_vec[e] = B[(k+e)*row_length + j];
            c_vec[e]=0;
        }
        c_vec += a_vec * b_vec;
        for (int e = 0; e < s; ++e){
            C[i*row_length + j] += c_vec[e];
        }
      }
    }
  }
}

// Helper method to check if two given floating-point values are close
// enough.
static bool close_enough(el_t x, el_t y) {
  // Canonicalize the input
  x = (x < 0) ? -x : x;
  y = (y < 0) ? -y : y;
  if (x < y) {
    el_t tmp = x;
    x = y;
    y = tmp;
  }

  el_t diff = x - y;
  assert(diff >= 0 && "Invalid difference");
  return (diff < 0.01) || (diff / x < 0.00001);
}

// Check that the given n-by-n matrix C is the matrix product of the
// n-by-n matrices A and B.  We prevent inlining on this function,
// because it is not performance critical and preventing inlining
// makes it easier to interpret perf results.
__attribute__((noinline))
static bool check_correctness(const el_t *restrict C, const el_t *restrict A,
                              const el_t *restrict B, int n) {
  bool passed = true;
  fprintf(stderr, "Checking correctness.n");
  // Multiply the matrices in the naive way.
  el_t *Ctmp = (el_t *)malloc(n * n * sizeof(el_t));
  for (int i = 0; i < n; ++i) {
    for (int j = 0; j < n; ++j) {
      Ctmp[i*n + j] = 0;
    }
  }
  for (int i = 0; i < n; ++i) {
    for (int j = 0; j < n; ++j) {
      for (int k = 0; k < n; ++k) {
        Ctmp[i*n + j] += A[i*n + k] * B[k*n + j];
      }
    }
  }
  for (int i = 0; i < n; ++i) {
    for (int j = 0; j < n; ++j) {
      if (!close_enough(C[i*n + j], Ctmp[i*n + j])) {
        passed = false;
        fprintf(stderr, "Unexpected value found in matrix product: "
                "  C[%d,%d] = %f, expected %fn",
                i, j, C[i*n + j], Ctmp[i*n + j]);
      }
    }
  }
  if (!passed)
    fprintf(stderr, "FAILED correctness check.n");
  else
    fprintf(stderr, "PASSED correctness check.n");

  free(Ctmp);
  return passed;
}

int main(int argc, char *argv[]) {
  // By default, operate on 1024x1024 matrices, but allow an argument
  // to specify the log_2 of another square matrix size.
  int lg_n = 10;
  if (argc > 1)
    lg_n = atoi(argv[1]);

  // Dimensions of the matrices.
  const int n = 1 << lg_n;
  // Size of the base case.
  const int size = (n > 32) ? 32 : n;
  assert(n % size == 0 &&
         "Matrix size is not a multiple of the base-case size.");

  // Allocate the matrices.
  el_t *A = (el_t *)malloc(n * n * sizeof(el_t));
  el_t *B = (el_t *)malloc(n * n * sizeof(el_t));
  el_t *C = (el_t *)malloc(n * n * sizeof(el_t));

  // Initialize the matrices.
  for (int i = 0; i < n; ++i) {
    for (int j = 0; j < n; ++j) {
      A[i*n + j] = (el_t)rand() / (el_t)RAND_MAX;
      B[i*n + j] = (el_t)rand() / (el_t)RAND_MAX;
      C[i*n + j] = 0;
    }
  }

  // Multiply the matrices.
  fasttime_t start = gettime();
  for (int i = 0; i < n; i += size) {
    for (int k = 0; k < n; k += size) {
      for (int j = 0; j < n; j += size) {
        matmul_base(&C[i*n + j], &A[i*n + k], &B[k*n + j], n, size);
      }
    }
  }
  fasttime_t end = gettime();

  // Report the running time.
  double elapsed = tdiff(start, end);
  printf("Running time: %f secn", elapsed);

  // Check the result.
  bool passed = check_correctness(C, A, B, n);

  // Free the memory of the matrices.
  free(A);
  free(B);
  free(C);

  // Return 0 if the matrix multiplcation succeeded, 1 otherwise.
  return !passed;
}