20
20
21
21
#include " zero.cc"
22
22
23
- template <typename T_in, typename T_out, int rowA, int colA, int colB>
23
+ template <typename T_in, typename T_out, int rowA, int colA, int colB,
24
+ bool b_row_maj = true , bool c_row_maj = true >
24
25
static inline void matmul_scalar (T_in *a, T_in *b, T_out *c) {
25
26
event0 ();
26
27
for (int row = 0 ; row < rowA; row++) {
27
28
for (int col = 0 ; col < colB; col++) {
28
29
T_out running_sum = 0 ;
29
30
for (int i = 0 ; i < colA; i++) {
30
- running_sum += a[row * colA + i] * b[i * colB + col];
31
+ T_in a_val = a[row * colA + i];
32
+ T_in b_val;
33
+ if constexpr (b_row_maj) {
34
+ b_val = b[i * colB + col];
35
+ } else {
36
+ b_val = b[i + col * colA];
37
+ }
38
+ running_sum += a_val * b_val;
39
+ }
40
+ T_out *c_ptr;
41
+ if constexpr (c_row_maj) {
42
+ c_ptr = &c[row * colB + col];
43
+ } else {
44
+ c_ptr = &c[row + col * rowA];
31
45
}
32
- c[row * colB + col] += running_sum;
46
+ *c_ptr += running_sum;
33
47
}
34
48
}
35
49
event1 ();
@@ -65,7 +79,7 @@ static inline void matmul_scalar(T_in *a, T_in *b, T_out *c) {
65
79
*/
66
80
template <typename T_in, typename T_out, unsigned rowA, unsigned colA,
67
81
unsigned colB, unsigned r, unsigned s, unsigned t,
68
- bool b_row_maj = true >
82
+ bool b_row_maj = true , bool c_row_maj = true >
69
83
static inline void matmul_vectorized_2x2_mmul (const T_in *__restrict pA,
70
84
const T_in *__restrict pB,
71
85
T_out *__restrict pC) {
@@ -76,14 +90,24 @@ static inline void matmul_vectorized_2x2_mmul(const T_in *__restrict pA,
76
90
77
91
for (unsigned z = 0 ; z < rowA; z += 2 )
78
92
chess_prepare_for_pipelining chess_loop_range (4 , ) {
79
- T_out *__restrict pC1 = pC + (z * colB) * MMUL::size_C;
80
- T_out *__restrict pC2 = pC + ((z + 1 ) * colB) * MMUL::size_C;
93
+
94
+ T_out *__restrict pC1;
95
+ T_out *__restrict pC2;
96
+ if constexpr (c_row_maj) {
97
+ pC1 = pC + (z * colB) * MMUL::size_C;
98
+ pC2 = pC + ((z + 1 ) * colB) * MMUL::size_C;
99
+ }
81
100
82
101
for (unsigned j = 0 ; j < colB; j += 2 )
83
102
#ifdef OPT_PERF_ENABLED
84
103
chess_flatten_loop
85
104
#endif
86
105
{
106
+
107
+ if constexpr (!c_row_maj) {
108
+ pC1 = pC + j * rowA * MMUL::size_C + z * MMUL::size_C;
109
+ pC2 = pC + (j + 1 ) * rowA * MMUL::size_C + z * MMUL::size_C;
110
+ }
87
111
const T_in *__restrict pA1 = pA + (z * colA) * MMUL::size_A;
88
112
const T_in *__restrict pA2 = pA + ((z + 1 ) * colA) * MMUL::size_A;
89
113
const T_in *__restrict pB1;
@@ -95,6 +119,7 @@ static inline void matmul_vectorized_2x2_mmul(const T_in *__restrict pA,
95
119
pB1 = pB + (j * colA) * MMUL::size_B;
96
120
pB2 = pB + ((j + 1 ) * colA) * MMUL::size_B;
97
121
}
122
+
98
123
aie::vector<T_in, MMUL::size_A> A0;
99
124
aie::vector<T_in, MMUL::size_A> A1;
100
125
aie::vector<T_in, MMUL::size_B> B0;
@@ -103,14 +128,23 @@ static inline void matmul_vectorized_2x2_mmul(const T_in *__restrict pA,
103
128
// Load partial results from C buffer for accumulation in-place. The
104
129
// zero.cc function handles the zeroing of data when a new
105
130
// accumulation is needed (after the 'K' reduction dimension)
106
- aie::vector<T_out, MMUL::size_C> acc_C00 =
107
- aie::load_v<MMUL::size_C>(pC1);
108
- aie::vector<T_out, MMUL::size_C> acc_C01 =
109
- aie::load_v<MMUL::size_C>(pC1 + MMUL::size_C);
110
- aie::vector<T_out, MMUL::size_C> acc_C10 =
111
- aie::load_v<MMUL::size_C>(pC2);
112
- aie::vector<T_out, MMUL::size_C> acc_C11 =
113
- aie::load_v<MMUL::size_C>(pC2 + MMUL::size_C);
131
+ aie::vector<T_out, MMUL::size_C> acc_C00;
132
+ aie::vector<T_out, MMUL::size_C> acc_C01;
133
+ aie::vector<T_out, MMUL::size_C> acc_C10;
134
+ aie::vector<T_out, MMUL::size_C> acc_C11;
135
+ if constexpr (c_row_maj) {
136
+ acc_C00 = aie::load_v<MMUL::size_C>(pC1);
137
+ acc_C01 = aie::load_v<MMUL::size_C>(pC1 + MMUL::size_C);
138
+ acc_C10 = aie::load_v<MMUL::size_C>(pC2);
139
+ acc_C11 = aie::load_v<MMUL::size_C>(pC2 + MMUL::size_C);
140
+ } else {
141
+ acc_C00 = aie::transpose (aie::load_v<MMUL::size_C>(pC1), t, r);
142
+ acc_C01 = aie::transpose (aie::load_v<MMUL::size_C>(pC2), t, r);
143
+ acc_C10 = aie::transpose (
144
+ aie::load_v<MMUL::size_C>(pC1 + MMUL::size_C), t, r);
145
+ acc_C11 = aie::transpose (
146
+ aie::load_v<MMUL::size_C>(pC2 + MMUL::size_C), t, r);
147
+ }
114
148
115
149
MMUL C00 (acc_C00);
116
150
MMUL C01 (acc_C01);
@@ -149,14 +183,30 @@ static inline void matmul_vectorized_2x2_mmul(const T_in *__restrict pA,
149
183
// example below shows how to shift right 10 bits
150
184
// #define SHIFT 10
151
185
// aie::store_v(pC1, C00.template to_vector<T_out>(SHIFT));
152
- aie::store_v (pC1, C00.template to_vector <T_out>());
153
- pC1 += MMUL::size_C;
154
- aie::store_v (pC1, C01.template to_vector <T_out>());
155
- pC1 += MMUL::size_C;
156
- aie::store_v (pC2, C10.template to_vector <T_out>());
157
- pC2 += MMUL::size_C;
158
- aie::store_v (pC2, C11.template to_vector <T_out>());
159
- pC2 += MMUL::size_C;
186
+
187
+ if constexpr (c_row_maj) {
188
+ aie::store_v (pC1, C00.template to_vector <T_out>());
189
+ pC1 += MMUL::size_C;
190
+ aie::store_v (pC1, C01.template to_vector <T_out>());
191
+ pC1 += MMUL::size_C;
192
+ aie::store_v (pC2, C10.template to_vector <T_out>());
193
+ pC2 += MMUL::size_C;
194
+ aie::store_v (pC2, C11.template to_vector <T_out>());
195
+ pC2 += MMUL::size_C;
196
+ } else {
197
+ aie::store_v (pC1,
198
+ aie::transpose (C00.template to_vector <T_out>(), r, t));
199
+ pC1 += MMUL::size_C;
200
+ aie::store_v (pC2,
201
+ aie::transpose (C01.template to_vector <T_out>(), r, t));
202
+ pC2 += MMUL::size_C;
203
+ aie::store_v (pC1,
204
+ aie::transpose (C10.template to_vector <T_out>(), r, t));
205
+ pC1 += MMUL::size_C;
206
+ aie::store_v (pC2,
207
+ aie::transpose (C11.template to_vector <T_out>(), r, t));
208
+ pC2 += MMUL::size_C;
209
+ }
160
210
}
161
211
}
162
212
@@ -169,6 +219,12 @@ constexpr bool is_b_row_maj = false;
169
219
constexpr bool is_b_row_maj = true ;
170
220
#endif
171
221
222
// Compile-time selection of the C (output) buffer layout: building with
// -DC_COL_MAJ switches the output matrix to column-major; the default is
// row-major. Mirrors the existing B_COL_MAJ / is_b_row_maj selector above.
#ifdef C_COL_MAJ
constexpr bool is_c_row_maj = false;
#else
constexpr bool is_c_row_maj = true;
#endif
227
+
172
228
// The following kernel definitions use mmul shapes that have been found to be
173
229
// optimal for AIE2P in combination with the 2x2 mmul expanded kernel.
174
230
//
@@ -195,7 +251,8 @@ static inline void matmul_vectorized_4x4x8_i16_i16(const int16 *__restrict pA,
195
251
static_assert (n % (2 * t) == 0 );
196
252
197
253
return matmul_vectorized_2x2_mmul<int16, int16, (m / r), (k / s), (n / t), r,
198
- s, t, is_b_row_maj>(pA, pB, pC);
254
+ s, t, is_b_row_maj, is_c_row_maj>(pA, pB,
255
+ pC);
199
256
}
200
257
201
258
template <unsigned m, unsigned k, unsigned n>
@@ -211,7 +268,8 @@ static inline void matmul_vectorized_4x4x8_i16_i32(const int16 *__restrict pA,
211
268
static_assert (n % (2 * t) == 0 );
212
269
213
270
return matmul_vectorized_2x2_mmul<int16, int32, (m / r), (k / s), (n / t), r,
214
- s, t, is_b_row_maj>(pA, pB, pC);
271
+ s, t, is_b_row_maj, is_c_row_maj>(pA, pB,
272
+ pC);
215
273
}
216
274
217
275
template <unsigned m, unsigned k, unsigned n>
@@ -228,7 +286,8 @@ matmul_vectorized_4x8x8_bf16_bf16(const bfloat16 *__restrict pA,
228
286
static_assert (n % (2 * t) == 0 );
229
287
230
288
return matmul_vectorized_2x2_mmul<bfloat16, bfloat16, (m / r), (k / s),
231
- (n / t), r, s, t, is_b_row_maj>(pA, pB, pC);
289
+ (n / t), r, s, t, is_b_row_maj,
290
+ is_c_row_maj>(pA, pB, pC);
232
291
}
233
292
234
293
// Note that this shape is only possible for bf16 when using bfp16 emulation
@@ -247,7 +306,8 @@ matmul_vectorized_8x8x8_bf16_bf16(const bfloat16 *__restrict pA,
247
306
static_assert (n % (2 * t) == 0 );
248
307
249
308
return matmul_vectorized_2x2_mmul<bfloat16, bfloat16, (m / r), (k / s),
250
- (n / t), r, s, t, is_b_row_maj>(pA, pB, pC);
309
+ (n / t), r, s, t, is_b_row_maj,
310
+ is_c_row_maj>(pA, pB, pC);
251
311
}
252
312
253
313
template <unsigned m, unsigned k, unsigned n>
@@ -264,7 +324,8 @@ matmul_vectorized_4x8x8_bf16_f32(const bfloat16 *__restrict pA,
264
324
static_assert (n % (2 * t) == 0 );
265
325
266
326
return matmul_vectorized_2x2_mmul<bfloat16, float , (m / r), (k / s), (n / t),
267
- r, s, t, is_b_row_maj>(pA, pB, pC);
327
+ r, s, t, is_b_row_maj, is_c_row_maj>(pA, pB,
328
+ pC);
268
329
}
269
330
270
331
template <unsigned m, unsigned k, unsigned n>
@@ -281,7 +342,7 @@ matmul_vectorized_8x8x8_bf16_f32(const bfloat16 *__restrict pA,
281
342
static_assert (n % (2 * t) == 0 );
282
343
283
344
return matmul_vectorized_2x2_mmul<bfloat16, float , (m / r), (k / s), (n / t),
284
- r, s, t, is_b_row_maj>(pA, pB, pC);
345
+ r, s, t, is_b_row_maj, is_c_row_maj >(pA, pB, pC);
285
346
}
286
347
287
348
template <unsigned m, unsigned k, unsigned n>
@@ -297,7 +358,7 @@ static inline void matmul_vectorized_8x8x8_i8_i8(const int8 *__restrict pA,
297
358
static_assert (n % (2 * t) == 0 );
298
359
299
360
return matmul_vectorized_2x2_mmul<int8, int8, (m / r), (k / s), (n / t), r, s,
300
- t, is_b_row_maj>(pA, pB, pC);
361
+ t, is_b_row_maj, is_c_row_maj >(pA, pB, pC);
301
362
}
302
363
303
364
template <unsigned m, unsigned k, unsigned n>
@@ -313,7 +374,8 @@ static inline void matmul_vectorized_8x8x8_i8_i16(const int8 *__restrict pA,
313
374
static_assert (n % (2 * t) == 0 );
314
375
315
376
return matmul_vectorized_2x2_mmul<int8, int16, (m / r), (k / s), (n / t), r,
316
- s, t, is_b_row_maj>(pA, pB, pC);
377
+ s, t, is_b_row_maj, is_c_row_maj>(pA, pB,
378
+ pC);
317
379
}
318
380
319
381
template <unsigned m, unsigned k, unsigned n>
@@ -329,7 +391,8 @@ static inline void matmul_vectorized_8x8x8_i8_i32(const int8 *__restrict pA,
329
391
static_assert (n % (2 * t) == 0 );
330
392
331
393
return matmul_vectorized_2x2_mmul<int8, int32, (m / r), (k / s), (n / t), r,
332
- s, t, is_b_row_maj>(pA, pB, pC);
394
+ s, t, is_b_row_maj, is_c_row_maj>(pA, pB,
395
+ pC);
333
396
}
334
397
335
398
extern " C" {
@@ -418,7 +481,7 @@ extern "C" {
418
481
r, s, t) \
419
482
void matmul_scalar_##mlir_type_in##_##mlir_type_out( \
420
483
ctype_in *a_in, ctype_in *b_in, ctype_out *c_out) { \
421
- matmul_scalar<ctype_in, ctype_out, DIM_M, DIM_K, DIM_N>(a_in, b_in, \
484
+ matmul_scalar<ctype_in, ctype_out, DIM_M, DIM_K, DIM_N, is_b_row_maj, is_c_row_maj >(a_in, b_in, \
422
485
c_out); \
423
486
}
424
487
0 commit comments