NERSC · hcq9102 · Dec 1, 2023 · Dec 2, 2023 · Dec 2, 2023 · Dec 4, 2023
diff --git a/apps/choleskey/CMakeLists.txt b/apps/choleskey/CMakeLists.txt
@@ -21,3 +21,11 @@ target_include_directories(
 #   choleskey_stdpar_snd
 #   PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include
 #           ${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR})
+
+#tiled cholesky
+add_executable(cholesky_tiled cholesky_tiled.cpp)
+target_link_libraries(cholesky_tiled PRIVATE hpcpp-core blas lapack -fortranlibs gfortran)
+target_include_directories(
+  cholesky_tiled
+  PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include
+          ${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR} ${OPENBLAS_DIR}/include/openblas ${OPENBLAS_DIR}/lib64/cmake/OpenBLAS)
diff --git a/apps/choleskey/cholesky_tiled.cpp b/apps/choleskey/cholesky_tiled.cpp
@@ -0,0 +1,149 @@
+/*
+1. Aim to implememt task-graph based Cholesky decomposition: tiled Cholesky decomposition with OpenMP tasks.
+    references:
+    >Buttari, Alfredo, et al. "A class of parallel tiled linear algebra algorithms for multicore architectures." Parallel Computing 35.1 (2009): 38-53.
+    >Dorris, Joseph, et al. "Task-based Cholesky decomposition on knights corner using OpenMP." High Performance Computing: ISC High Performance 2016 International Workshops, ExaComm, E-MuCoCoS, HPC-IODC, IXPUG, IWOPH, P^ 3MA, VHPC, WOPSSS, Frankfurt, Germany, June 19–23, 2016, Revised Selected Papers 31. Springer International Publishing, 2016.)
+
+2. Therefore, the first step is to implement tiled Cholesky decomposition algorithm.
+
+3. This file is to implement tiled Cholesky decomposition algorithm. 
+    reference the implementation from Intel open source project, hetero-streams which will no longer be maintained by Intel.
+    https://github.com/intel/hetero-streams/tree/master/ref_code/cholesky
+
+4. Additionally, include openblas library when build:
+   or $ export OPENBLAS_DIR=/openblas/path
+*/
+
+#include <bits/stdc++.h>
+#include <cblas.h>
+#include <lapacke.h>
+#include <omp.h>
+#include <algorithm>
+#include <cmath>
+#include <cstdlib>
+#include <cstring>
+#include "tiled_cholesky_help.hpp"
+
+using data_type = double;
+
+void tiled_cholesky(double* matrix_split[], const int tile_size, const int num_tiles, CBLAS_ORDER blasLay,
+                    const int lapackLay) {
+    unsigned int m, n, k;
+
+    for (k = 0; k < num_tiles; ++k) {
+        //POTRF
+        int info = LAPACKE_dpotrf(LAPACK_ROW_MAJOR, 'L', tile_size, matrix_split[k * num_tiles + k], tile_size);
+
+        for (m = k + 1; m < num_tiles; ++m) {
+            //DTRSM
+            cblas_dtrsm(blasLay, CblasRight, CblasLower, CblasTrans, CblasNonUnit, tile_size, tile_size, 1.0,
+                        matrix_split[k * num_tiles + k], tile_size, matrix_split[m * num_tiles + k], tile_size);
+        }
+
+        for (n = k + 1; n < num_tiles; ++n) {
+            //DSYRK
+            cblas_dsyrk(blasLay, CblasLower, CblasNoTrans, tile_size, tile_size, -1.0, matrix_split[n * num_tiles + k],
+                        tile_size, 1.0, matrix_split[n * num_tiles + n], tile_size);
+
+            for (m = n + 1; m < num_tiles; ++m) {
+                //DGEMM
+                cblas_dgemm(blasLay, CblasNoTrans, CblasTrans, tile_size, tile_size, tile_size, -1.0,
+                            matrix_split[m * num_tiles + k], tile_size, matrix_split[n * num_tiles + k], tile_size, 1.0,
+                            matrix_split[m * num_tiles + n], tile_size);
+            }
+        }
+    }
+}
+
+int main(int argc, char** argv) {
+
+    // TODO : introduce args
+    int matrix_size, num_tiles, tile_size, tot_tiles;
+    matrix_size = 40;  // must be an input
+    num_tiles = 2;     // matrix size MUST be divisible
+    int verifycorrectness = 1;
+    bool layRow = true;
+
+    fmt::print("matrix_size = {}, num_tiles = {}\n\n", matrix_size, num_tiles);
+
+    // Check mat_size is divisible by num_tiles
+    if (matrix_size % num_tiles != 0) {
+        fmt::print("matrix size must be divisible by num_tiles.. aborting\n");
+        throw std::invalid_argument("Matrix size is not divisible by num_tiles");
+    }
+
+    if (matrix_size == 0) {
+        fmt::print(" 0 illegal input matrix_size \n");
+        std::exit(0);
+    }
+
+    tile_size = matrix_size / num_tiles;
+    tot_tiles = num_tiles * num_tiles;
+
+    double* A = new double[matrix_size * matrix_size];
+
+    // Allocate memory for tiled_cholesky for the full matrix
+    double* A_lower = new double[matrix_size * matrix_size];
+
+    // Allocate memory for MKL cholesky for the full matrix
+    double* A_MKL = new double[matrix_size * matrix_size];
+
+    // Memory allocation for tiled matrix
+    double** Asplit = new double*[tot_tiles];
+
+    for (int i = 0; i < tot_tiles; ++i) {
+        // Buffer per tile
+        Asplit[i] = new double[tile_size * tile_size];
+    }
+
+    //Generate a symmetric positve-definite matrix
+    A = generate_positiveDefinitionMatrix(matrix_size);
+
+    //printMatrix(A, mat_size_m);
+
+    CBLAS_ORDER blasLay;
+    int lapackLay;
+
+    if (layRow) {
+        blasLay = CblasRowMajor;
+        lapackLay = LAPACK_ROW_MAJOR;
+    } else {
+        blasLay = CblasColMajor;
+        lapackLay = LAPACK_COL_MAJOR;
+    }
+
+    //copying matrices into separate variables for tiled cholesky (A_lower) and MKL cholesky (A_MKL)
+    std::memcpy(A_lower, A, matrix_size * matrix_size * sizeof(double));
+    std::memcpy(A_MKL, A, matrix_size * matrix_size * sizeof(double));
+
+    //splits the input matrix into tiles
+    split_into_tiles(A_lower, Asplit, num_tiles, tile_size, matrix_size, layRow);
+
+    //run the tiled Cholesky function
+    tiled_cholesky(Asplit, tile_size, num_tiles, blasLay, lapackLay);
+
+    //assembling seperated tiles back into full matrix
+    assemble_tiles(Asplit, A_lower, num_tiles, tile_size, matrix_size, layRow);
+
+    //calling LAPACKE_dpotrf cholesky for verification and timing comparison
+    int info = LAPACKE_dpotrf(lapackLay, 'L', matrix_size, A_MKL, matrix_size);
+
+    if (verifycorrectness == 1) {
+        bool res = verify_results(A_lower, A_MKL, matrix_size * matrix_size);
+        if (res) {
+            fmt::print("Tiled Cholesky decomposition successful\n");
+        } else {
+            fmt::print("Tiled Cholesky decomposition failed\n");
+        }
+    }
+
+    //memory free
+    delete[] A;
+    delete[] A_lower;
+    delete[] A_MKL;
+    for (int i = 0; i < tot_tiles; ++i) {
+        delete[] Asplit[i];
+    }
+
+    return 0;
+}
diff --git a/apps/choleskey/tiled_cholesky_help.hpp b/apps/choleskey/tiled_cholesky_help.hpp
@@ -0,0 +1,125 @@
+#pragma once
+
+#include <cblas.h>
+#include <lapacke.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <cstdlib>
+#include <fstream>
+#include <sstream>
+#include "commons.hpp"
+
+double* generate_positiveDefinitionMatrix(const size_t matrix_size) {
+    double* A_matrix = new double[matrix_size * matrix_size];
+    double* pd_matrix = (double*)malloc(matrix_size * matrix_size * sizeof(double));
+    unsigned int seeds = matrix_size;
+
+    // generate a random symmetric matrix
+    for (size_t row = 0; row < matrix_size; ++row) {
+        for (size_t col = row; col < matrix_size; ++col) {
+            A_matrix[col * matrix_size + row] = A_matrix[row * matrix_size + col] = (double)rand_r(&seeds) / RAND_MAX;
+        }
+    }
+    // compute the product of matrix A_matrix and its transpose, and storing the result in pd_matrix.
+    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans, matrix_size, matrix_size, matrix_size, 1.0, A_matrix,
+                matrix_size, A_matrix, matrix_size, 0.0, pd_matrix, matrix_size);
+
+    // Adjust Diagonals
+    for (size_t row = 0; row < matrix_size; ++row) {
+        double diagonals = 1.0;  // from 1.0
+        for (size_t col = 0; col < matrix_size; ++col) {
+            diagonals += pd_matrix[row * matrix_size + col];
+        }
+        // Set the diag entry
+        pd_matrix[row * matrix_size + row] = diagonals;
+    }
+
+    delete[] A_matrix;
+    return pd_matrix;
+}
+
+void split_into_tiles(const double* matrix, double* matrix_split[], const int num_tiles, const int tile_size,
+                      const int size, bool layRow) {
+
+    int total_num_tiles = num_tiles * num_tiles;
+    int offset_tile;
+
+    //#pragma omp parallel for private(i, j, offset_tile) schedule(auto)
+    for (int i_tile = 0; i_tile < total_num_tiles; ++i_tile) {
+        if (layRow) {
+            offset_tile =
+                int(i_tile / num_tiles) * num_tiles * tile_size * tile_size + int(i_tile % num_tiles) * tile_size;
+        } else {
+            offset_tile =
+                int(i_tile % num_tiles) * num_tiles * tile_size * tile_size + int(i_tile / num_tiles) * tile_size;
+        }
+
+        for (int i = 0; i < tile_size; ++i)
+            //#pragma simd
+            for (int j = 0; j < tile_size; ++j) {
+                matrix_split[i_tile][i * tile_size + j] = matrix[offset_tile + i * size + j];
+            }
+    }
+}
+
+void assemble_tiles(double* matrix_split[], double* matrix, const int num_tiles, const int tile_size, const int size,
+                    bool layRow) {
+    int i_tile, j_tile, tile, i_local, j_local;
+    //#pragma omp parallel for private(j, i_local, j_local, i_tile, j_tile, tile) \
+    schedule(auto)
+    for (int i = 0; i < size; ++i) {
+        i_local = int(i % tile_size);
+        i_tile = int(i / tile_size);
+        //#pragma simd private(j_tile, tile, j_local)
+        for (int j = 0; j < size; ++j) {
+            j_tile = int(j / tile_size);
+            if (layRow) {
+                tile = i_tile * num_tiles + j_tile;
+            } else {
+                tile = j_tile * num_tiles + i_tile;
+            }
+            j_local = int(j % tile_size);
+            matrix[i * size + j] = matrix_split[tile][i_local * tile_size + j_local];
+        }
+    }
+}
+
+bool verify_results(const double* lower_res, const double* dporft_res, const int totalsize) {
+    bool res = true;
+    double diff;
+    for (int i = 0; i < totalsize; ++i) {
+        diff = dporft_res[i] - lower_res[i];
+        if (fabs(dporft_res[i]) > 1e-5) {
+            diff /= dporft_res[i];
+        }
+        diff = fabs(diff);
+        if (diff > 1.0e-5) {
+            fmt::print("\nError detected at i = {}: ref {} actual {}\n", i, dporft_res[i], lower_res[i]);
+            res = false;
+            break;
+        }
+    }
+    return res;
+}
+
+void printMatrix(const double* matrix, size_t matrix_size) {
+    for (size_t row = 0; row < matrix_size; ++row) {
+        for (size_t col = 0; col <= row; ++col) {
+            std::cout << matrix[row * matrix_size + col] << "\t";
+        }
+        std::cout << std::endl;
+    }
+}
+
+void print_mat_split(double* matrix_split[], int num_tiles, int tile_size) {
+    for (int itile = 0; itile < num_tiles * num_tiles; ++itile) {
+        fmt::print("Block {}:\n", itile);
+        for (int i = 0; i < tile_size; ++i) {
+            for (int j = 0; j < tile_size; ++j) {
+                fmt::print("{} ", matrix_split[itile][i * tile_size + j]);
+            }
+            fmt::print("\n");
+        }
+        fmt::print("\n");
+    }
+}