Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

tiled_cholesky w/o openmp #27

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions apps/choleskey/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,11 @@ target_include_directories(
# choleskey_stdpar_snd
# PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include
# ${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR})

#tiled cholesky
hcq9102 marked this conversation as resolved.
Show resolved Hide resolved
add_executable(cholesky_tiled cholesky_tiled.cpp)
target_link_libraries(cholesky_tiled PRIVATE hpcpp-core blas lapack -fortranlibs gfortran)
target_include_directories(
cholesky_tiled
PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include
${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR} ${OPENBLAS_DIR}/include/openblas ${OPENBLAS_DIR}/lib64/cmake/OpenBLAS)
149 changes: 149 additions & 0 deletions apps/choleskey/cholesky_tiled.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
/*
hcq9102 marked this conversation as resolved.
Show resolved Hide resolved
1. Aim to implememt task-graph based Cholesky decomposition: tiled Cholesky decomposition with OpenMP tasks.
references:
>Buttari, Alfredo, et al. "A class of parallel tiled linear algebra algorithms for multicore architectures." Parallel Computing 35.1 (2009): 38-53.
>Dorris, Joseph, et al. "Task-based Cholesky decomposition on knights corner using OpenMP." High Performance Computing: ISC High Performance 2016 International Workshops, ExaComm, E-MuCoCoS, HPC-IODC, IXPUG, IWOPH, P^ 3MA, VHPC, WOPSSS, Frankfurt, Germany, June 19–23, 2016, Revised Selected Papers 31. Springer International Publishing, 2016.)

2. Therefore, the first step is to implement tiled Cholesky decomposition algorithm.

3. This file is to implement tiled Cholesky decomposition algorithm.
reference the implementation from Intel open source project, hetero-streams which will no longer be maintained by Intel.
https://github.com/intel/hetero-streams/tree/master/ref_code/cholesky

4. Additionally, include openblas library when build:
Copy link
Collaborator

@mhaseeb123 mhaseeb123 Dec 5, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it possible to install/setup OpenBLAS with CPM during CMake? Personally I don't want to add these extra manual steps of installing and export OPENBLAS_DIR=.. as they would make general usage as well as CI/CD really tedious. @weilewei

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@mhaseeb123 agreed not introducing extra flags. I just tried with CPMadd, and building openblas takes a while for the first-time building it.
@hcq9102 try the following in the CMakeLists files. Additionally, I request to add a compilation flag something like USE_OPENBLAS to cmake build to separate out if we should use cpm to add openblas.

in main CmakeLists.txt

if (USE_OPENBLAS)
cpmaddpackage(NAME openblas GITHUB_REPOSITORY OpenMathLib/OpenBLAS GIT_TAG
              v0.3.25)
endif()

in your application CMakeLists.txt:

target_link_libraries(myapp openblas)

or $ export OPENBLAS_DIR=/openblas/path
*/

#include <bits/stdc++.h>
#include <cblas.h>
#include <lapacke.h>
#include <omp.h>
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <cstring>
#include "tiled_cholesky_help.hpp"

using data_type = double;

hcq9102 marked this conversation as resolved.
Show resolved Hide resolved
void tiled_cholesky(double* matrix_split[], const int tile_size, const int num_tiles, CBLAS_ORDER blasLay,
const int lapackLay) {
unsigned int m, n, k;

for (k = 0; k < num_tiles; ++k) {
//POTRF
int info = LAPACKE_dpotrf(LAPACK_ROW_MAJOR, 'L', tile_size, matrix_split[k * num_tiles + k], tile_size);

for (m = k + 1; m < num_tiles; ++m) {
//DTRSM
cblas_dtrsm(blasLay, CblasRight, CblasLower, CblasTrans, CblasNonUnit, tile_size, tile_size, 1.0,
matrix_split[k * num_tiles + k], tile_size, matrix_split[m * num_tiles + k], tile_size);
}

for (n = k + 1; n < num_tiles; ++n) {
//DSYRK
cblas_dsyrk(blasLay, CblasLower, CblasNoTrans, tile_size, tile_size, -1.0, matrix_split[n * num_tiles + k],
tile_size, 1.0, matrix_split[n * num_tiles + n], tile_size);

for (m = n + 1; m < num_tiles; ++m) {
//DGEMM
cblas_dgemm(blasLay, CblasNoTrans, CblasTrans, tile_size, tile_size, tile_size, -1.0,
matrix_split[m * num_tiles + k], tile_size, matrix_split[n * num_tiles + k], tile_size, 1.0,
matrix_split[m * num_tiles + n], tile_size);
}
}
}
}

int main(int argc, char** argv) {

// TODO : introduce args
int matrix_size, num_tiles, tile_size, tot_tiles;
hcq9102 marked this conversation as resolved.
Show resolved Hide resolved
matrix_size = 40; // must be an input
num_tiles = 2; // matrix size MUST be divisible
int verifycorrectness = 1;
bool layRow = true;

fmt::print("matrix_size = {}, num_tiles = {}\n\n", matrix_size, num_tiles);

// Check mat_size is divisible by num_tiles
if (matrix_size % num_tiles != 0) {
fmt::print("matrix size must be divisible by num_tiles.. aborting\n");
throw std::invalid_argument("Matrix size is not divisible by num_tiles");
}

if (matrix_size == 0) {
fmt::print(" 0 illegal input matrix_size \n");
hcq9102 marked this conversation as resolved.
Show resolved Hide resolved
std::exit(0);
}

tile_size = matrix_size / num_tiles;
hcq9102 marked this conversation as resolved.
Show resolved Hide resolved
tot_tiles = num_tiles * num_tiles;

double* A = new double[matrix_size * matrix_size];
hcq9102 marked this conversation as resolved.
Show resolved Hide resolved

// Allocate memory for tiled_cholesky for the full matrix
double* A_lower = new double[matrix_size * matrix_size];

// Allocate memory for MKL cholesky for the full matrix
double* A_MKL = new double[matrix_size * matrix_size];

// Memory allocation for tiled matrix
double** Asplit = new double*[tot_tiles];

for (int i = 0; i < tot_tiles; ++i) {
// Buffer per tile
Asplit[i] = new double[tile_size * tile_size];
}

//Generate a symmetric positve-definite matrix
A = generate_positiveDefinitionMatrix(matrix_size);

//printMatrix(A, mat_size_m);

CBLAS_ORDER blasLay;
hcq9102 marked this conversation as resolved.
Show resolved Hide resolved
int lapackLay;

if (layRow) {
blasLay = CblasRowMajor;
lapackLay = LAPACK_ROW_MAJOR;
} else {
blasLay = CblasColMajor;
lapackLay = LAPACK_COL_MAJOR;
}

//copying matrices into separate variables for tiled cholesky (A_lower) and MKL cholesky (A_MKL)
std::memcpy(A_lower, A, matrix_size * matrix_size * sizeof(double));
std::memcpy(A_MKL, A, matrix_size * matrix_size * sizeof(double));

//splits the input matrix into tiles
split_into_tiles(A_lower, Asplit, num_tiles, tile_size, matrix_size, layRow);

//run the tiled Cholesky function
tiled_cholesky(Asplit, tile_size, num_tiles, blasLay, lapackLay);

//assembling seperated tiles back into full matrix
assemble_tiles(Asplit, A_lower, num_tiles, tile_size, matrix_size, layRow);

//calling LAPACKE_dpotrf cholesky for verification and timing comparison
int info = LAPACKE_dpotrf(lapackLay, 'L', matrix_size, A_MKL, matrix_size);

if (verifycorrectness == 1) {
bool res = verify_results(A_lower, A_MKL, matrix_size * matrix_size);
if (res) {
fmt::print("Tiled Cholesky decomposition successful\n");
} else {
fmt::print("Tiled Cholesky decomposition failed\n");
}
}

//memory free
delete[] A;
delete[] A_lower;
delete[] A_MKL;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit, I wonder if we can do it thorough unique_ptr instead of new and delete. not important for this PR though.

for (int i = 0; i < tot_tiles; ++i) {
delete[] Asplit[i];
}

return 0;
}
125 changes: 125 additions & 0 deletions apps/choleskey/tiled_cholesky_help.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
#pragma once

#include <cblas.h>
#include <lapacke.h>
#include <stddef.h>
#include <stdlib.h>
#include <cstdlib>
#include <fstream>
#include <sstream>
#include "commons.hpp"

double* generate_positiveDefinitionMatrix(const size_t matrix_size) {
double* A_matrix = new double[matrix_size * matrix_size];
hcq9102 marked this conversation as resolved.
Show resolved Hide resolved
double* pd_matrix = (double*)malloc(matrix_size * matrix_size * sizeof(double));
unsigned int seeds = matrix_size;

// generate a random symmetric matrix
for (size_t row = 0; row < matrix_size; ++row) {
for (size_t col = row; col < matrix_size; ++col) {
A_matrix[col * matrix_size + row] = A_matrix[row * matrix_size + col] = (double)rand_r(&seeds) / RAND_MAX;
}
}
// compute the product of matrix A_matrix and its transpose, and storing the result in pd_matrix.
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans, matrix_size, matrix_size, matrix_size, 1.0, A_matrix,
matrix_size, A_matrix, matrix_size, 0.0, pd_matrix, matrix_size);

// Adjust Diagonals
for (size_t row = 0; row < matrix_size; ++row) {
double diagonals = 1.0; // from 1.0
for (size_t col = 0; col < matrix_size; ++col) {
diagonals += pd_matrix[row * matrix_size + col];
}
// Set the diag entry
pd_matrix[row * matrix_size + row] = diagonals;
}

delete[] A_matrix;
return pd_matrix;
}

void split_into_tiles(const double* matrix, double* matrix_split[], const int num_tiles, const int tile_size,
const int size, bool layRow) {

int total_num_tiles = num_tiles * num_tiles;
int offset_tile;

//#pragma omp parallel for private(i, j, offset_tile) schedule(auto)
for (int i_tile = 0; i_tile < total_num_tiles; ++i_tile) {
if (layRow) {
offset_tile =
int(i_tile / num_tiles) * num_tiles * tile_size * tile_size + int(i_tile % num_tiles) * tile_size;
} else {
offset_tile =
int(i_tile % num_tiles) * num_tiles * tile_size * tile_size + int(i_tile / num_tiles) * tile_size;
}

for (int i = 0; i < tile_size; ++i)
//#pragma simd
for (int j = 0; j < tile_size; ++j) {
matrix_split[i_tile][i * tile_size + j] = matrix[offset_tile + i * size + j];
}
}
}

void assemble_tiles(double* matrix_split[], double* matrix, const int num_tiles, const int tile_size, const int size,
bool layRow) {
int i_tile, j_tile, tile, i_local, j_local;
//#pragma omp parallel for private(j, i_local, j_local, i_tile, j_tile, tile) \
schedule(auto)
for (int i = 0; i < size; ++i) {
i_local = int(i % tile_size);
i_tile = int(i / tile_size);
//#pragma simd private(j_tile, tile, j_local)
for (int j = 0; j < size; ++j) {
j_tile = int(j / tile_size);
if (layRow) {
tile = i_tile * num_tiles + j_tile;
} else {
tile = j_tile * num_tiles + i_tile;
}
j_local = int(j % tile_size);
matrix[i * size + j] = matrix_split[tile][i_local * tile_size + j_local];
}
}
}

bool verify_results(const double* lower_res, const double* dporft_res, const int totalsize) {
bool res = true;
double diff;
for (int i = 0; i < totalsize; ++i) {
diff = dporft_res[i] - lower_res[i];
if (fabs(dporft_res[i]) > 1e-5) {
diff /= dporft_res[i];
}
diff = fabs(diff);
if (diff > 1.0e-5) {
fmt::print("\nError detected at i = {}: ref {} actual {}\n", i, dporft_res[i], lower_res[i]);
res = false;
break;
}
}
return res;
}

void printMatrix(const double* matrix, size_t matrix_size) {
for (size_t row = 0; row < matrix_size; ++row) {
for (size_t col = 0; col <= row; ++col) {
std::cout << matrix[row * matrix_size + col] << "\t";
}
std::cout << std::endl;
}
}

void print_mat_split(double* matrix_split[], int num_tiles, int tile_size) {
for (int itile = 0; itile < num_tiles * num_tiles; ++itile) {
fmt::print("Block {}:\n", itile);
for (int i = 0; i < tile_size; ++i) {
for (int j = 0; j < tile_size; ++j) {
fmt::print("{} ", matrix_split[itile][i * tile_size + j]);
}
fmt::print("\n");
}
fmt::print("\n");
}
}