[Transform][Vectorization] canonicalize vector with physical vector #100

Open — wants to merge 72 commits into main from xiaohui/vectorization

Commits (72)
d29f038
init
BRUCE11111 May 27, 2024
8d65cbf
add LoopInvariantCodeMotion and CSE
BRUCE11111 May 28, 2024
e6037e2
update for result use rewriter
BRUCE11111 May 30, 2024
24261e7
move functions in class
BRUCE11111 May 30, 2024
0709481
backup multireduction canonicalization
BRUCE11111 Jun 3, 2024
e04eaf6
update reduce operation
BRUCE11111 Jun 4, 2024
75e5546
update reduce
BRUCE11111 Jun 5, 2024
a648a12
Merge branch 'main' into xiaohui/vectorization
BRUCE11111 Jun 5, 2024
b80822b
record
BRUCE11111 Jun 8, 2024
a7f0e21
update reduce
BRUCE11111 Jun 25, 2024
9e4364b
record
BRUCE11111 Jun 27, 2024
b01472a
fix tests
BRUCE11111 Jul 5, 2024
947e5c0
temp record, please reset back
BRUCE11111 Jul 5, 2024
1101e86
temp record, please reset back
BRUCE11111 Jul 5, 2024
a943154
Merge branch 'main' into xiaohui/vectorization
BRUCE11111 Jul 5, 2024
7200565
add check test
BRUCE11111 Jul 8, 2024
26b2ab8
simplify code
BRUCE11111 Jul 11, 2024
380f173
refactor partial compitable operation fusion
BRUCE11111 Jul 18, 2024
7a71b25
fix reduce bug
BRUCE11111 Jul 22, 2024
ab7c4d0
add 16x16 transpose kernel
BRUCE11111 Jul 23, 2024
72f1a8b
update reduce, add shapecast, add single matmul test
BRUCE11111 Jul 25, 2024
6e1adc9
Merge remote-tracking branch 'origin/main' into xiaohui/vectorization
BRUCE11111 Jul 28, 2024
4c2b3b8
fix bugs
BRUCE11111 Jul 31, 2024
39524f3
fix wrong permutation map due to community pass greedy fold bug
BRUCE11111 Aug 1, 2024
ca28f7c
fix reduce bugs
BRUCE11111 Aug 3, 2024
37ea49b
fix useless vector operation
BRUCE11111 Aug 13, 2024
6480b46
Merge remote-tracking branch 'origin' into xiaohui/vectorization
BRUCE11111 Aug 13, 2024
5161921
update lowr tile vector code
BRUCE11111 Aug 16, 2024
747f63a
remove lower tile part
BRUCE11111 Aug 22, 2024
a38e34a
Merge branch 'main' into xiaohui/vectorization
BRUCE11111 Aug 22, 2024
788452e
update
BRUCE11111 Aug 22, 2024
894f268
disable printer
BRUCE11111 Aug 22, 2024
37a0447
fix transpose segmentation fault
BRUCE11111 Aug 23, 2024
a804c2d
update transpose index
BRUCE11111 Aug 28, 2024
7f7fb86
Merge branch 'main' into xiaohui/vectorization
BRUCE11111 Aug 29, 2024
c1d7136
simplify analyzer code
BRUCE11111 Aug 29, 2024
545cf98
fix clang-format
BRUCE11111 Aug 29, 2024
87e69e1
simplify multireduction generate loop code
BRUCE11111 Aug 30, 2024
dc00e70
update test
BRUCE11111 Aug 30, 2024
7ac81c7
simplify code
BRUCE11111 Sep 2, 2024
284a97b
add test
BRUCE11111 Sep 3, 2024
562657c
simplify reduction parallel generate for loop
BRUCE11111 Sep 3, 2024
3db8b18
add some comments
BRUCE11111 Sep 5, 2024
0c5e5a4
simplify nestedforloop generate
BRUCE11111 Sep 5, 2024
8bdf984
fix too many parameters in function
BRUCE11111 Sep 6, 2024
db59850
simplify function parameters
BRUCE11111 Sep 7, 2024
0353101
update
BRUCE11111 Sep 9, 2024
a08a3f8
Merge branch 'main' into xiaohui/vectorization
BRUCE11111 Sep 9, 2024
50d2e76
Merge branch 'main' into xiaohui/vectorization
BRUCE11111 Sep 9, 2024
10e73f4
test ci
BRUCE11111 Sep 9, 2024
eec389f
fix clang-format
BRUCE11111 Sep 9, 2024
cdbe4e2
fix format
BRUCE11111 Sep 9, 2024
27536a1
Merge branch 'main' into xiaohui/vectorization
BRUCE11111 Sep 9, 2024
5bf4a9f
enable mincrokernel op in vector
BRUCE11111 Sep 9, 2024
b0a26cd
fix comments
BRUCE11111 Sep 9, 2024
dfa5ea3
add comments
BRUCE11111 Sep 10, 2024
ffc5569
fix clang-tidy
BRUCE11111 Sep 10, 2024
32f20dd
remove unused function
BRUCE11111 Sep 10, 2024
a4382c2
split analysis file
BRUCE11111 Sep 13, 2024
1f0e3ce
Merge branch 'main' into xiaohui/vectorization
BRUCE11111 Sep 13, 2024
a79147c
add utils.cpp
BRUCE11111 Sep 13, 2024
3552eb6
fix license
BRUCE11111 Sep 13, 2024
3ef9f3a
fix code stype
BRUCE11111 Sep 13, 2024
a1f9988
enable broadcast op fusion
BRUCE11111 Sep 13, 2024
47687f1
add lower to vector pass in pipeline
BRUCE11111 Sep 13, 2024
1a2a9a1
use linalgx utils function
BRUCE11111 Sep 13, 2024
f96c544
temporaty save
BRUCE11111 Sep 14, 2024
bec59ab
rename file name
BRUCE11111 Sep 18, 2024
4c901c4
push local change
BRUCE11111 Sep 18, 2024
9b6e6c8
fix reduce loop indice
BRUCE11111 Sep 19, 2024
aac20a0
simplify code
BRUCE11111 Sep 20, 2024
9a62f0b
Merge branch 'main' into xiaohui/vectorization
BRUCE11111 Sep 20, 2024
291 changes: 291 additions & 0 deletions include/gc/Analysis/VectorBasedFusionAnalysis.h
@@ -0,0 +1,291 @@
//===-- VectorBasedFusionAnalysis.h - vector fusion analysis ----*- C++ -*-===//
//
// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef MLIR_ANALYSIS_VECTORBASEDFUSIONANALYSIS_H
#define MLIR_ANALYSIS_VECTORBASEDFUSIONANALYSIS_H

#include "gc/Dialect/Linalgx/LinalgxOps.h"
#include "gc/Dialect/Microkernel/MicrokernelOps.h"
#include "gc/Transforms/Passes.h"
#include "gc/Transforms/Utils/VectorUtils.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/Dialect/Traits.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/BuiltinDialect.h"
#include "mlir/IR/BuiltinTypes.h"
#include "llvm/ADT/TypeSwitch.h"
#include <queue>

namespace mlir {
namespace gc {

/// Record hardware information: which SIMD instruction sets are available.
struct HardWareInfo {
/// AVX-512 Foundation (512-bit vector registers) is available.
bool favx512f = true;
/// AVX2 (256-bit vector registers) is available.
bool favx2 = true;
};
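
/// Illustrative sketch only (hypothetical helper, not part of this PR): a
/// HardWareInfo describing a machine that supports AVX2 but not AVX-512F
/// could be built like this:
inline HardWareInfo makeAVX2OnlyInfo() {
  HardWareInfo info;
  info.favx512f = false; // no 512-bit registers available
  info.favx2 = true;     // 256-bit registers available
  return info;
}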

/// Helper class for querying hardware-dependent vector types and loop steps.
class TypeHelper {
private:
HardWareInfo info;

public:
TypeHelper() = default;
TypeHelper(HardWareInfo info) : info(info) {}
/// Get the current hardware information.
HardWareInfo &getHardwareInfo() { return this->info; }
/// Set the hardware information to \p info.
void setHardWareInfo(HardWareInfo &info) { this->info = info; }
/// Get the maximum loop step for vector \p type according to the hardware
/// information.
int getDataTypeValidSteps(VectorType type);
/// Get an even loop step for vector \p type.
int generateValidSteps(int steps, VectorType type);
/// Get an even loop step for vector \p type when the shape dimension is
/// \p shapeDim.
int generateValidSteps(int steps, VectorType type, int shapeDim);
/// Get the maximum SIMD length for vector \p type according to the
/// hardware information.
int getDataTypeMAXSIMDLength(VectorType type);
/// Get the vectorized type of operation \p op.
VectorType getVectorzedType(Operation *op, uint32_t loopStep = 0);
};
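
/// Illustrative sketch only (hypothetical helper, not this header's real
/// implementation): one plausible way getDataTypeValidSteps can derive a
/// loop step is to divide the widest available register width by the
/// element bit-width of the vector type.
inline int exampleValidSteps(VectorType type, const HardWareInfo &info) {
  // 512-bit registers with AVX-512F, 256-bit with AVX2, else assume 128-bit.
  int regBits = info.favx512f ? 512 : (info.favx2 ? 256 : 128);
  int elemBits = static_cast<int>(type.getElementTypeBitWidth());
  // Guarantee at least one element per loop iteration.
  int steps = regBits / (elemBits > 0 ? elemBits : 1);
  return steps > 0 ? steps : 1;
}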

/// Operation return kind, used to determine whether an operation needs to
/// return its result in the current for loop.
enum class ReturnTypeKind {
/// Result is needed both inside and outside the current group.
RT_Both,
/// Result is only needed outside the current group.
RT_OutGroup,
/// Result is only needed inside the current group.
RT_InGroup,
};

class VectorFusionBase {

private:
/// current function IR
func::FuncOp func;
/// Type helper used to query hardware-dependent operation vector types.
TypeHelper typehelper;

public:
VectorFusionBase() = default;
VectorFusionBase(func::FuncOp &func, HardWareInfo &info)
: func(func), typehelper(info) {}
VectorFusionBase(VectorFusionBase &base)
: func(base.getFunction()), typehelper(base.getHardwareInfo()) {}

/// Get the current function.
func::FuncOp &getFunction() { return func; }
/// Get the current hardware information.
HardWareInfo &getHardwareInfo() { return typehelper.getHardwareInfo(); }
/// Get the type helper.
TypeHelper &getTypeHelper() { return typehelper; }
};

/// Group operation fusion strategy class.
/// 1. Classify operations into:
///    a. reorder/transpose: reordered (or transposed) dims may introduce
///       data dependencies.
///    b. elementwise: these operations can be fused into a common for
///       loop.
///    c. broadcast: the broadcast dim and its data dependencies must be
///       analyzed.
///    d. reduction: the reduction dim and its data dependencies must be
///       analyzed.
/// Operations in the same group have no data dependencies, so they can be
/// fused into a common for loop body.
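///
/// Illustrative examples of the classification (hypothetical op coverage,
/// for exposition only):
///   vector.transpose        -> reorder/transpose
///   arith.addf, math.exp    -> elementwise
///   vector.broadcast        -> broadcast
///   vector.multi_reduction  -> reduction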

/// A queue stores the operation order to ensure that subsequently moving
/// an operation does not cause semantic changes.
class GroupOperationFusion : public VectorFusionBase {
private:
/// Operation groups; the operations in each group can generate a common
/// for loop.
SmallVector<std::queue<Operation *>, 8> opGroups;
/// Maximum vectorization step of each group.
SmallVector<uint32_t, 8> groupMaxSteps;
/// The vector type with the biggest rank in each operation group.
llvm::SmallDenseMap<size_t, VectorType> groupBigestRankVectorType;
/// Map from an operation to the index of the group it belongs to.
DenseMap<Operation *, size_t> opGroupIndexMap;
/// The anchor (axis) position at which an operation can be fused into the
/// previous operation.
DenseMap<Operation *, size_t> opAnchorPos;
/// Operations that do not need a fusibility check.
std::queue<Operation *> notNeedToJudgeOps;
/// Results of each operation group, together with their return kind.
SmallVector<llvm::MapVector<Value, std::pair<ReturnTypeKind, size_t>>, 8>
groupOpResults;
/// Loop iteration args of each operation group.
SmallVector<SetVector<Value>, 8> groupOpInitArgs;
/// Permutation maps of read and write operations, kept to make replacing
/// loop induction variables convenient.
DenseMap<Operation *, AffineMap> opPermuationMap;
/// Map from an operand back to the original value it operates on.
DenseMap<Value, Value> operandOriginalValue;

public:
GroupOperationFusion(func::FuncOp &func, HardWareInfo &info)
: VectorFusionBase(func, info) {}

GroupOperationFusion(GroupOperationFusion &strategy)
: VectorFusionBase(strategy.getFunction(), strategy.getHardwareInfo()),
opGroups(strategy.opGroups), groupMaxSteps(strategy.groupMaxSteps),
opGroupIndexMap(strategy.opGroupIndexMap),
opAnchorPos(strategy.opAnchorPos) {}

GroupOperationFusion(GroupOperationFusion &&strategy)
: VectorFusionBase(strategy.getFunction(), strategy.getHardwareInfo()),
opGroups(std::move(strategy.opGroups)),
groupMaxSteps(std::move(strategy.groupMaxSteps)),
groupBigestRankVectorType(
std::move(strategy.getGroupBiggestRankVectorType())),
opGroupIndexMap(std::move(strategy.opGroupIndexMap)),
opAnchorPos(std::move(strategy.opAnchorPos)) {}

GroupOperationFusion &operator=(GroupOperationFusion &fusion) {
this->getOpGroups() = fusion.getOpGroups();
this->getGroupMaxSteps() = fusion.getGroupMaxSteps();
this->getGroupBiggestRankVectorType() =
fusion.getGroupBiggestRankVectorType();
this->getOpGroupIndexMap() = fusion.getOpGroupIndexMap();
this->getOpAnchorPos() = fusion.getOpAnchorPos();
this->notNeedToJudgeOps = fusion.notNeedToJudgeOps;
this->getGroupOpResults() = fusion.getGroupOpResults();
this->getGroupOpInitArgs() = fusion.getGroupOpInitArgs();
this->getOpPermuationMap() = fusion.getOpPermuationMap();
this->getOperandOriginalValue() = fusion.getOperandOriginalValue();
this->getFunction() = fusion.getFunction();
this->getHardwareInfo() = fusion.getHardwareInfo();
this->getTypeHelper() = fusion.getTypeHelper();
return *this;
}
GroupOperationFusion &operator=(GroupOperationFusion &&) = default;

/// Get the map from group index to the vector type with the biggest rank
/// in that group.
llvm::SmallDenseMap<size_t, VectorType> &
getGroupBiggestRankVectorType() noexcept {
return groupBigestRankVectorType;
}
/// Get the operation groups produced by the fusion strategy analysis.
SmallVector<std::queue<Operation *>, 8> &getOpGroups() noexcept {
return opGroups;
}
/// Get the map from each operation to its group index.
DenseMap<Operation *, size_t> &getOpGroupIndexMap() noexcept {
return opGroupIndexMap;
}
/// Get the maximum vectorization step of each group.
SmallVector<uint32_t, 8> &getGroupMaxSteps() noexcept {
return groupMaxSteps;
}
/// Get the anchor position of each operation.
DenseMap<Operation *, size_t> &getOpAnchorPos() noexcept {
return opAnchorPos;
}
/// Get the results of each operation group.
SmallVector<llvm::MapVector<Value, std::pair<ReturnTypeKind, size_t>>, 8> &
getGroupOpResults() noexcept {
return groupOpResults;
}

SmallVector<SetVector<Value>, 8> &getGroupOpInitArgs() noexcept {
return groupOpInitArgs;
}

DenseMap<Operation *, AffineMap> &getOpPermuationMap() noexcept {
return opPermuationMap;
}

DenseMap<Value, Value> &getOperandOriginalValue() noexcept {
return operandOriginalValue;
}
/// Set the results of each operation group.
/// Note: the setters below take their parameter by value so that the
/// std::move in the body actually moves; moving from a const reference
/// would silently copy.
void setGroupOpResults(
SmallVector<llvm::MapVector<Value, std::pair<ReturnTypeKind, size_t>>, 8>
results) {
groupOpResults = std::move(results);
}

void setGroupOpIterArgs(
SmallVector<llvm::SetVector<Value>, 8> initArgs) noexcept {
groupOpInitArgs = std::move(initArgs);
}

void setPermutationMap(DenseMap<Operation *, AffineMap> map) noexcept {
opPermuationMap = std::move(map);
}
/// Run the fusion strategy: classify operations into groups.
void classifyOperations();

/// Whether two operations have compatible vector shapes
bool isCompatibleVectorType(Operation *op1, Operation *op2);

/// Update the biggest-rank vector type of the last operation group.
void updateGroupBigestVectorType(VectorType vectorType);

/// Check whether operation \p op needs a new group, i.e., whether it
/// cannot be fused with the previous operations.
bool isNeedNewGroup(Operation *op);

/// Add operation \p op into the current last group or a new group.
/// \p op must be valid; it cannot be nullptr.
void addOperationToGroup(Operation *op);

/// Get the next operation of type \p Target in group \p grpIdx that uses
/// the result of \p curOp.
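///
/// Example (hypothetical usage, for illustration only):
/// \code
///   Operation *nextRead =
///       fusion.getNextTargetOperationInCurrentGroup<vector::TransferReadOp>(
///           curOp, grpIdx);
/// \endcode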
template <typename Target>
Operation *getNextTargetOperationInCurrentGroup(Operation *curOp,
const size_t grpIdx);

/// Run the vector-based fusion strategy.
void run();
};

template <typename Target>
Operation *GroupOperationFusion::getNextTargetOperationInCurrentGroup(
Operation *curOp, const size_t grpIdx) {
std::queue<Operation *> tmpOpQueue(getOpGroups()[grpIdx]);
if (isa<Target>(curOp))
return curOp;

while (!tmpOpQueue.empty()) {
auto frontOp = tmpOpQueue.front();
if (isa<Target>(frontOp)) {
for (auto x : frontOp->getOperands())
if (x.getDefiningOp() == curOp)
return frontOp;
}
tmpOpQueue.pop();
}
return nullptr;
}
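
/// Entry point of the vector-based fusion analysis. A minimal usage sketch
/// (hypothetical, assuming an existing func::FuncOp `func` and default
/// hardware info):
/// \code
///   HardWareInfo hwInfo;
///   GroupOperationAnalysis analysis(func, hwInfo);
///   analysis.run();
///   auto &opGroups = analysis.getGroupOperationFusion().getOpGroups();
/// \endcode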

class GroupOperationAnalysis {
private:
/// Vector-based fusion related data.
GroupOperationFusion fusionStrategy;

public:
GroupOperationAnalysis(func::FuncOp &func, HardWareInfo &info)
: fusionStrategy(func, info) {}
/// Remove useless operations whose results are not required by any other
/// operation.
void analysisEmptyGroup();
/// Compute the maximum supported vectorization step for each operation
/// group.
void analysisGroupMaxSteps();
/// Get the fusion strategy.
GroupOperationFusion &getGroupOperationFusion() { return fusionStrategy; }

void run() { fusionStrategy.run(); }
};
} // namespace gc
} // namespace mlir

#endif // MLIR_ANALYSIS_VECTORBASEDFUSIONANALYSIS_H
15 changes: 15 additions & 0 deletions include/gc/Transforms/Passes.td
@@ -169,6 +169,21 @@ def MergeNestedForall : Pass<"merge-nested-forall"> {
let dependentDialects = ["scf::SCFDialect"];
}

def CPUPhysicalRegisterPass : Pass<"CPU-physical-register-pass", "func::FuncOp"> {
let summary = "Lower operations to the CPU physical register size.";
let description = [{
Lower vector operations so that their vector width matches the CPU's
physical register size.
}];
let dependentDialects = [
"::mlir::func::FuncDialect",
"::mlir::math::MathDialect",
"::mlir::arith::ArithDialect",
"::mlir::tensor::TensorDialect",
"::mlir::vector::VectorDialect",
"::mlir::scf::SCFDialect",
];
}
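
// A hypothetical invocation of the pass defined above (assuming the
// project's opt-style driver is named `gc-opt`; the flag itself comes from
// the Pass<"CPU-physical-register-pass", ...> argument):
//
//   gc-opt --CPU-physical-register-pass input.mlir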

def FoldTensorOperation : Pass<"fold-tensor-operation"> {
let summary = "Fold some tensor operation";
let description = [{