[LayerGroup] fix some errs and refine log
Change-Id: Ie965f219deb28a667dad475ee1894d2c015fa9a3
Boatin committed Jan 9, 2023
1 parent 93d89e3 commit cf6c14b
Showing 7 changed files with 94 additions and 76 deletions.
@@ -106,7 +106,7 @@ struct tensor_info_t {
};

using ValueSet = std::set<Value, value_compare>;
using ValueIntMap = std::set<Value, int64_t, value_compare>;
using ValueIntMap = std::map<Value, int64_t, value_compare>;
using TensorInfo = std::map<Value, tensor_info_t, value_compare>;
using MemBuff = std::map<mem_buffer_key_t, mem_buffer_value_t>;
using MemBuffElt = std::pair<mem_buffer_key_t, mem_buffer_value_t>;
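Note on the ValueIntMap fix above: std::set takes an element type, a comparator, and an allocator, so std::set<Value, int64_t, value_compare> cannot serve as a Value-to-integer container; std::map<Value, int64_t, value_compare> is the intended key/value form. A minimal sketch of how such an alias can hand out stable integer ids, mirroring the value_ids/op_ids numbering used in the reworked show_timestep further down (the comparator and helper below are stand-ins, not the repository's real value_compare):

#include <cstdint>
#include <map>

// Stand-in comparator; the real value_compare orders mlir::Value objects.
struct value_compare_sketch {
  bool operator()(int a, int b) const { return a < b; }
};

using ValueIntMapSketch = std::map<int, std::int64_t, value_compare_sketch>;

// Return the existing id for v, or assign the next free id on first sight.
std::int64_t assign_id(ValueIntMapSketch &ids, int v) {
  auto it = ids.find(v);
  if (it != ids.end())
    return it->second;
  std::int64_t id = static_cast<std::int64_t>(ids.size());
  ids.emplace(v, id);
  return id;
}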
@@ -43,25 +43,25 @@ class TimeStepMethod {

void get_timestep_cycle_slack(
BasicTimeStep *time_step, const LgInfo &lg_info,
std::map<Value, int64_t, value_compare> &tensor_to_cycle,
std::map<Value, int64_t, value_compare> &tensor_to_bufsize,
ValueIntMap &tensor_to_cycle,
ValueIntMap &tensor_to_bufsize,
std::vector<std::list<GdmaElt>> &tensor_timesteps,
std::vector<int64_t> &timestep_cycle_slack);
int64_t get_to_ts(bool &is_valid, int64_t cur_ts, TIMESTEP_LD_ST ld_st,
int64_t range_end);
int64_t
get_best_ts(BasicTimeStep *time_step, const LgInfo &lg_info, int64_t cur_ts,
std::map<Value, int64_t, value_compare> &tensor_to_cycle,
std::map<Value, int64_t, value_compare> &tensor_to_bufsize,
ValueIntMap &tensor_to_cycle,
ValueIntMap &tensor_to_bufsize,
std::vector<std::list<GdmaElt>> &tensor_timesteps,
std::vector<int64_t> &timestep_cycle_slack,
std::list<GdmaElt>::iterator &sel_list_iter);

void bubble_tensor_to_best_ts(
std::list<GdmaElt>::iterator sel_list_iter, int64_t cur_ts,
int64_t best_ts, BasicTimeStep *time_step,
std::map<Value, int64_t, value_compare> &tensor_to_cycle,
std::map<Value, int64_t, value_compare> &tensor_to_bufsize,
ValueIntMap &tensor_to_cycle,
ValueIntMap &tensor_to_bufsize,
std::vector<std::list<GdmaElt>> &tensor_timesteps,
std::vector<int64_t> &timestep_cycle_slack);

2 changes: 1 addition & 1 deletion include/tpu_mlir/Dialect/Tpu/Transforms/Passes.td
@@ -30,7 +30,7 @@ def LayerGroup : Pass<"layer-group", "FuncOp"> {
let dependentDialects = ["TpuDialect"];
let options = [
Option<"opt", "opt", "int64_t", /*default=*/"2",
"opt=1: group layers length as long as possible. opt=2: dynamic programming layer group">,
"opt=1: group layers as many as possible. opt=2: dynamic programming layer group">,
];
}

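For readers unfamiliar with MLIR pass options: an Option declared in the .td file is generated as a member of the pass with the declared default, and the pass body branches on it. A rough C++ sketch of how the opt knob above is typically consumed (the class and method names here are illustrative, not the generated tpu-mlir pass base):

#include <cstdint>

struct LayerGroupPassSketch {
  std::int64_t opt = 2;  // default value declared in Passes.td
  void run() {
    if (opt == 1) {
      // greedy: group layers as many as possible
    } else {
      // opt == 2: dynamic-programming layer group search
    }
  }
};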
85 changes: 44 additions & 41 deletions lib/Dialect/Tpu/Transforms/LayerGroup/BasicTimeStep.cpp
@@ -94,26 +94,62 @@ void BasicTimeStep::show_timestep() {
size_t timestep_num = get_timestep_num();
std::string s;
llvm::raw_string_ostream ss(s);

ValueIntMap value_ids;
std::map<Operation *, int64_t> op_ids;
int64_t idx = 0;
for (size_t ts = 0; ts < timestep_num; ++ts) {
auto &layer_field = getLayers(ts);
for (auto op : layer_field) {
if (op_ids.find(op) == op_ids.end()) {
op_ids[op] = idx++;
}

for (auto in : op->getOperands()) {
if (value_ids.find(in) == value_ids.end()) {
value_ids[in] = idx++;
}
}
for (auto out : get_output_values(op)) {
if (value_ids.find(out) == value_ids.end()) {
value_ids[out] = idx++;
}
}
}
}

mem_buffer_key_t buffer_key;
for (size_t ts = 0; ts < timestep_num; ++ts) {
s.clear();
ss << "=== timestep " << ts << ": \n";
const auto &layer_field = getLayers(ts);
for (auto op : layer_field) {
ss << "layer " << module::getName(op) << "([";
ss << "layer " << op_ids[op] << "([";
for (auto in : op->getOperands()) {
if (in.getType().isa<NoneType>()) {
continue;
}
ss << module::getName(in) << ",";
buffer_key.value = in;
if (dyn_cast_or_null<top::WeightOp>(in.getDefiningOp())) {
buffer_key.type = LMEM_WEIGHT;
} else {
buffer_key.type = LMEM_ACTIVATION;
}
auto &buffer_value = get_lmem_buffer_value(buffer_key);
ss << value_ids[in] << "(" << buffer_value.start_ts << ", "
<< buffer_value.end_ts << "), ";
}
ss << "] -> [";
for (auto out : get_output_values(op)) {
ss << module::getName(out) << ",";
buffer_key.type = LMEM_ACTIVATION;
buffer_key.value = out;
auto &buffer_value = get_lmem_buffer_value(buffer_key);
ss << value_ids[out] << "(" << buffer_value.start_ts << ", "
<< buffer_value.end_ts << "), ";
}
ss << "])\n";
}

mem_buffer_key_t buffer_key;
const auto &tensor_field = getTensors(ts);
ss << "tensor(start_ts, end_ts): ";
for (auto &iter : tensor_field) {
@@ -124,39 +160,13 @@ void BasicTimeStep::show_timestep() {
buffer_key.type = LMEM_ACTIVATION;
}
auto &buffer_value = get_lmem_buffer_value(buffer_key);
ss << module::getName(iter.first) << "(" << buffer_value.start_ts << ", "
ss << value_ids[iter.first] << "(" << buffer_value.start_ts << ", "
<< buffer_value.end_ts << "), ";
}
ss << "\n";
llvm::errs() << s;
}
llvm::errs() << "====================================\n";

// llvm::errs() << "============= show time step =============\n";
// std::string s;
// llvm::raw_string_ostream ss(s);
// for (int time_idx = 0; time_idx < this->get_timestep_num(); ++time_idx) {
// s.clear();
// ss << "=====Time step " << time_idx << "=====\n";
// const TpuTsField &layer_field = timestep_table_[time_idx].tpu0_ts_field;
// for (uint32_t i = 0; i < layer_field.size(); ++i) {
// auto layer = layer_field[i];
// ss << "==layer: ";
// layer->print(ss);
// ss << "(stage=" << this->get_layer_swpipl_stage(layer) << ")\n";
// }
// const GdmaTsField &tensor_field =
// timestep_table_[time_idx].gdma0_ts_field; for (uint32_t i = 0; i <
// tensor_field.size(); ++i) {
// auto tensor = tensor_field[i].first;
// ss << "==tensor: ";
// tensor.print(ss);
// ss << "(stage=" << this->get_tensor_swpipl_stage(tensor) << ")\n";
// }
// ss << "\n";
// llvm::errs() << s;
// }
// llvm::errs() << "====================================\n";
}

void BasicTimeStep::gen_hold_coeff() {
@@ -191,15 +201,14 @@ void BasicTimeStep::gen_all_mem_buffer() {
lmem_buffer_.clear();

mem_buffer_key_t lmem_key;
mem_buffer_value_t lmem_value;
mem_buffer_value_t lmem_value={0};
lmem_value.align_bytes = 32;

for (int64_t stg = 0; stg < this->swpipl_stage_num_; ++stg) {
// add for software pipeline
bool layer_timestep_valid =
(swpipl_stage_num_ == 1) || (swpipl_stage_num_ > 1 && stg == 1);
for (size_t ts = 0; ts < get_timestep_num(); ++ts) {
// add for software pipeline

// process current timestep layers
const TpuTsField &cur_tpu_field = timestep_table_[ts].tpu0_ts_field;
if (layer_timestep_valid) {
@@ -212,8 +221,6 @@

lmem_value.start_ts = ts;
lmem_value.end_ts = -1;
lmem_value.addr = 0;
lmem_value.size = 0;

lmem_buffer_[lmem_key] = lmem_value;
}
@@ -239,8 +246,6 @@

lmem_value.start_ts = ts;
lmem_value.end_ts = ts;
lmem_value.addr = 0;
lmem_value.size = 0;

lmem_buffer_[lmem_key] = lmem_value;
} // cur_tpu_field
@@ -264,8 +269,6 @@

lmem_value.start_ts = ts;
lmem_value.end_ts = -1;
lmem_value.addr = 0;
lmem_value.size = 0;

lmem_buffer_[lmem_key] = lmem_value;
} else if (tensor_info.mode == TIMESTEP_STORE) {
Expand Down Expand Up @@ -435,7 +438,7 @@ int64_t BasicTimeStep::get_tensor_range_end(const GdmaElt &tensor,
// layers
auto &ts_layers = timestep_table_[ts].tpu0_ts_field;
for (auto op : ts_layers) {
auto outs = get_output_values(op);
auto outs = op->getResults();
find_flag = std::find(outs.begin(), outs.end(), v) != outs.end();
if (find_flag) {
result = std::min(result, ts - 1);
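On the gen_all_mem_buffer change above: for an aggregate struct, = {0} zero-initializes the first member and value-initializes the rest, which is presumably why the repeated per-timestep lmem_value.addr = 0; lmem_value.size = 0; resets could be dropped. A small self-contained sketch with a stand-in struct (field names mirror the diff, but this is not the repository's real mem_buffer_value_t, which is assumed here to be an aggregate):

#include <cstdint>
#include <map>

// Illustrative stand-in for mem_buffer_value_t.
struct mem_buffer_value_sketch_t {
  std::int64_t start_ts;
  std::int64_t end_ts;
  std::int64_t addr;
  std::int64_t size;
  std::int64_t align_bytes;
};

int main() {
  std::map<int, mem_buffer_value_sketch_t> lmem_buffer;
  mem_buffer_value_sketch_t lmem_value = {0};  // every field starts at zero
  lmem_value.align_bytes = 32;
  for (int ts = 0; ts < 4; ++ts) {
    lmem_value.start_ts = ts;  // only the fields that vary are rewritten
    lmem_value.end_ts = -1;
    lmem_buffer[ts] = lmem_value;  // addr and size stay 0, as before the cleanup
  }
  return 0;
}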
44 changes: 29 additions & 15 deletions lib/Dialect/Tpu/Transforms/LayerGroup/GroupMethod.cpp
@@ -1,12 +1,12 @@
#include "tpu_mlir/Dialect/Tpu/Transforms/LayerGroup/GroupMethod.h"
#include "mlir/Support/LLVM.h"
#include "omp.h"
#include "progressbar.hpp"
#include "tpu_mlir/Backend/Arch.h"
#include "tpu_mlir/Dialect/Tpu/IR/TpuOps.h"
#include "tpu_mlir/Dialect/Tpu/Transforms/LayerGroup/LayerGroupUtil.h"
#include "tpu_mlir/Dialect/Tpu/Transforms/LayerGroup/GroupMethod.h"
#include "tpu_mlir/Support/MathUtils.h"
#include "tpu_mlir/Support/Module.h"
#include "progressbar.hpp"
#include <list>
#include <map>
#include <set>
@@ -229,6 +229,10 @@ void GroupMethod::sweep_for_min_cost(

void GroupMethod::dynamic_programming_layer_group_with_cluster(
std::vector<LgInfo> &lg_infos, const std::vector<Operation *> &subnet_ops) {
llvm::errs() << "\n"
<< "=======================================================\n"
<< "***** Dynamic Programming layer group with cluster ****\n"
<< "=======================================================\n";
cut_results_.clear();
LgInfo sub_group;
std::vector<std::vector<Operation *>> base_groups;
@@ -255,7 +259,7 @@ void GroupMethod::dynamic_programming_layer_group_with_cluster(
cut_points[j][j] = j;
}
llvm::errs() << "Searching best group slices...\n";
progressbar bar(cluster_num-1);
progressbar bar(cluster_num - 1);
for (size_t len = 2; len <= cluster_num; ++len) {
bar.update();
// llvm::errs() << llvm::format("process cluster len = %d\n", len);
@@ -297,20 +301,25 @@ void GroupMethod::dynamic_programming_layer_group_with_cluster(

show_cut_results();
// some post process for cluster
llvm::errs() << "start consider_redundant_computation_and_gdma_cost\n";
llvm::errs() << "-------------------------------------------------------\n";
llvm::errs() << "Consider redundant computation and gdma cost\n";
llvm::errs() << "-------------------------------------------------------\n";
consider_redundant_computation_and_gdma_cost(base_groups, subnet_ops);
llvm::errs() << "end consider_redundant_computation_and_gdma_cost\n";
show_cut_results();

llvm::errs() << "start merge_cut_idx_to_reduce_gdma_cost\n";
llvm::errs() << "-------------------------------------------------------\n";
llvm::errs() << "Merge cut idx to reduce gdma cost\n";
llvm::errs() << "-------------------------------------------------------\n";
bool take_effective =
merge_cut_idx_to_reduce_gdma_cost(base_groups, subnet_ops);
llvm::errs() << "end merge_cut_idx_to_reduce_gdma_cost\n";
show_cut_results();

if (take_effective) {
llvm::errs() << "start consider_redundant_computation_and_gdma_cost\n";
llvm::errs() << "-------------------------------------------------------\n";
llvm::errs() << "Consider redundant computation and gdma cost again\n"
<< "due to cut idx merged in the previous step\n";
llvm::errs() << "-------------------------------------------------------\n";
consider_redundant_computation_and_gdma_cost(base_groups, subnet_ops);
llvm::errs() << "end consider_redundant_computation_and_gdma_cost\n";
show_cut_results();
}

@@ -511,6 +520,11 @@ bool GroupMethod::merge_cut_idx_to_reduce_gdma_cost(

void GroupMethod::simple_layer_group(
std::vector<LgInfo> &lg_infos, const std::vector<Operation *> &subnet_ops) {
llvm::errs() << "\n"
<< "=======================================================\n"
<< "*********** Group layers as many as possible **********\n"
<< "=======================================================\n";

cut_results_.clear();
LgInfo sub_group;
std::vector<std::vector<Operation *>> base_groups;
@@ -537,11 +551,11 @@
}
} else {
start_idx++;
}
if (start_idx == end_idx && start_idx > 0) {
cut_result.insert(cut_result.begin(), start_idx);
end_idx = start_idx - 1;
start_idx = 0;
if (start_idx == end_idx) {
cut_result.insert(cut_result.begin(), start_idx-1);
end_idx = start_idx - 1;
start_idx = 0;
}
}
}
cut_results_.insert(cut_results_.begin(), std::move(cut_result));
@@ -586,7 +600,7 @@ void GroupMethod::get_final_groups(
void GroupMethod::show_cut_results() {
for (size_t i = 0; i < cut_results_.size(); ++i) {
auto &cut_result = cut_results_[i];
llvm::errs() << "base group idx " << i << " cut results: ";
llvm::errs() << "base group[" << i << "] cut results: ";
for (size_t j = 0; j < cut_result.size(); ++j) {
llvm::errs() << cut_result[j] << ", ";
}
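The dynamic_programming_layer_group_with_cluster changes above only touch logging and formatting, but the surrounding search fills cost and cut_points tables over cluster intervals and then recovers the best group slices from cut_points. For orientation, a generic interval-DP sketch of that kind of cut search (the quadratic group_cost placeholder is invented here; the real pass estimates compute and GDMA cost for each candidate group):

#include <cstdint>
#include <vector>

// Placeholder cost of keeping clusters [i, j] in one group.
static std::int64_t group_cost(int i, int j) { return (j - i + 1) * (j - i + 1); }

// cost[i][j]: best cost for clusters i..j; cut_points[i][j]: where to cut.
void interval_dp(int n, std::vector<std::vector<std::int64_t>> &cost,
                 std::vector<std::vector<int>> &cut_points) {
  cost.assign(n, std::vector<std::int64_t>(n, 0));
  cut_points.assign(n, std::vector<int>(n, 0));
  for (int j = 0; j < n; ++j) {
    cost[j][j] = group_cost(j, j);
    cut_points[j][j] = j;
  }
  for (int len = 2; len <= n; ++len) {
    for (int i = 0; i + len - 1 < n; ++i) {
      int j = i + len - 1;
      cost[i][j] = group_cost(i, j);   // option 1: no cut, one big group
      cut_points[i][j] = j;
      for (int k = i; k < j; ++k) {    // option 2: cut after cluster k
        std::int64_t c = cost[i][k] + cost[k + 1][j];
        if (c < cost[i][j]) {
          cost[i][j] = c;
          cut_points[i][j] = k;
        }
      }
    }
  }
}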
9 changes: 5 additions & 4 deletions lib/Dialect/Tpu/Transforms/LayerGroup/LayerGroupUtil.cpp
@@ -56,9 +56,10 @@ shape_secs_t init_group_data_secs(const LgInfo &lg_info) {
module::getNCHW(outs[0], out_n, out_c, out_h, out_w);
// Need consider different backends
auto lg_op = cast<LocalGenInterface>(op);
total_size += lg_op.getBufferSize(
Arch::get_tensor_lmem_bytes(ins[0], -1, -1),
Arch::get_tensor_lmem_bytes(outs[0], -1, -1), in_n, in_h, out_n, out_h);
total_size +=
lg_op.getBufferSize(Arch::get_tensor_lmem_bytes(ins[0], in_n, in_h),
Arch::get_tensor_lmem_bytes(outs[0], out_n, out_h),
in_n, in_h, out_n, out_h);
total_secs =
std::min(total_secs, ceiling_func(total_size, Arch::LMEM_BYTES));

@@ -138,7 +139,7 @@ bool update_data_split(BasicTimeStepPtr time_step, const LgInfo &lg_info,
bool status = false;
auto &tensor_infos = time_step->get_tensor_infos();
shape_secs_t max_shape_secs = get_group_max_secs(lg_info);
for (int64_t nsec = 1; nsec < max_shape_secs.nsecs; ++nsec) {
for (int64_t nsec = 1; nsec <= max_shape_secs.nsecs; ++nsec) {
shape_secs.nsecs = nsec;
tensor_infos.clear();
if (stripe_mine_max_slice(lg_info, shape_secs, tensor_infos) == false) {
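In init_group_data_secs above, ceiling_func(total_size, Arch::LMEM_BYTES) turns a byte count into a number of local-memory-sized sections. Assuming ceiling_func is ordinary integer ceiling division (not checked against the repository's MathUtils), it amounts to:

#include <cstdint>

// Assumed semantics of ceiling_func: round-up division for positive integers.
inline std::int64_t ceiling_div(std::int64_t numerator, std::int64_t denominator) {
  return (numerator + denominator - 1) / denominator;
}
// e.g. ceiling_div(3, 2) == 2: three units of data need two LMEM-sized sections.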
16 changes: 8 additions & 8 deletions lib/Dialect/Tpu/Transforms/LayerGroup/TimeStepMethod.cpp
@@ -143,8 +143,8 @@ bool TimeStepMethod::process(BasicTimeStep *time_step, TensorInfo &tensor_infos,
void TimeStepMethod::bubble_tensor_to_best_ts(
std::list<GdmaElt>::iterator sel_list_iter, int64_t cur_ts, int64_t best_ts,
BasicTimeStep *time_step,
std::map<Value, int64_t, value_compare> &tensor_to_cycle,
std::map<Value, int64_t, value_compare> &tensor_to_bufsize,
ValueIntMap &tensor_to_cycle,
ValueIntMap &tensor_to_bufsize,
std::vector<std::list<GdmaElt>> &tensor_timesteps,
std::vector<int64_t> &timestep_cycle_slack) {
// bubble the selected tensor to the right ts
@@ -226,8 +226,8 @@ void TimeStepMethod::memory_aware_timestep_assignment(BasicTimeStep *time_step,
return;
}
std::vector<int64_t> timestep_cycle_slack(timestep_num, 0);
std::map<Value, int64_t, value_compare> tensor_to_cycle;
std::map<Value, int64_t, value_compare> tensor_to_bufsize;
ValueIntMap tensor_to_cycle;
ValueIntMap tensor_to_bufsize;
std::vector<std::list<GdmaElt>> tensor_timesteps;

// remove it after pid_node is extracted
@@ -270,8 +270,8 @@

void TimeStepMethod::get_timestep_cycle_slack(
BasicTimeStep *time_step, const LgInfo &lg_info,
std::map<Value, int64_t, value_compare> &tensor_to_cycle,
std::map<Value, int64_t, value_compare> &tensor_to_bufsize,
ValueIntMap &tensor_to_cycle,
ValueIntMap &tensor_to_bufsize,
std::vector<std::list<GdmaElt>> &tensor_timesteps,
std::vector<int64_t> &timestep_cycle_slack) {
int64_t timestep_num = time_step->get_timestep_num();
@@ -320,8 +320,8 @@ int64_t TimeStepMethod::get_to_ts(bool &is_valid, int64_t cur_ts,

int64_t TimeStepMethod::get_best_ts(
BasicTimeStep *time_step, const LgInfo &lg_info, int64_t cur_ts,
std::map<Value, int64_t, value_compare> &tensor_to_cycle,
std::map<Value, int64_t, value_compare> &tensor_to_bufsize,
ValueIntMap &tensor_to_cycle,
ValueIntMap &tensor_to_bufsize,
std::vector<std::list<GdmaElt>> &tensor_timesteps,
std::vector<int64_t> &timestep_cycle_slack,
std::list<GdmaElt>::iterator &sel_list_iter) {
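bubble_tensor_to_best_ts takes an iterator into one timestep's GDMA list plus cur_ts/best_ts, and the pass keeps one std::list<GdmaElt> per timestep in tensor_timesteps. Whether the implementation relinks nodes or erases and reinserts is not visible in this diff; as a sketch of the container operation involved, std::list::splice moves a single element between the per-timestep lists in O(1) while keeping the iterator valid (GdmaElt is replaced by a plain pair in this stand-alone example, and the slack bookkeeping is omitted):

#include <cstddef>
#include <cstdint>
#include <list>
#include <utility>
#include <vector>

using GdmaEltSketch = std::pair<int, std::int64_t>;  // stand-in for (tensor, info)

// Move the element pointed to by `it` from timestep `cur_ts` to `best_ts`.
// splice() relinks the node without copying and leaves `it` valid, now
// pointing into the destination list.
void move_elt(std::vector<std::list<GdmaEltSketch>> &tensor_timesteps,
              std::list<GdmaEltSketch>::iterator it, std::size_t cur_ts,
              std::size_t best_ts) {
  auto &src = tensor_timesteps[cur_ts];
  auto &dst = tensor_timesteps[best_ts];
  dst.splice(dst.end(), src, it);
}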
