[LayerGroup] fix some errs and refine log
Change-Id: Ie965f219deb28a667dad475ee1894d2c015fa9a3
Boatin committed Jan 9, 2023
1 parent 93d89e3 commit cf6c14b
Showing 7 changed files with 94 additions and 76 deletions.
@@ -106,7 +106,7 @@ struct tensor_info_t {
};

using ValueSet = std::set<Value, value_compare>;
using ValueIntMap = std::set<Value, int64_t, value_compare>;
using ValueIntMap = std::map<Value, int64_t, value_compare>;
using TensorInfo = std::map<Value, tensor_info_t, value_compare>;
using MemBuff = std::map<mem_buffer_key_t, mem_buffer_value_t>;
using MemBuffElt = std::pair<mem_buffer_key_t, mem_buffer_value_t>;
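Note on the ValueIntMap fix above: std::set takes an element type, a comparator, and an allocator, so std::set<Value, int64_t, value_compare> cannot serve as a Value-to-integer container; std::map<Value, int64_t, value_compare> is the intended key/value form. A minimal sketch of how such an alias can hand out stable integer ids, mirroring the value_ids/op_ids numbering used in the reworked show_timestep further down (the comparator and helper below are stand-ins, not the repository's real value_compare):

#include <cstdint>
#include <map>

// Stand-in comparator; the real value_compare orders mlir::Value objects.
struct value_compare_sketch {
  bool operator()(int a, int b) const { return a < b; }
};

using ValueIntMapSketch = std::map<int, std::int64_t, value_compare_sketch>;

// Return the existing id for v, or assign the next free id on first sight.
std::int64_t assign_id(ValueIntMapSketch &ids, int v) {
  auto it = ids.find(v);
  if (it != ids.end())
    return it->second;
  std::int64_t id = static_cast<std::int64_t>(ids.size());
  ids.emplace(v, id);
  return id;
}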
@@ -43,25 +43,25 @@ class TimeStepMethod {

void get_timestep_cycle_slack(
BasicTimeStep *time_step, const LgInfo &lg_info,
std::map<Value, int64_t, value_compare> &tensor_to_cycle,
std::map<Value, int64_t, value_compare> &tensor_to_bufsize,
ValueIntMap &tensor_to_cycle,
ValueIntMap &tensor_to_bufsize,
std::vector<std::list<GdmaElt>> &tensor_timesteps,
std::vector<int64_t> &timestep_cycle_slack);
int64_t get_to_ts(bool &is_valid, int64_t cur_ts, TIMESTEP_LD_ST ld_st,
int64_t range_end);
int64_t
get_best_ts(BasicTimeStep *time_step, const LgInfo &lg_info, int64_t cur_ts,
std::map<Value, int64_t, value_compare> &tensor_to_cycle,
std::map<Value, int64_t, value_compare> &tensor_to_bufsize,
ValueIntMap &tensor_to_cycle,
ValueIntMap &tensor_to_bufsize,
std::vector<std::list<GdmaElt>> &tensor_timesteps,
std::vector<int64_t> &timestep_cycle_slack,
std::list<GdmaElt>::iterator &sel_list_iter);

void bubble_tensor_to_best_ts(
std::list<GdmaElt>::iterator sel_list_iter, int64_t cur_ts,
int64_t best_ts, BasicTimeStep *time_step,
std::map<Value, int64_t, value_compare> &tensor_to_cycle,
std::map<Value, int64_t, value_compare> &tensor_to_bufsize,
ValueIntMap &tensor_to_cycle,
ValueIntMap &tensor_to_bufsize,
std::vector<std::list<GdmaElt>> &tensor_timesteps,
std::vector<int64_t> &timestep_cycle_slack);

2 changes: 1 addition & 1 deletion include/tpu_mlir/Dialect/Tpu/Transforms/Passes.td
@@ -30,7 +30,7 @@ def LayerGroup : Pass<"layer-group", "FuncOp"> {
let dependentDialects = ["TpuDialect"];
let options = [
Option<"opt", "opt", "int64_t", /*default=*/"2",
"opt=1: group layers length as long as possible. opt=2: dynamic programming layer group">,
"opt=1: group layers as many as possible. opt=2: dynamic programming layer group">,
];
}

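For readers unfamiliar with MLIR pass options: an Option declared in the .td file is generated as a member of the pass with the declared default, and the pass body branches on it. A rough C++ sketch of how the opt knob above is typically consumed (the class and method names here are illustrative, not the generated tpu-mlir pass base):

#include <cstdint>

struct LayerGroupPassSketch {
  std::int64_t opt = 2;  // default value declared in Passes.td
  void run() {
    if (opt == 1) {
      // greedy: group layers as many as possible
    } else {
      // opt == 2: dynamic-programming layer group search
    }
  }
};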
85 changes: 44 additions & 41 deletions lib/Dialect/Tpu/Transforms/LayerGroup/BasicTimeStep.cpp
@@ -94,26 +94,62 @@ void BasicTimeStep::show_timestep() {
size_t timestep_num = get_timestep_num();
std::string s;
llvm::raw_string_ostream ss(s);

ValueIntMap value_ids;
std::map<Operation *, int64_t> op_ids;
int64_t idx = 0;
for (size_t ts = 0; ts < timestep_num; ++ts) {
auto &layer_field = getLayers(ts);
for (auto op : layer_field) {
if (op_ids.find(op) == op_ids.end()) {
op_ids[op] = idx++;
}

for (auto in : op->getOperands()) {
if (value_ids.find(in) == value_ids.end()) {
value_ids[in] = idx++;
}
}
for (auto out : get_output_values(op)) {
if (value_ids.find(out) == value_ids.end()) {
value_ids[out] = idx++;
}
}
}
}

mem_buffer_key_t buffer_key;
for (size_t ts = 0; ts < timestep_num; ++ts) {
s.clear();
ss << "=== timestep " << ts << ": \n";
const auto &layer_field = getLayers(ts);
for (auto op : layer_field) {
ss << "layer " << module::getName(op) << "([";
ss << "layer " << op_ids[op] << "([";
for (auto in : op->getOperands()) {
if (in.getType().isa<NoneType>()) {
continue;
}
ss << module::getName(in) << ",";
buffer_key.value = in;
if (dyn_cast_or_null<top::WeightOp>(in.getDefiningOp())) {
buffer_key.type = LMEM_WEIGHT;
} else {
buffer_key.type = LMEM_ACTIVATION;
}
auto &buffer_value = get_lmem_buffer_value(buffer_key);
ss << value_ids[in] << "(" << buffer_value.start_ts << ", "
<< buffer_value.end_ts << "), ";
}
ss << "] -> [";
for (auto out : get_output_values(op)) {
ss << module::getName(out) << ",";
buffer_key.type = LMEM_ACTIVATION;
buffer_key.value = out;
auto &buffer_value = get_lmem_buffer_value(buffer_key);
ss << value_ids[out] << "(" << buffer_value.start_ts << ", "
<< buffer_value.end_ts << "), ";
}
ss << "])\n";
}

mem_buffer_key_t buffer_key;
const auto &tensor_field = getTensors(ts);
ss << "tensor(start_ts, end_ts): ";
for (auto &iter : tensor_field) {
@@ -124,39 +160,13 @@ void BasicTimeStep::show_timestep() {
buffer_key.type = LMEM_ACTIVATION;
}
auto &buffer_value = get_lmem_buffer_value(buffer_key);
ss << module::getName(iter.first) << "(" << buffer_value.start_ts << ", "
ss << value_ids[iter.first] << "(" << buffer_value.start_ts << ", "
<< buffer_value.end_ts << "), ";
}
ss << "\n";
llvm::errs() << s;
}
llvm::errs() << "====================================\n";

// llvm::errs() << "============= show time step =============\n";
// std::string s;
// llvm::raw_string_ostream ss(s);
// for (int time_idx = 0; time_idx < this->get_timestep_num(); ++time_idx) {
// s.clear();
// ss << "=====Time step " << time_idx << "=====\n";
// const TpuTsField &layer_field = timestep_table_[time_idx].tpu0_ts_field;
// for (uint32_t i = 0; i < layer_field.size(); ++i) {
// auto layer = layer_field[i];
// ss << "==layer: ";
// layer->print(ss);
// ss << "(stage=" << this->get_layer_swpipl_stage(layer) << ")\n";
// }
// const GdmaTsField &tensor_field =
// timestep_table_[time_idx].gdma0_ts_field; for (uint32_t i = 0; i <
// tensor_field.size(); ++i) {
// auto tensor = tensor_field[i].first;
// ss << "==tensor: ";
// tensor.print(ss);
// ss << "(stage=" << this->get_tensor_swpipl_stage(tensor) << ")\n";
// }
// ss << "\n";
// llvm::errs() << s;
// }
// llvm::errs() << "====================================\n";
}

void BasicTimeStep::gen_hold_coeff() {
@@ -191,15 +201,14 @@ void BasicTimeStep::gen_all_mem_buffer() {
lmem_buffer_.clear();

mem_buffer_key_t lmem_key;
mem_buffer_value_t lmem_value;
mem_buffer_value_t lmem_value={0};
lmem_value.align_bytes = 32;

for (int64_t stg = 0; stg < this->swpipl_stage_num_; ++stg) {
// add for software pipeline
bool layer_timestep_valid =
(swpipl_stage_num_ == 1) || (swpipl_stage_num_ > 1 && stg == 1);
for (size_t ts = 0; ts < get_timestep_num(); ++ts) {
// add for software pipeline

// process current timestep layers
const TpuTsField &cur_tpu_field = timestep_table_[ts].tpu0_ts_field;
if (layer_timestep_valid) {
@@ -212,8 +221,6 @@

lmem_value.start_ts = ts;
lmem_value.end_ts = -1;
lmem_value.addr = 0;
lmem_value.size = 0;

lmem_buffer_[lmem_key] = lmem_value;
}
@@ -239,8 +246,6 @@

lmem_value.start_ts = ts;
lmem_value.end_ts = ts;
lmem_value.addr = 0;
lmem_value.size = 0;

lmem_buffer_[lmem_key] = lmem_value;
} // cur_tpu_field
@@ -264,8 +269,6 @@

lmem_value.start_ts = ts;
lmem_value.end_ts = -1;
lmem_value.addr = 0;
lmem_value.size = 0;

lmem_buffer_[lmem_key] = lmem_value;
} else if (tensor_info.mode == TIMESTEP_STORE) {
Expand Down Expand Up @@ -435,7 +438,7 @@ int64_t BasicTimeStep::get_tensor_range_end(const GdmaElt &tensor,
// layers
auto &ts_layers = timestep_table_[ts].tpu0_ts_field;
for (auto op : ts_layers) {
auto outs = get_output_values(op);
auto outs = op->getResults();
find_flag = std::find(outs.begin(), outs.end(), v) != outs.end();
if (find_flag) {
result = std::min(result, ts - 1);
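On the gen_all_mem_buffer change above: for an aggregate struct, = {0} zero-initializes the first member and value-initializes the rest, which is presumably why the repeated per-timestep lmem_value.addr = 0; lmem_value.size = 0; resets could be dropped. A small self-contained sketch with a stand-in struct (field names mirror the diff, but this is not the repository's real mem_buffer_value_t, which is assumed here to be an aggregate):

#include <cstdint>
#include <map>

// Illustrative stand-in for mem_buffer_value_t.
struct mem_buffer_value_sketch_t {
  std::int64_t start_ts;
  std::int64_t end_ts;
  std::int64_t addr;
  std::int64_t size;
  std::int64_t align_bytes;
};

int main() {
  std::map<int, mem_buffer_value_sketch_t> lmem_buffer;
  mem_buffer_value_sketch_t lmem_value = {0};  // every field starts at zero
  lmem_value.align_bytes = 32;
  for (int ts = 0; ts < 4; ++ts) {
    lmem_value.start_ts = ts;  // only the fields that vary are rewritten
    lmem_value.end_ts = -1;
    lmem_buffer[ts] = lmem_value;  // addr and size stay 0, as before the cleanup
  }
  return 0;
}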
44 changes: 29 additions & 15 deletions lib/Dialect/Tpu/Transforms/LayerGroup/GroupMethod.cpp
@@ -1,12 +1,12 @@
#include "tpu_mlir/Dialect/Tpu/Transforms/LayerGroup/GroupMethod.h"
#include "mlir/Support/LLVM.h"
#include "omp.h"
#include "progressbar.hpp"
#include "tpu_mlir/Backend/Arch.h"
#include "tpu_mlir/Dialect/Tpu/IR/TpuOps.h"
#include "tpu_mlir/Dialect/Tpu/Transforms/LayerGroup/LayerGroupUtil.h"
#include "tpu_mlir/Dialect/Tpu/Transforms/LayerGroup/GroupMethod.h"
#include "tpu_mlir/Support/MathUtils.h"
#include "tpu_mlir/Support/Module.h"
#include "progressbar.hpp"
#include <list>
#include <map>
#include <set>
@@ -229,6 +229,10 @@ void GroupMethod::sweep_for_min_cost(

void GroupMethod::dynamic_programming_layer_group_with_cluster(
std::vector<LgInfo> &lg_infos, const std::vector<Operation *> &subnet_ops) {
llvm::errs() << "\n"
<< "=======================================================\n"
<< "***** Dynamic Programming layer group with cluster ****\n"
<< "=======================================================\n";
cut_results_.clear();
LgInfo sub_group;
std::vector<std::vector<Operation *>> base_groups;
@@ -255,7 +259,7 @@ void GroupMethod::dynamic_programming_layer_group_with_cluster(
cut_points[j][j] = j;
}
llvm::errs() << "Searching best group slices...\n";
progressbar bar(cluster_num-1);
progressbar bar(cluster_num - 1);
for (size_t len = 2; len <= cluster_num; ++len) {
bar.update();
// llvm::errs() << llvm::format("process cluster len = %d\n", len);
@@ -297,20 +301,25 @@ void GroupMethod::dynamic_programming_layer_group_with_cluster(

show_cut_results();
// some post process for cluster
llvm::errs() << "start consider_redundant_computation_and_gdma_cost\n";
llvm::errs() << "-------------------------------------------------------\n";
llvm::errs() << "Consider redundant computation and gdma cost\n";
llvm::errs() << "-------------------------------------------------------\n";
consider_redundant_computation_and_gdma_cost(base_groups, subnet_ops);
llvm::errs() << "end consider_redundant_computation_and_gdma_cost\n";
show_cut_results();

llvm::errs() << "start merge_cut_idx_to_reduce_gdma_cost\n";
llvm::errs() << "-------------------------------------------------------\n";
llvm::errs() << "Merge cut idx to reduce gdma cost\n";
llvm::errs() << "-------------------------------------------------------\n";
bool take_effective =
merge_cut_idx_to_reduce_gdma_cost(base_groups, subnet_ops);
llvm::errs() << "end merge_cut_idx_to_reduce_gdma_cost\n";
show_cut_results();

if (take_effective) {
llvm::errs() << "start consider_redundant_computation_and_gdma_cost\n";
llvm::errs() << "-------------------------------------------------------\n";
llvm::errs() << "Consider redundant computation and gdma cost again\n"
<< "due to cut idx merged in the previous step\n";
llvm::errs() << "-------------------------------------------------------\n";
consider_redundant_computation_and_gdma_cost(base_groups, subnet_ops);
llvm::errs() << "end consider_redundant_computation_and_gdma_cost\n";
show_cut_results();
}

@@ -511,6 +520,11 @@ bool GroupMethod::merge_cut_idx_to_reduce_gdma_cost(

void GroupMethod::simple_layer_group(
std::vector<LgInfo> &lg_infos, const std::vector<Operation *> &subnet_ops) {
llvm::errs() << "\n"
<< "=======================================================\n"
<< "*********** Group layers as many as possible **********\n"
<< "=======================================================\n";

cut_results_.clear();
LgInfo sub_group;
std::vector<std::vector<Operation *>> base_groups;
@@ -537,11 +551,11 @@
}
} else {
start_idx++;
}
if (start_idx == end_idx && start_idx > 0) {
cut_result.insert(cut_result.begin(), start_idx);
end_idx = start_idx - 1;
start_idx = 0;
if (start_idx == end_idx) {
cut_result.insert(cut_result.begin(), start_idx-1);
end_idx = start_idx - 1;
start_idx = 0;
}
}
}
cut_results_.insert(cut_results_.begin(), std::move(cut_result));
@@ -586,7 +600,7 @@ void GroupMethod::get_final_groups(
void GroupMethod::show_cut_results() {
for (size_t i = 0; i < cut_results_.size(); ++i) {
auto &cut_result = cut_results_[i];
llvm::errs() << "base group idx " << i << " cut results: ";
llvm::errs() << "base group[" << i << "] cut results: ";
for (size_t j = 0; j < cut_result.size(); ++j) {
llvm::errs() << cut_result[j] << ", ";
}
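The dynamic_programming_layer_group_with_cluster changes above only touch logging and formatting, but the surrounding search fills cost and cut_points tables over cluster intervals and then recovers the best group slices from cut_points. For orientation, a generic interval-DP sketch of that kind of cut search (the quadratic group_cost placeholder is invented here; the real pass estimates compute and GDMA cost for each candidate group):

#include <cstdint>
#include <vector>

// Placeholder cost of keeping clusters [i, j] in one group.
static std::int64_t group_cost(int i, int j) { return (j - i + 1) * (j - i + 1); }

// cost[i][j]: best cost for clusters i..j; cut_points[i][j]: where to cut.
void interval_dp(int n, std::vector<std::vector<std::int64_t>> &cost,
                 std::vector<std::vector<int>> &cut_points) {
  cost.assign(n, std::vector<std::int64_t>(n, 0));
  cut_points.assign(n, std::vector<int>(n, 0));
  for (int j = 0; j < n; ++j) {
    cost[j][j] = group_cost(j, j);
    cut_points[j][j] = j;
  }
  for (int len = 2; len <= n; ++len) {
    for (int i = 0; i + len - 1 < n; ++i) {
      int j = i + len - 1;
      cost[i][j] = group_cost(i, j);   // option 1: no cut, one big group
      cut_points[i][j] = j;
      for (int k = i; k < j; ++k) {    // option 2: cut after cluster k
        std::int64_t c = cost[i][k] + cost[k + 1][j];
        if (c < cost[i][j]) {
          cost[i][j] = c;
          cut_points[i][j] = k;
        }
      }
    }
  }
}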
9 changes: 5 additions & 4 deletions lib/Dialect/Tpu/Transforms/LayerGroup/LayerGroupUtil.cpp
@@ -56,9 +56,10 @@ shape_secs_t init_group_data_secs(const LgInfo &lg_info) {
module::getNCHW(outs[0], out_n, out_c, out_h, out_w);
// Need consider different backends
auto lg_op = cast<LocalGenInterface>(op);
total_size += lg_op.getBufferSize(
Arch::get_tensor_lmem_bytes(ins[0], -1, -1),
Arch::get_tensor_lmem_bytes(outs[0], -1, -1), in_n, in_h, out_n, out_h);
total_size +=
lg_op.getBufferSize(Arch::get_tensor_lmem_bytes(ins[0], in_n, in_h),
Arch::get_tensor_lmem_bytes(outs[0], out_n, out_h),
in_n, in_h, out_n, out_h);
total_secs =
std::min(total_secs, ceiling_func(total_size, Arch::LMEM_BYTES));

@@ -138,7 +139,7 @@ bool update_data_split(BasicTimeStepPtr time_step, const LgInfo &lg_info,
bool status = false;
auto &tensor_infos = time_step->get_tensor_infos();
shape_secs_t max_shape_secs = get_group_max_secs(lg_info);
for (int64_t nsec = 1; nsec < max_shape_secs.nsecs; ++nsec) {
for (int64_t nsec = 1; nsec <= max_shape_secs.nsecs; ++nsec) {
shape_secs.nsecs = nsec;
tensor_infos.clear();
if (stripe_mine_max_slice(lg_info, shape_secs, tensor_infos) == false) {
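In init_group_data_secs above, ceiling_func(total_size, Arch::LMEM_BYTES) turns a byte count into a number of local-memory-sized sections. Assuming ceiling_func is ordinary integer ceiling division (not checked against the repository's MathUtils), it amounts to:

#include <cstdint>

// Assumed semantics of ceiling_func: round-up division for positive integers.
inline std::int64_t ceiling_div(std::int64_t numerator, std::int64_t denominator) {
  return (numerator + denominator - 1) / denominator;
}
// e.g. ceiling_div(3, 2) == 2: three units of data need two LMEM-sized sections.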
16 changes: 8 additions & 8 deletions lib/Dialect/Tpu/Transforms/LayerGroup/TimeStepMethod.cpp
@@ -143,8 +143,8 @@ bool TimeStepMethod::process(BasicTimeStep *time_step, TensorInfo &tensor_infos,
void TimeStepMethod::bubble_tensor_to_best_ts(
std::list<GdmaElt>::iterator sel_list_iter, int64_t cur_ts, int64_t best_ts,
BasicTimeStep *time_step,
std::map<Value, int64_t, value_compare> &tensor_to_cycle,
std::map<Value, int64_t, value_compare> &tensor_to_bufsize,
ValueIntMap &tensor_to_cycle,
ValueIntMap &tensor_to_bufsize,
std::vector<std::list<GdmaElt>> &tensor_timesteps,
std::vector<int64_t> &timestep_cycle_slack) {
// bubble the selected tensor to the right ts
@@ -226,8 +226,8 @@ void TimeStepMethod::memory_aware_timestep_assignment(BasicTimeStep *time_step,
return;
}
std::vector<int64_t> timestep_cycle_slack(timestep_num, 0);
std::map<Value, int64_t, value_compare> tensor_to_cycle;
std::map<Value, int64_t, value_compare> tensor_to_bufsize;
ValueIntMap tensor_to_cycle;
ValueIntMap tensor_to_bufsize;
std::vector<std::list<GdmaElt>> tensor_timesteps;

// remove it after pid_node is extracted
@@ -270,8 +270,8 @@

void TimeStepMethod::get_timestep_cycle_slack(
BasicTimeStep *time_step, const LgInfo &lg_info,
std::map<Value, int64_t, value_compare> &tensor_to_cycle,
std::map<Value, int64_t, value_compare> &tensor_to_bufsize,
ValueIntMap &tensor_to_cycle,
ValueIntMap &tensor_to_bufsize,
std::vector<std::list<GdmaElt>> &tensor_timesteps,
std::vector<int64_t> &timestep_cycle_slack) {
int64_t timestep_num = time_step->get_timestep_num();
@@ -320,8 +320,8 @@ int64_t TimeStepMethod::get_to_ts(bool &is_valid, int64_t cur_ts,

int64_t TimeStepMethod::get_best_ts(
BasicTimeStep *time_step, const LgInfo &lg_info, int64_t cur_ts,
std::map<Value, int64_t, value_compare> &tensor_to_cycle,
std::map<Value, int64_t, value_compare> &tensor_to_bufsize,
ValueIntMap &tensor_to_cycle,
ValueIntMap &tensor_to_bufsize,
std::vector<std::list<GdmaElt>> &tensor_timesteps,
std::vector<int64_t> &timestep_cycle_slack,
std::list<GdmaElt>::iterator &sel_list_iter) {
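bubble_tensor_to_best_ts takes an iterator into one timestep's GDMA list plus cur_ts/best_ts, and the pass keeps one std::list<GdmaElt> per timestep in tensor_timesteps. Whether the implementation relinks nodes or erases and reinserts is not visible in this diff; as a sketch of the container operation involved, std::list::splice moves a single element between the per-timestep lists in O(1) while keeping the iterator valid (GdmaElt is replaced by a plain pair in this stand-alone example, and the slack bookkeeping is omitted):

#include <cstddef>
#include <cstdint>
#include <list>
#include <utility>
#include <vector>

using GdmaEltSketch = std::pair<int, std::int64_t>;  // stand-in for (tensor, info)

// Move the element pointed to by `it` from timestep `cur_ts` to `best_ts`.
// splice() relinks the node without copying and leaves `it` valid, now
// pointing into the destination list.
void move_elt(std::vector<std::list<GdmaEltSketch>> &tensor_timesteps,
              std::list<GdmaEltSketch>::iterator it, std::size_t cur_ts,
              std::size_t best_ts) {
  auto &src = tensor_timesteps[cur_ts];
  auto &dst = tensor_timesteps[best_ts];
  dst.splice(dst.end(), src, it);
}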
