apache · xinyual · Dec 3, 2021 · Dec 3, 2021 · Dec 3, 2021 · Dec 3, 2021
diff --git a/3rdparty/onednn b/3rdparty/onednn
@@ -132,7 +132,9 @@ class Executor {
                                       1,
                                       nullptr,
                                       nullptr),
-                 0);
+                 0,
+                 nullptr,
+                 nullptr);
       } else {
         CHECK_EQ(MXAutogradBackwardEx(out_handles.size(),
                                       out_handles.data(),
@@ -144,7 +146,10 @@ class Executor {
                                       1,
                                       nullptr,
                                       nullptr),
-                 0);
+                 0,
+                 0,
+                 nullptr,
+                 nullptr);
       }
       grad_arrays.clear();
       grad_arrays.reserve(arg_arrays.size());

@@ -49,4 +49,4 @@ MXReturnValue initialize(int version) {
     MX_ERROR_MSG << "MXNet version " << version << " not supported" << std::endl;
     return MX_FAIL;
   }
-}
+}
@@ -67,4 +67,4 @@ def test_model(pass_name):
     sym_block2.optimize_for(mx.nd.ones((3,2)), mx.nd.ones((3,2)), backend=pass_name)
     sym_block2.export('modified')
 
-test_model('myPass')
+test_model('myPass')
@@ -0,0 +1,24 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# None of these targets produce a file with the target's own name.
+.PHONY: all pass_lib clean
+
+all: pass_lib
+
+# Build the graph-pass shared library add_reduce_op_lib.so.
+pass_lib:
+	g++ -shared -fPIC -std=c++11 add_reduce_op.cc ../../../src/lib_api.cc -o add_reduce_op_lib.so -I ../../../include
+
+# Remove the library produced by pass_lib.
+clean:
+	rm -f add_reduce_op_lib.so
@@ -0,0 +1,76 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one
+  ~ or more contributor license agreements.  See the NOTICE file
+  ~ distributed with this work for additional information
+  ~ regarding copyright ownership.  The ASF licenses this file
+  ~ to you under the Apache License, Version 2.0 (the
+  ~ "License"); you may not use this file except in compliance
+  ~ with the License.  You may obtain a copy of the License at
+  ~
+  ~   http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing,
+  ~ software distributed under the License is distributed on an
+  ~ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  ~ KIND, either express or implied.  See the License for the
+  ~ specific language governing permissions and limitations
+  ~ under the License.
+  ~
+-->
+
+Add Reduce operation to computation Graph
+=======================================
+
+## Introduction
+This is part of the work of porting [DeepSpeed's work](https://arxiv.org/abs/1910.02054) to MXNet.
+Because of the differences between symbolic and imperative execution, we divide the whole process into two phases:  
+
+Phase 1: Add a reduce operation to the graph. The reduce operation does nothing
+in the forward pass but reduces the gradient to the right GPU (according to POS_Trainer).  
+
+Phase 2: In the backward graph, delete the outputs in arrays so the memory planner can reuse that memory.  
+
+ ## Getting started
+ ### Prepare NCCL and Horovod
+ We use Horovod for communication and NCCL for the reduce, so please install both of them first.  
+
+ ### Compile the graph pass and load it
+ First compile it in the same way as [lib pass](../lib_pass/): run `make`, which generates the dynamic library
+ **add_reduce_op_lib.so** from the `add_reduce_op.cc` file. Then load that file in your Python code like this:
+```python
+import mxnet as mx
+mx.library.load('add_reduce_op_lib.so')
+```
+
+ ### Prepare options
+ Next we need to know how the parameters and gradients are partitioned across the GPUs.
+ To get this, use **POS_Trainer** from `pos_trainer.py` just like a normal trainer in MXNet.
+ ```python
+from pos_trainer import POS_Trainer
+trainer = POS_Trainer(params_dict, "adam", optimizer_params)
+```
+The trainer can then generate the corresponding options:
+ ```python
+options = trainer.generate_graph_pass_options()
+backward_options = trainer.generate_backward_options()
+```
+### Modify the graph
+Before the forward pass, we use 
+ ```python
+model.optimize_for(x, backend = "add_reduce_op", **options)
+```
+to insert the reduce operations into the graph.   
+![example add reduce](addreduce.png)   
+
+Then we call backward with the backward options: 
+ ```python
+loss.backward(backward_option = backward_options)
+```
+### Simple Example
+Please see `test_reduce.py` 
+
+### Current problems
+1. The reduce operation causes a deadlock (this does not happen with NaiveEngine). Moreover, it runs into an invalid-address 
+problem in complex models like BERT-Base.
+2. We do remove outputs from the backward graph using the backward options, but we still need to verify whether this decreases memory 
+consumption.
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file add_reduce_op.cc
+ * \brief graph pass library that inserts NCCL reduce operators into the graph
+ */
+
+#include <cmath>
+#include <iostream>
+#include <algorithm>
+#include <string>
+#include "mxnet/lib_api.h"
+
+using namespace mxnet::ext;
+
+
+
+/*!
+ * \brief Graph pass that inserts a `_contrib_NCCLReduce` node behind selected nodes.
+ *
+ * The options "rank", "nccl_unique_id" and "num_gpus" are forwarded as attributes to
+ * every inserted node. In addition, every node of the graph whose name appears as a
+ * key in `options` gets a new node named "ncclreduce_<name>" placed between it and
+ * all of its consumers; the option value for that name becomes the new node's
+ * "root_rank" attribute. Nodes whose name is not an option key are left untouched.
+ *
+ * \param g graph to modify in place
+ * \param options pass options (see above); each option is also echoed to stdout
+ * \return always MX_SUCCESS
+ */
+MXReturnValue add_reduce_op(mxnet::ext::Graph* g,
+                     const std::unordered_map<std::string, std::string>& options) {
+  std::string cur_rank;
+  std::string num_gpus;
+  std::string nccl_unique_id;
+
+  for (const auto& kv : options) {
+    std::cout << "option: " << kv.first << " ==> " << kv.second << std::endl;
+    if (kv.first == "rank") {
+      cur_rank = kv.second;
+    } else if (kv.first == "nccl_unique_id") {
+      nccl_unique_id = kv.second;
+    } else if (kv.first == "num_gpus") {
+      num_gpus = kv.second;
+    }
+  }
+
+  // Snapshot the node count first: nodes appended by addNode() below must not be
+  // visited by this loop.
+  const int num_nodes = static_cast<int>(g->size());
+  for (int node_idx = 0; node_idx < num_nodes; ++node_idx) {
+    mxnet::ext::Node* target_node = g->getNode(node_idx);
+    const auto option_it = options.find(target_node->name);
+    if (option_it == options.end()) {
+      continue;  // no entry for this node (req_grad == null)
+    }
+    const std::string& root_rank = option_it->second;
+
+    mxnet::ext::Node* new_reduce =
+        g->addNode("ncclreduce_" + target_node->name, "_contrib_NCCLReduce");
+
+    // Carry over only the dtype/shape/profiler attributes of the original node.
+    auto& new_attrs = new_reduce->attrs;
+    for (const auto& attr : target_node->attrs) {
+      if (attr.first == "__ext_dtype__" || attr.first == "__ext_shape__" ||
+          attr.first == "__profiler_scope__") {
+        new_attrs.insert({{attr.first, attr.second}});
+      }
+    }
+    new_attrs.insert({{"nccl_unique_id", nccl_unique_id}});
+    new_attrs.insert({{"num_gpus", num_gpus}});
+    new_attrs.insert({{"rank", cur_rank}});
+    new_attrs.insert({{"root_rank", root_rank}});
+
+    // Hand every consumer of target_node over to new_reduce. The `entry` of an
+    // output edge is used as the index into the consumer's `inputs`.
+    // NOTE(review): the consumer's input keeps its original `entry` value; this
+    // presumably relies on target_node having a single output (index 0) - confirm.
+    for (const auto& consumer : target_node->outputs) {
+      new_reduce->outputs.push_back(consumer);
+      consumer.node->inputs[consumer.entry].node = new_reduce;
+    }
+
+    // Drop ALL old output edges. (Popping inside a loop bounded by the shrinking
+    // size() removed only about half of them.)
+    target_node->outputs.clear();
+    target_node->outputs.push_back({new_reduce, 0});
+    new_reduce->inputs.push_back({target_node, 0});
+  }
+  g->print();
+
+  return MX_SUCCESS;
+}
+
+
+
+// Register a pass named add_reduce_op whose body is the add_reduce_op function.
+REGISTER_PASS(add_reduce_op).setBody(add_reduce_op);
+
+/*!
+ * \brief Check whether this library supports the given MXNet version.
+ *
+ * Versions of 10700 and above are accepted and reported on stdout; anything older
+ * is recorded through MX_ERROR_MSG.
+ *
+ * \param version MXNet version number to check
+ * \return MX_SUCCESS if version >= 10700, MX_FAIL otherwise
+ */
+MXReturnValue initialize(int version) {
+  const bool is_supported = version >= 10700;
+  if (!is_supported) {
+    MX_ERROR_MSG << "MXNet version " << version << " not supported" << std::endl;
+    return MX_FAIL;
+  }
+  std::cout << "MXNet version " << version << " supported" << std::endl;
+  return MX_SUCCESS;
+}