Merge pull request PaddlePaddle#453 from graphcore/merge_develop
Merge develop
gglin001 authored Feb 18, 2022
2 parents 10e7953 + c5421eb commit cfe4f6c
Showing 464 changed files with 11,599 additions and 5,711 deletions.
8 changes: 5 additions & 3 deletions .gitignore
@@ -2,16 +2,17 @@ paddle/fluid/operators/distributed/send_recv.proto
paddle/fluid/API.spec
paddle/fluid/API_DEV.spec
paddle/fluid/API_PR.spec
paddle/fluid/eager/api/generated/*
paddle/fluid/op_use_default_grad_maker_DEV.spec
paddle/fluid/op_use_default_grad_maker_PR.spec
paddle/pten/api/backward/backward_api.h
paddle/pten/api/include/api.h
paddle/pten/api/lib/api.cc
paddle/pten/api/backward/backward_api.h
paddle/pten/api/lib/dygraph_api.*
paddle/pten/api/lib/backward_api.cc
paddle/pten/extension.h
paddle/pten/include/*
paddle/pten/infermeta/generated.*
paddle/pten/extension.h
paddle/fluid/eager/api/generated/*

*.DS_Store
*.vs
@@ -51,5 +52,6 @@ paddle/infrt/dialect/pd_ops_info.h
.lit_test_times.txt
paddle/infrt/tests/dialect/Output
paddle/infrt/tests/lit.cfg.py
paddle/infrt/kernel/pten/infershaped/infershaped_kernel_launchers.cc
paddle/fluid/pybind/eager_final_state_op_function_impl.h
paddle/fluid/pybind/tmp_eager_final_state_op_function_impl.h
2 changes: 1 addition & 1 deletion cmake/external/xpu.cmake
@@ -36,7 +36,7 @@ ENDIF()

if(NOT DEFINED XPU_BASE_URL)
SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220119")
SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220215")
else()
SET(XPU_BASE_URL "${XPU_BASE_URL}")
endif()
131 changes: 29 additions & 102 deletions paddle/fluid/distributed/fleet_executor/dist_model.cc
@@ -53,7 +53,6 @@ bool LoadDataFromDistModelTensor(const DistModelTensor &input_data,
} else if (input_data.dtype == DistModelDataType::INT32) {
input_tensor_ptr = input_tensor->mutable_data<int32_t>(dims, place);
} else {
// Q(fleet exe dev): for input/output, should we support fp16
LOG(ERROR) << "unsupported feed type " << input_data.dtype;
return false;
}
@@ -113,14 +112,6 @@ std::string DistModelDTypeToString(DistModelDataType dtype) {
return "NOT SUPPORT DTYPE";
}

bool IsPPFirstStage(const DistModelConfig &config) {
return config.local_rank - config.mp_degree < 0;
}

bool IsPPLastStage(const DistModelConfig &config) {
return config.local_rank + config.mp_degree >= config.nranks;
}

class DistModelTimer {
public:
void tic() { tic_time = std::chrono::high_resolution_clock::now(); }
@@ -197,65 +188,34 @@ bool DistModel::PreparePlace() {
}

bool DistModel::CommInit() {
// NOTE (Yuang Liu): The peer endpoints will be obtained with the assumption
// that mp part is always on inner side and pp part is always on outer side.
// TODO(fleet exe dev): The peer endpoints could be configured by users.
PADDLE_ENFORCE_EQ(
config_.pp_degree * config_.mp_degree, config_.nranks,
platform::errors::InvalidArgument(
"The mp_degree multiplies pp_degree is not equal with nranks"));
std::unique_ptr<framework::ProgramDesc> comm_init_program(
new framework::ProgramDesc());
framework::BlockDesc *comm_init_block = comm_init_program->MutableBlock(0);
if (config_.mp_degree > 1) {
PADDLE_ENFORCE_GE(
config_.mp_ring_id, 0,
platform::errors::InvalidArgument(
"mp ring id must be provided for inference under mp."));
VLOG(3) << "Init comm group for mp.";
std::vector<int64_t> &ring_ids =
config_.rank_to_ring_ids_[config_.local_rank];
int64_t order = 0;
std::string var_name_base = "comm_init_";
for (int64_t ring_id : ring_ids) {
VLOG(3) << "Init comm for ring id: " << ring_id;
int64_t ranks_in_group = config_.ring_id_to_ranks_[ring_id].size();
int64_t rank_in_group = 0;
std::vector<int64_t> &ranks = config_.ring_id_to_ranks_[ring_id];
for (int64_t rank : ranks) {
if (config_.local_rank == rank) {
break;
}
rank_in_group += 1;
}
std::vector<std::string> peer_endpoints;
for (int64_t
idx = (config_.local_rank / config_.mp_degree) * config_.mp_degree,
i = 0;
i < config_.mp_degree; ++idx, ++i) {
if (config_.trainer_endpoints[idx] == config_.current_endpoint) {
for (int64_t rank : ranks) {
if (config_.local_rank == rank) {
continue;
}
peer_endpoints.emplace_back(config_.trainer_endpoints[idx]);
}
// get nranks in a mp group and inner group rank for local rank
int64_t mp_group_nranks = config_.nranks / config_.pp_degree;
int64_t mp_group_rank = config_.local_rank % config_.mp_degree;
InsertCommOp("mp_comm_id", mp_group_nranks, mp_group_rank, peer_endpoints,
comm_init_block, config_.mp_ring_id);
}
if (config_.pp_degree > 1) {
VLOG(3) << "Init comm group for pp.";
if (!IsPPFirstStage(config_)) {
PADDLE_ENFORCE_EQ(config_.pp_upstream_ring_id >= 0, true,
platform::errors::InvalidArgument(
"pp upstream ring id must be provided for "
"non-first pp stage if inference under pp."));
// not the first pp stage, has upstream
std::vector<std::string> upstream_peer_endpoints;
upstream_peer_endpoints.emplace_back(
config_.trainer_endpoints[config_.local_rank - config_.mp_degree]);
InsertCommOp("pp_upstream_comm_id", 2, 1, upstream_peer_endpoints,
comm_init_block, config_.pp_upstream_ring_id);
}

if (!IsPPLastStage(config_)) {
PADDLE_ENFORCE_EQ(config_.pp_downstream_ring_id >= 0, true,
platform::errors::InvalidArgument(
"pp downstream ring id must be provided for "
"non-last pp stage if inference under pp."));
// not the last pp stage, has downstream
std::vector<std::string> downstream_peer_endpoints;
downstream_peer_endpoints.emplace_back(
config_.trainer_endpoints[config_.local_rank + config_.mp_degree]);
InsertCommOp("pp_downstream_comm_id", 2, 0, downstream_peer_endpoints,
comm_init_block, config_.pp_downstream_ring_id);
peer_endpoints.emplace_back(config_.trainer_endpoints[rank]);
}
InsertCommOp(var_name_base + std::to_string(order), ranks_in_group,
rank_in_group, peer_endpoints, comm_init_block, ring_id);
order += 1;
}
framework::NaiveExecutor e(place_);
e.CreateVariables(*comm_init_program, 0, true, scope_.get());
@@ -409,12 +369,7 @@ bool DistModel::LoadParameters() {

bool DistModel::PrepareFleetExe() {
task_node_.reset(new TaskNode(program_.get(), config_.local_rank));
if (config_.local_rank - config_.mp_degree >= 0) {
task_node_->AddUpstreamTask(config_.local_rank - config_.mp_degree);
}
if (config_.local_rank + config_.mp_degree < config_.nranks) {
task_node_->AddDownstreamTask(config_.local_rank + config_.mp_degree);
}
// With auto cut, there is no concept of pp, no need to add dependency.
task_node_->SetType("Compute");
task_node_->Init();
executor_desc_ = FleetExecutorDesc();
@@ -473,40 +428,13 @@ bool DistModel::PrepareFeedAndFetch() {
}
}

if (config_.pp_degree == 1) {
if (feeds_.size() == 0) {
LOG(ERROR) << "No feed ops in the inf program, please check the program.";
return false;
}
if (fetches_.size() == 0) {
LOG(ERROR) << "No fetch op in the inf program, please check the program.";
return false;
}
} else {
if (IsPPFirstStage(config_)) {
if (feeds_.size() == 0) {
LOG(ERROR) << "Feed ops are needed for the first pp stage.";
return false;
}
} else {
if (feeds_.size() > 0) {
LOG(WARNING) << "Feed op is found in the non-first stage of pp.";
} else {
LOG(INFO) << "No feed ops in non-first pp stage.";
}
}
if (IsPPLastStage(config_)) {
if (fetches_.size() == 0) {
LOG(WARNING) << "No fetch op was found in the last pp stage. Make sure "
"the result has been sent to frist pp stage.";
}
} else {
if (fetches_.size() > 0) {
LOG(WARNING) << "Fetch op is found in the non-last stage of pp.";
} else {
LOG(INFO) << "No fetch op in non-last pp stage.";
}
}
if (feeds_.size() == 0) {
LOG(ERROR) << "No feed ops in the inf program, please check the program.";
return false;
}
if (fetches_.size() == 0) {
LOG(ERROR) << "No fetch op in the inf program, please check the program.";
return false;
}
return true;
}
@@ -606,7 +534,6 @@ bool DistModel::FetchResult(const framework::LoDTensor &fetch,

bool DistModel::Run(const std::vector<DistModelTensor> &input_data,
std::vector<DistModelTensor> *output_data) {
// TODO(fleet exe dev): support pipeline inf mode
VLOG(3) << "DistModel run for once.";

DistModelTimer timer;
8 changes: 3 additions & 5 deletions paddle/fluid/distributed/fleet_executor/dist_model.h
@@ -13,6 +13,7 @@
// limitations under the License.

#pragma once
#include <map>
#include <memory>
#include <string>
#include <vector>
@@ -47,12 +48,9 @@ struct DistModelConfig {
std::string current_endpoint{};
int64_t nranks{1};
int64_t local_rank{0};
int64_t mp_degree{1};
int64_t pp_degree{1};
int64_t mp_ring_id{-1};
int64_t pp_upstream_ring_id{-1};
int64_t pp_downstream_ring_id{-1};
bool enable_timer{false};
std::map<int64_t, std::vector<int64_t>> ring_id_to_ranks_{};
std::map<int64_t, std::vector<int64_t>> rank_to_ring_ids_{};
};

class DistModel {
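The rewritten CommInit and the two map fields added to DistModelConfig above replace the fixed mp/pp topology arithmetic with user-supplied ring descriptions. The following is a minimal sketch, not code from this commit: the ToyDistModelConfig struct and the concrete ring/rank values are illustrative assumptions showing how rank_to_ring_ids_ and ring_id_to_ranks_ might be filled for two ranks that share a single ring.

// Hypothetical stand-in for DistModelConfig, reduced to the topology fields.
#include <cstdint>
#include <map>
#include <vector>

struct ToyDistModelConfig {
  int64_t nranks{2};
  int64_t local_rank{0};
  std::map<int64_t, std::vector<int64_t>> ring_id_to_ranks_{};
  std::map<int64_t, std::vector<int64_t>> rank_to_ring_ids_{};
};

int main() {
  ToyDistModelConfig config;
  config.ring_id_to_ranks_[0] = {0, 1};  // ring 0 holds both ranks
  config.rank_to_ring_ids_[0] = {0};     // rank 0 joins ring 0
  config.rank_to_ring_ids_[1] = {0};     // rank 1 joins ring 0
  // In the rewritten CommInit, each process looks up
  // rank_to_ring_ids_[local_rank], then walks ring_id_to_ranks_[ring_id] to
  // compute its rank_in_group and collect peer endpoints before inserting one
  // comm-init op per ring.
  return 0;
}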
@@ -25,7 +25,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/program_desc.h"

USE_OP(elementwise_add);
USE_OP_ITSELF(elementwise_add);
USE_OP(fill_constant);

namespace paddle {
33 changes: 18 additions & 15 deletions paddle/fluid/eager/auto_code_generator/eager_generator.cc
@@ -1227,11 +1227,11 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
// Forward Function Body
// According to fwd_inputs_name_pos_map
std::map<std::string, std::vector<std::shared_ptr<egr::EagerTensor>>>
std::map<std::string, std::vector<std::shared_ptr<egr::EagerVariable>>>
ins =
{ {"X" , TrySyncToVars(X)}, { "Y" , TrySyncToVars(Y)} };
std::map<std::string, std::vector<std::shared_ptr<egr::EagerTensor>>>
std::map<std::string, std::vector<std::shared_ptr<egr::EagerVariable>>>
outs =
{
{"Out0" , CreateVars(Out0Num)}, {"Out1"
@@ -1316,7 +1316,7 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(

const char* FWD_INS_MAP_TEMPLATE =
" std::map<std::string, "
"std::vector<std::shared_ptr<egr::EagerTensor>>> ins = { "
"std::vector<std::shared_ptr<egr::EagerVariable>>> ins = { "
"%s };\n";
std::string ins_map_str =
paddle::string::Sprintf(FWD_INS_MAP_TEMPLATE, ins_contents_str);
@@ -1353,8 +1353,9 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
if (op_passing_outs_map[op_type].count(output_name)) {
const std::string output_var_name = output_name + "Var";

// Pass Output from function argument(EagerTensor*/vector<EagerTensor*>&),
// in form of shared_ptr<EagerTensor>/vector<shared_ptr<EagerTensor>>
// Pass Output from function
// argument(EagerVariable*/vector<EagerVariable*>&),
// in form of shared_ptr<EagerVariable>/vector<shared_ptr<EagerVariable>>
if (output.duplicable()) {
const char* FWD_NUM_ARG_TEMPLATE =
", std::vector<paddle::experimental::Tensor*>& %s";
@@ -1395,7 +1396,7 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
} else {
const char* FWD_OUTS_CONTENT_TEMPLATE =
"{ \"%s\", "
"{std::make_shared<egr::EagerTensor>(egr::Controller::Instance()."
"{std::make_shared<egr::EagerVariable>(egr::Controller::Instance()."
"GenerateUniqueName())}},";
outs_contents_str +=
paddle::string::Sprintf(FWD_OUTS_CONTENT_TEMPLATE, output_name);
@@ -1407,7 +1408,7 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(

const char* FWD_OUTS_MAP_TEMPLATE =
" std::map<std::string, "
"std::vector<std::shared_ptr<egr::EagerTensor>>> outs = { "
"std::vector<std::shared_ptr<egr::EagerVariable>>> outs = { "
"%s };\n";
std::string outs_map_str =
paddle::string::Sprintf(FWD_OUTS_MAP_TEMPLATE, outs_contents_str);
@@ -1482,7 +1483,7 @@ static std::pair<std::string, std::string> GenerateForwardFunctionContents(
generated_function_body += out_tensor_str;
}
generated_function_body += "\n";
VLOG(6) << "Converted Output VarBase to EagerTensor(s)";
VLOG(6) << "Converted Output VarBase to EagerVariable(s)";

// [Generation] Handle core_ops_returns_info
core_ops_returns_info[op_type] = return_contents;
@@ -1611,7 +1612,7 @@ static std::string GenerateSingleOpBase(
size_t fwd_output_position = fwd_outputs_name_pos_map.at(
grad_ins_grad_slotname_map.at(grad_input_name));
const char* GRAD_INS_GRAD_CONTENT_TEMPLATE =
"{ \"%s\", egr::EagerUtils::TrySyncToVars(grads[%d]) },";
"{ \"%s\", egr::EagerUtils::TrySyncToVars(hooked_grads[%d]) },";
ins_contents_str += paddle::string::Sprintf(
GRAD_INS_GRAD_CONTENT_TEMPLATE, grad_input_name, fwd_output_position);

@@ -1627,7 +1628,7 @@

const char* BWD_INS_MAP_TEMPLATE =
" std::map<std::string, "
"std::vector<std::shared_ptr<egr::EagerTensor>>> %s = { "
"std::vector<std::shared_ptr<egr::EagerVariable>>> %s = { "
"%s };\n";
std::string ins_map_str =
paddle::string::Sprintf(BWD_INS_MAP_TEMPLATE, ins_name, ins_contents_str);
@@ -1688,7 +1689,7 @@ static std::string GenerateSingleOpBase(
size_t grads_position = fwd_outputs_name_pos_map.at(fwd_name);

const char* GRAD_OUTS_CONTENT_TEMPLATE =
"{ \"%s\", egr::EagerUtils::TrySyncToVars(grads[%d]) },";
"{ \"%s\", egr::EagerUtils::TrySyncToVars(hooked_grads[%d]) },";
outs_contents_str += paddle::string::Sprintf(
GRAD_OUTS_CONTENT_TEMPLATE, grad_output_name, grads_position);

@@ -1704,7 +1705,7 @@
} else {
const char* GRAD_OUTS_CONTENT_TEMPLATE =
"{ \"%s\", "
"{std::make_shared<egr::EagerTensor>(egr::Controller::Instance("
"{std::make_shared<egr::EagerVariable>(egr::Controller::Instance("
")."
"GenerateUniqueName())}},";
outs_contents_str += paddle::string::Sprintf(
@@ -1723,7 +1724,7 @@

const char* BWD_OUTS_MAP_TEMPLATE =
" std::map<std::string, "
"std::vector<std::shared_ptr<egr::EagerTensor>>> %s = { "
"std::vector<std::shared_ptr<egr::EagerVariable>>> %s = { "
"%s };\n";
std::string outs_map_str = paddle::string::Sprintf(
BWD_OUTS_MAP_TEMPLATE, outs_name, outs_contents_str);
@@ -1848,9 +1849,9 @@ static std::string GenerateGradNodeCCContents(
{
"X" : this->"X", "Y" : this->"Y",
"Out0@Grad":
TrySyncToVars(grads["fwd_outputs_name_pos_map[grad_ins_grad_slotname_map["Out0@Grad"]]"]),
TrySyncToVars(hooked_grads["fwd_outputs_name_pos_map[grad_ins_grad_slotname_map["Out0@Grad"]]"]),
"Out1@Grad":
TensorsToVarBases(grads["fwd_outputs_name_pos_map[grad_ins_grad_slotname_map["Out1@Grad"]]"])
TensorsToVarBases(hooked_grads["fwd_outputs_name_pos_map[grad_ins_grad_slotname_map["Out1@Grad"]]"])
};
// Comes from "grad_outs"
@@ -1934,6 +1935,8 @@ static std::string GenerateGradNodeCCContents(
}

const char* BWD_RETURN_TEMPLATE =
" std::vector<std::vector<paddle::experimental::Tensor>> hooked_grads = "
"egr::GradNodeBase::ApplyGradientHooks(grads);\n"
" std::vector<std::vector<paddle::experimental::Tensor>> outputs(%d);\n"
" %s\n"
" return outputs;\n";
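The changes to GenerateSingleOpBase and BWD_RETURN_TEMPLATE above make every generated backward body call ApplyGradientHooks on the incoming grads and feed the resulting hooked_grads into the op's input map, instead of using grads directly. Below is a minimal sketch of that ordering only; the Tensor alias, hook signature, map layout, and values are assumptions standing in for the generated egr code, not the generator's real output.

#include <functional>
#include <iostream>
#include <map>
#include <string>
#include <vector>

using Tensor = double;  // toy stand-in for paddle::experimental::Tensor
using TensorLists = std::vector<std::vector<Tensor>>;
using Hook = std::function<Tensor(const Tensor&)>;

// Mirrors the new template: hooks run once, up front, on the raw grads.
TensorLists ApplyGradientHooks(const TensorLists& grads,
                               const std::vector<Hook>& hooks) {
  TensorLists hooked = grads;
  for (auto& slot : hooked)
    for (auto& g : slot)
      for (const auto& h : hooks) g = h(g);
  return hooked;
}

int main() {
  TensorLists grads = {{1.0, 2.0}};
  std::vector<Hook> hooks = {[](const Tensor& g) { return 2.0 * g; }};

  // Shape of the generated body after this commit: hooked_grads first,
  // then the backward ins map is built from hooked_grads, not grads.
  TensorLists hooked_grads = ApplyGradientHooks(grads, hooks);
  std::map<std::string, std::vector<Tensor>> ins = {
      {"Out0@Grad", hooked_grads[0]}};

  std::cout << ins["Out0@Grad"][0] << "\n";  // 2: the hook was applied
  return 0;
}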
(Diffs for the remaining changed files are not shown.)
