diff --git a/maga_transformer/cpp/models/GptModel.cc b/maga_transformer/cpp/models/GptModel.cc
index 27a67f3b..2dc220c5 100644
--- a/maga_transformer/cpp/models/GptModel.cc
+++ b/maga_transformer/cpp/models/GptModel.cc
@@ -111,6 +111,37 @@ void GptModel::prepareAttentionInputs(
     attention_inputs.attention_mask = inputs.attention_mask;
 }
 
+
+/*
+ *             ┌───────────┐
+ *             │  hidden   │
+ *             └─────┬─────┘
+ *                   │
+ *                   │
+ *           ┌───────▼───────┐
+ *           │ pre_layernorm?├─────────┐
+ *           └───────┬───────┘         │
+ *                   │                 │
+ *             ┌─────▼─────┐           │
+ *             │ attention │           │
+ *             └─────┬─────┘           │
+ *                   │                 │
+ *           ┌───────▼───────┐         │
+ *    ┌──────┤post_attn_norm?◄─────────┘
+ *    │      └───────┬───────┘
+ *    │              │
+ *    │         ┌────▼────┐
+ *    │         │   mlp   │
+ *    │         └────┬────┘
+ *    │              │
+ *    │         ┌────▼────┐
+ *    └─────────►   add   │
+ *              └────┬────┘
+ *                   │
+ *             ┌─────▼─────┐
+ *             │ layernorm │
+ *             └───────────┘
+ */
 GptModelOutputs GptModel::forward(const GptModelInputs& inputs) {
     const auto norm_type = description_.norm_type;
     const auto norm_eps = description_.layernorm_eps;
@@ -163,6 +194,7 @@ GptModelOutputs GptModel::forward(const GptModelInputs& inputs) {
 
         auto attn_out_buf = device_->allocateBuffer({hidden->type(), hidden->shape()}, {"attn_out_buf"});
         auto residual = hidden;
+        BufferPtr residual2 = nullptr;
         if (layer.pre_layernorm) {
             residual = device_->clone({*hidden, AllocationType::DEVICE, {"residual"}});
             device_->layernorm(LayernormParams(
@@ -205,7 +237,7 @@ GptModelOutputs GptModel::forward(const GptModelInputs& inputs) {
                 residual = attn_hidden;
             }
         } else {
-            hidden = move(attn_hidden);
+            residual2 = attn_hidden;
         }
 
         printBufferData(*hidden, "layer_" + to_string(i) + "_ffn_input");
@@ -226,7 +258,8 @@ GptModelOutputs GptModel::forward(const GptModelInputs& inputs) {
             *hidden, *hidden, nullopt, norm_type,
             ft::mayGetRef(layer.post_ffn_layernorm), norm_eps,
             device_props_.ffn_fuse_add_residual ? nullopt : (OptionalConstBufferRef)*residual,
-            nullopt, ft::mayGetRef(layer.ffn_weights.down_weight->bias)));
+            (residual2 == nullptr) ? nullopt : (OptionalConstBufferRef)*residual2,
+            ft::mayGetRef(layer.ffn_weights.down_weight->bias)));
 
         printBufferData(*hidden, "layer_" + to_string(i) + "_final_hidden");
     }
diff --git a/src/fastertransformer/devices/base_impl/FfnLayer.cc b/src/fastertransformer/devices/base_impl/FfnLayer.cc
index 161e75d2..650750a7 100644
--- a/src/fastertransformer/devices/base_impl/FfnLayer.cc
+++ b/src/fastertransformer/devices/base_impl/FfnLayer.cc
@@ -1,6 +1,6 @@
 #include "src/fastertransformer/devices/DeviceBase.h"
 #include "src/fastertransformer/devices/OpData.h"
-
+#include "src/fastertransformer/devices/utils/DebugUtils.h"
 using namespace std;
 
 namespace fastertransformer {
@@ -46,7 +46,7 @@ FfnLayerOutput DeviceBase::ffnLayer(const FfnLayerParams& params) {
                                     std::nullopt,
                                     *(params.weights.up_weight),
                                     std::nullopt});
-
+    printBufferData(*up_output.output, "ffn_up");
     if (FFNDispatch::dispatch(params) == FFNDispatch::FFNType::Gate) {
         {
             auto gate_output = loraLinear({params.input,
@@ -73,11 +73,12 @@ FfnLayerOutput DeviceBase::ffnLayer(const FfnLayerParams& params) {
                                        mayGetRef(params.weights.up_weight->bias),
                                        std::nullopt,
                                        std::nullopt});
-
+        printBufferData(*up_output.output, "ffn_act");
         auto output = loraLinear({*(up_output.output),
                                   std::nullopt,
                                   *(params.weights.down_weight),
                                   std::nullopt});
+        printBufferData(*output.output, "ffn_out");
         return FfnLayerOutput({move(output.output)});
     } else {
         throw OpException(OpErrorType::ERROR_UNIMPLEMENTED);
diff --git a/src/fastertransformer/devices/cuda_impl/CudaLayernorm.cc b/src/fastertransformer/devices/cuda_impl/CudaLayernorm.cc
index 5c7dbef9..7b25458b 100644
--- a/src/fastertransformer/devices/cuda_impl/CudaLayernorm.cc
+++ b/src/fastertransformer/devices/cuda_impl/CudaLayernorm.cc
@@ -36,7 +36,7 @@ LayernormOutput CudaDevice::layernorm(const LayernormParams& params) {
                 n,
                 stream_
             );
-        } else if (params.bias.has_value() || params.residual1.has_value()) {
+        } else if (params.bias.has_value() || params.residual1.has_value() || params.residual2.has_value()) {
            DISPATCH_CUDA_FUNCTION_DATA_TYPE(data_type, invokeAddBiasResidual,
                 output.data(),
                 input.data(),
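
For reference, a minimal host-side sketch of the two-residual epilogue this patch wires up: when a layer has no post-attention layernorm, the attention output is now kept in residual2 and handed to layernorm() alongside the pre-attention residual, instead of being folded into hidden up front. The helper below is hypothetical and simplified (std::vector instead of Buffer, no normalization step, name addBiasTwoResiduals is invented); it only illustrates the arithmetic that the residual2 branch of invokeAddBiasResidual is expected to perform.

#include <cstddef>
#include <optional>
#include <vector>

// Hypothetical reference sketch, not code from the repository.
// hidden    : FFN down-projection output for one token
// residual  : hidden state saved before attention (residual1)
// residual2 : attention output, now passed separately (absent when a
//             post-attention layernorm already consumed it)
// bias      : down-projection bias
static void addBiasTwoResiduals(std::vector<float>& hidden,
                                const std::vector<float>& residual,
                                const std::optional<std::vector<float>>& residual2,
                                const std::vector<float>& bias) {
    for (size_t i = 0; i < hidden.size(); ++i) {
        hidden[i] += residual[i] + bias[i];   // residual1 + bias, as before
        if (residual2) {
            hidden[i] += (*residual2)[i];     // extra term enabled by the residual2 check
        }
    }
    // A layernorm over `hidden` would follow when post_ffn_layernorm is set.
}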