Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Limit logging of soft errors #245

Draft
wants to merge 8 commits into
base: split-arch
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions stratum/hal/lib/tdi/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,16 @@ stratum_cc_library(
] + target_sdk_headers,
)

stratum_cc_library(
name = "tdi_soft_error",
hdrs = ["tdi_soft_error.h"],
deps = [
":tdi_status",
"//stratum/glue/status",
"//stratum/public/lib:error",
],
)

stratum_cc_library(
name = "tdi_sde_wrapper",
srcs = [
Expand Down Expand Up @@ -292,6 +302,7 @@ stratum_cc_library(
deps = [
":tdi_cc_proto",
":tdi_sde_interface",
":tdi_soft_error",
":utils",
"//stratum/glue:integral_types",
"//stratum/glue:logging",
Expand Down
3 changes: 2 additions & 1 deletion stratum/hal/lib/tdi/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
add_library(stratum_tdi_common_o OBJECT
tdi_action_profile_manager.cc
tdi_action_profile_manager.h
tdi_status.h
tdi_counter_manager.cc
tdi_counter_manager.h
tdi_global_vars.cc
Expand Down Expand Up @@ -46,6 +45,8 @@ add_library(stratum_tdi_common_o OBJECT
tdi_sde_utils.h
tdi_sde_wrapper.cc
tdi_sde_wrapper.h
tdi_soft_error.h
tdi_status.h
tdi_table_manager.cc
tdi_table_manager.h
utils.cc
Expand Down
2 changes: 2 additions & 0 deletions stratum/hal/lib/tdi/es2k/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ stratum_cc_library(
hdrs = ["es2k_node.h"],
deps = [
"//stratum/hal/lib/tdi:tdi_node",
"//stratum/hal/lib/tdi:tdi_soft_error",
"@com_github_gflags_gflags//:gflags",
],
)
Expand Down Expand Up @@ -313,6 +314,7 @@ stratum_cc_test(
srcs = ["es2k_tdi_status_test.cc"],
deps = [
":test_main",
"//stratum/hal/lib/tdi:tdi_soft_error",
"//stratum/hal/lib/tdi:tdi_status",
"@com_google_googletest//:gtest",
"@local_es2k_bin//:es2k_hdrs",
Expand Down
21 changes: 9 additions & 12 deletions stratum/hal/lib/tdi/es2k/es2k_node.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include "stratum/hal/lib/tdi/tdi_node.h"
#include "stratum/hal/lib/tdi/tdi_pipeline_utils.h"
#include "stratum/hal/lib/tdi/tdi_sde_interface.h"
#include "stratum/hal/lib/tdi/tdi_soft_error.h"
#include "stratum/lib/macros.h"
#include "stratum/lib/utils.h"
#include "stratum/public/proto/error.pb.h"
Expand Down Expand Up @@ -168,25 +169,21 @@ ::util::Status Es2kNode::WriteForwardingEntries(
RETURN_IF_ERROR(session->EndBatch());

if (!success) {
// Tally up the number of ALREADY_EXISTS errors.
int already_exists_errors = 0;
// Tally up the number of soft errors.
int soft_errors = 0;
for (const ::util::Status& status : *results) {
if (status.error_code() == ::util::error::Code::ALREADY_EXISTS) {
++already_exists_errors;
if (IsSoftError(status.error_code())) {
++soft_errors;
} else if (!status.ok()) {
already_exists_errors = 0;
soft_errors = 0;
break;
}
}
if (already_exists_errors) {
// If all the errors are ALREADY_EXISTS, downgrade severity to INFO
// and set the description to something less likely to alarm the
// customer.
const char* entries = (already_exists_errors == 1) ? "entry" : "entries";
if (soft_errors) {
return MAKE_ERROR(ERR_AT_LEAST_ONE_OPER_FAILED)
.severity(INFO)
.severity(WARNING)
.without_logging()
<< "Duplicate table " << entries << " (may not be an error)";
<< "One or more write operations had soft errors.";

} else {
return MAKE_ERROR(ERR_AT_LEAST_ONE_OPER_FAILED).without_logging()
Expand Down
31 changes: 31 additions & 0 deletions stratum/hal/lib/tdi/es2k/es2k_tdi_status_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

#include "gtest/gtest.h"
#include "ipu_types/ipu_types.h"
#include "stratum/hal/lib/tdi/tdi_soft_error.h"
#include "stratum/hal/lib/tdi/tdi_status.h"

namespace stratum {
Expand Down Expand Up @@ -65,6 +66,36 @@ TEST_F(Es2kTdiStatusTest, failure_status_is_boolean_false) {
ASSERT_FALSE(not_okay);
}

// Verify that TDI_ALREADY_EXISTS is a soft error.
TEST_F(Es2kTdiStatusTest, tdi_already_exists_is_soft_error) {
ASSERT_TRUE(IsSoftTdiError(TDI_ALREADY_EXISTS));
}

// Verify that TDI_NO_SPACE is not a soft error.
TEST_F(Es2kTdiStatusTest, tdi_no_space_is_not_soft_error) {
ASSERT_FALSE(IsSoftTdiError(TDI_NO_SPACE));
}

// Verify that ERR_ENTRY_NOT_FOUND is a soft error.
TEST_F(Es2kTdiStatusTest, err_entry_not_found_is_soft_error) {
ASSERT_TRUE(IsSoftError(ERR_ENTRY_NOT_FOUND));
}

// Verify that ERR_UNIMPLEMENTED is not a soft error.
TEST_F(Es2kTdiStatusTest, err_unimplemented_is_not_soft_error) {
ASSERT_FALSE(IsSoftError(ERR_UNIMPLEMENTED));
}

// Verify that ALREADY_EXISTS is a soft error.
TEST_F(Es2kTdiStatusTest, already_exists_is_soft_error) {
ASSERT_TRUE(IsSoftError(::util::error::ALREADY_EXISTS));
}

// Verify that INVALID_ARGUMENT is not a soft error.
TEST_F(Es2kTdiStatusTest, invalid_argument_is_not_soft_error) {
ASSERT_FALSE(IsSoftError(::util::error::INVALID_ARGUMENT));
}

} // namespace tdi
} // namespace hal
} // namespace stratum
12 changes: 5 additions & 7 deletions stratum/hal/lib/tdi/tdi_sde_table_entry.cc
Original file line number Diff line number Diff line change
Expand Up @@ -67,10 +67,8 @@ ::util::Status TdiSdeWrapper::InsertTableEntry(
flags, *real_table_key->table_key_,
*real_table_data->table_data_);
if (status == TDI_ALREADY_EXISTS) {
// This is a common condition and not necessarily serious.
// Log without detail and propagate the status.
return MAKE_ERROR(::util::error::Code::ALREADY_EXISTS).severity(INFO)
<< "Duplicate table entry (may not be an error)";
return MAKE_ERROR(::util::error::Code::ALREADY_EXISTS).without_logging()
<< "Duplicate table entry";
} else if (status == TDI_NO_SPACE) {
return MAKE_ERROR(::util::error::Code::RESOURCE_EXHAUSTED)
<< "Table is already full. No space for " << dump_args();
Expand Down Expand Up @@ -150,8 +148,8 @@ ::util::Status TdiSdeWrapper::DeleteTableEntry(
flags, *real_table_key->table_key_);

if (status == TDI_OBJECT_NOT_FOUND) {
return MAKE_ERROR(::util::error::Code::NOT_FOUND)
<< "No matching table entry with " << dump_args();
return MAKE_ERROR(::util::error::Code::NOT_FOUND).without_logging()
<< "No matching table entry";
} else if (status != TDI_SUCCESS) {
return MAKE_ERROR(::util::error::Code::INTERNAL)
<< "Error deleting table entry with " << dump_args();
Expand Down Expand Up @@ -184,7 +182,7 @@ ::util::Status TdiSdeWrapper::GetTableEntry(
real_table_data->table_data_.get());

if (status == TDI_TABLE_NOT_FOUND || status == TDI_OBJECT_NOT_FOUND) {
return MAKE_ERROR(::util::error::Code::NOT_FOUND)
return MAKE_ERROR(::util::error::Code::NOT_FOUND).without_logging()
<< "No matching table entry";
} else if (status != TDI_SUCCESS) {
TdiStatus ret(status);
Expand Down
73 changes: 73 additions & 0 deletions stratum/hal/lib/tdi/tdi_soft_error.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
// Copyright 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#ifndef STRATUM_HAL_LIB_TDI_TDI_SOFT_ERROR_H_
#define STRATUM_HAL_LIB_TDI_TDI_SOFT_ERROR_H_

#include "stratum/glue/status/status.h"
#include "stratum/hal/lib/tdi/tdi_status.h"
#include "stratum/public/lib/error.h"

namespace stratum {
namespace hal {
namespace tdi {

//----------------------------------------------------------------------
// A Soft Error is an easily recoverable condition that the application
// may not consider to be an error at all. Common examples include:
//
// - End of File (read)
// - End of Sequence (get-next)
// - Specified Item Already Exists (add)
// - Specified Item Not Found (get, remove)
//
// We treat soft errors differently by limiting the extent to which
// they are logged as the status code is passed up the stack.
//
//----------------------------------------------------------------------

// Checks whether a TDI status code is a soft error.
//
// TDI_OBJECT_NOT_FOUND and TDI_TABLE_NOT_FOUND are both down-mapped
// to ERR_ENTRY_NOT_FOUND, so we treat them as indistinguishable.
static inline bool IsSoftTdiError(tdi_status_t err) {
return (err == TDI_ALREADY_EXISTS) || (err == TDI_OBJECT_NOT_FOUND) ||
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the definition of a soft error in this implementation? Why is "object/table not found" considered a soft error?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. When I reviewed the updated HSD-ES ticket, I discovered that it contained log messages like this:

    E20231201 00:28:53.970028 5140 tdi_table_manager.cc:624] Return Error: tdi_sde_interface_->GetTableEntry(device_, session, table_id, table_key.get(), table_data.get()) failed with StratumErrorSpace::ERR_ENTRY_NOT_FOUND: 'table->entryGet(*real_session->tdi_session_, *dev_tgt, flags, *real_table_key->table_key_, real_table_data->table_data_.get())' failed with error message: Object not found. 
    E20231201 00:28:53.970144 5140 es2k_node.cc:278] StratumErrorSpace::ERR_AT_LEAST_ONE_OPER_FAILED: One or more read operations failed.
    E20231201 00:28:53.970456 5140 p4_service.cc:353] Failed to read forwarding entries from node 1: One or more read operations failed.
    
  2. The commentary showed that they were also bothered about these issues (i.e., my previous commit had addressed only half their problem).

  3. ALREADY_EXISTS and NOT_FOUND are two sides of the same coin. The conditions are easily recoverable, which is what makes them "soft" errors.

  4. The customer is complaining about the fact that the intermediate code is logging, not just ALREADY_EXISTING, but ALREADY_EXISTING and NOT_FOUND. They are therefore complaining about our logging soft errors.

  5. When TDI status codes are converted to Stratum status codes, TDI_TABLE_NOT_FOUND and TDI_OBJECT_NOT_FOUND are both mapped to ERR_ENTRY_NOT_FOUND. I therefore treat them as identical.

  6. The idea is to reduce the amount of special-purpose code by treating all errors with this classification identically.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Point taken. I've added some additional documentation to the header file.

(err == TDI_TABLE_NOT_FOUND);
}

// Checks whether a Stratum error code is a soft error.
static inline bool IsSoftError(ErrorCode err) {
return (err == ERR_ENTRY_EXISTS) || (err == ERR_ENTRY_NOT_FOUND);
}

// Checks whether a canonical error is a soft error.
static inline bool IsSoftError(::util::error::Code err) {
return (err == ::util::error::ALREADY_EXISTS) ||
(err == ::util::error::NOT_FOUND);
}

// Checks whether the integer error_code() value of a Status object
// is a soft error.
static inline bool IsSoftError(int err) {
return IsSoftError(static_cast<::util::error::Code>(err));
}

// Special version of RETURN_IF_ERROR() that suppresses logging if this
// is a soft error.
#define RETURN_IF_ERROR_WITH_FILTERING(expr) \
do { \
const ::util::Status _status = (expr); \
if (ABSL_PREDICT_FALSE(!_status.ok())) { \
if (IsSoftError(_status.error_code())) { \
return _status; \
} \
LOG(ERROR) << "Return Error: " << #expr << " failed with " << _status; \
return _status; \
} \
} while (0)

} // namespace tdi
} // namespace hal
} // namespace stratum

#endif // STRATUM_HAL_LIB_TDI_TDI_SOFT_ERROR_H_
39 changes: 12 additions & 27 deletions stratum/hal/lib/tdi/tdi_table_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,25 +19,10 @@
#include "stratum/hal/lib/p4/utils.h"
#include "stratum/hal/lib/tdi/tdi_constants.h"
#include "stratum/hal/lib/tdi/tdi_pkt_mod_meter_config.h"
#include "stratum/hal/lib/tdi/tdi_soft_error.h"
#include "stratum/hal/lib/tdi/utils.h"
#include "stratum/lib/utils.h"

// Special version of RETURN_IF_ERROR() that logs an abbreviated message
// if the status code is ALREADY_EXISTS.
#define MATCH_FIELD_RETURN_IF_ERROR(expr) \
do { \
/* Using _status below to avoid capture problems if expr is "status". */ \
const ::util::Status _status = (expr); \
if (ABSL_PREDICT_FALSE(!_status.ok())) { \
if (_status.error_code() == ::util::error::Code::ALREADY_EXISTS) { \
LOG(INFO) << "Duplicate table entry (may not be an error)"; \
return _status; \
} \
LOG(ERROR) << "Return Error: " << #expr << " failed with " << _status; \
return _status; \
} \
} while (0)

DEFINE_uint32(
tdi_table_sync_timeout_ms,
stratum::hal::tdi::kDefaultSyncTimeout / absl::Milliseconds(1),
Expand Down Expand Up @@ -406,7 +391,7 @@ ::util::StatusOr<::p4::v1::TableEntry> TdiTableManager::BuildP4TableEntry(
match.set_field_id(expected_match_field.id());
switch (expected_match_field.match_type()) {
case ::p4::config::v1::MatchField::EXACT: {
MATCH_FIELD_RETURN_IF_ERROR(table_key->GetExact(
RETURN_IF_ERROR_WITH_FILTERING(table_key->GetExact(
expected_match_field.id(), match.mutable_exact()->mutable_value()));
if (!IsDontCareMatch(match.exact())) {
*result.add_match() = match;
Expand All @@ -416,7 +401,7 @@ ::util::StatusOr<::p4::v1::TableEntry> TdiTableManager::BuildP4TableEntry(
case ::p4::config::v1::MatchField::TERNARY: {
has_priority_field = true;
std::string value, mask;
MATCH_FIELD_RETURN_IF_ERROR(
RETURN_IF_ERROR_WITH_FILTERING(
table_key->GetTernary(expected_match_field.id(), &value, &mask));
match.mutable_ternary()->set_value(value);
match.mutable_ternary()->set_mask(mask);
Expand All @@ -428,8 +413,8 @@ ::util::StatusOr<::p4::v1::TableEntry> TdiTableManager::BuildP4TableEntry(
case ::p4::config::v1::MatchField::LPM: {
std::string prefix;
uint16 prefix_length = 0;
MATCH_FIELD_RETURN_IF_ERROR(table_key->GetLpm(expected_match_field.id(),
&prefix, &prefix_length));
RETURN_IF_ERROR_WITH_FILTERING(table_key->GetLpm(
expected_match_field.id(), &prefix, &prefix_length));
match.mutable_lpm()->set_value(prefix);
match.mutable_lpm()->set_prefix_len(prefix_length);
if (!IsDontCareMatch(match.lpm())) {
Expand All @@ -440,7 +425,7 @@ ::util::StatusOr<::p4::v1::TableEntry> TdiTableManager::BuildP4TableEntry(
case ::p4::config::v1::MatchField::RANGE: {
has_priority_field = true;
std::string low, high;
MATCH_FIELD_RETURN_IF_ERROR(
RETURN_IF_ERROR_WITH_FILTERING(
table_key->GetRange(expected_match_field.id(), &low, &high));
match.mutable_range()->set_low(low);
match.mutable_range()->set_high(high);
Expand Down Expand Up @@ -551,7 +536,7 @@ ::util::Status TdiTableManager::ReadSingleTableEntry(
tdi_sde_interface_->CreateTableData(
table_id, table_entry.action().action().action_id()));
RETURN_IF_ERROR(BuildTableKey(table_entry, table_key.get()));
RETURN_IF_ERROR(tdi_sde_interface_->GetTableEntry(
RETURN_IF_ERROR_WITH_FILTERING(tdi_sde_interface_->GetTableEntry(
device_, session, table_id, table_key.get(), table_data.get()));
ASSIGN_OR_RETURN(
::p4::v1::TableEntry result,
Expand Down Expand Up @@ -619,7 +604,7 @@ ::util::Status TdiTableManager::ReadAllTableEntries(
tdi_sde_interface_->GetTdiRtId(table_entry.table_id()));
std::vector<std::unique_ptr<TdiSdeInterface::TableKeyInterface>> keys;
std::vector<std::unique_ptr<TdiSdeInterface::TableDataInterface>> datas;
RETURN_IF_ERROR(tdi_sde_interface_->GetAllTableEntries(
RETURN_IF_ERROR_WITH_FILTERING(tdi_sde_interface_->GetAllTableEntries(
device_, session, table_id, &keys, &datas));
::p4::v1::ReadResponse resp;
for (size_t i = 0; i < keys.size(); ++i) {
Expand Down Expand Up @@ -738,7 +723,7 @@ ::util::Status TdiTableManager::WriteDirectCounterEntry(
// request does not provide the action ID and data, but we have to provide the
// current values in the later modify call to the SDE, else we would modify
// the table entry.
RETURN_IF_ERROR(tdi_sde_interface_->GetTableEntry(
RETURN_IF_ERROR_WITH_FILTERING(tdi_sde_interface_->GetTableEntry(
device_, session, table_id, table_key.get(), table_data.get()));

// P4RT spec requires that the referenced table entry must exist. Therefore we
Expand Down Expand Up @@ -787,7 +772,7 @@ ::util::Status TdiTableManager::WriteDirectMeterEntry(
// request does not provide the action ID and data, but we have to provide the
// current values in the later modify call to the SDE, else we would modify
// the table entry.
RETURN_IF_ERROR(tdi_sde_interface_->GetTableEntry(
RETURN_IF_ERROR_WITH_FILTERING(tdi_sde_interface_->GetTableEntry(
device_, session, table_id, table_key.get(), table_data.get()));

// P4RT spec requires that the referenced table entry must exist. Therefore we
Expand Down Expand Up @@ -862,7 +847,7 @@ TdiTableManager::ReadDirectCounterEntry(
device_, session, table_id,
absl::Milliseconds(FLAGS_tdi_table_sync_timeout_ms)));

RETURN_IF_ERROR(tdi_sde_interface_->GetTableEntry(
RETURN_IF_ERROR_WITH_FILTERING(tdi_sde_interface_->GetTableEntry(
device_, session, table_id, table_key.get(), table_data.get()));

// TODO(max): build response entry from returned data
Expand Down Expand Up @@ -900,7 +885,7 @@ TdiTableManager::ReadDirectMeterEntry(
RETURN_IF_ERROR(BuildTableKey(table_entry, table_key.get()));
}

RETURN_IF_ERROR(tdi_sde_interface_->GetTableEntry(
RETURN_IF_ERROR_WITH_FILTERING(tdi_sde_interface_->GetTableEntry(
device_, session, table_id, table_key.get(), table_data.get()));

ASSIGN_OR_RETURN(auto table, p4_info_manager_->FindTableByID(table_id));
Expand Down