[5.0.3] P2P: Resolve on reconnect #2408

Merged 3 commits on Sep 26, 2024
75 changes: 44 additions & 31 deletions plugins/net_plugin/net_plugin.cpp
@@ -348,7 +348,6 @@ namespace eosio {
std::string host;
connection_ptr c;
tcp::endpoint active_ip;
tcp::resolver::results_type ips;
};

using connection_details_index = multi_index_container<
@@ -416,9 +415,9 @@ namespace eosio {

void add(connection_ptr c);
string connect(const string& host, const string& p2p_address);
string resolve_and_connect(const string& host, const string& p2p_address);
string resolve_and_connect(const string& host, const string& p2p_address, const connection_ptr& c = {});
void update_connection_endpoint(connection_ptr c, const tcp::endpoint& endpoint);
void connect(const connection_ptr& c);
void reconnect(const connection_ptr& c);
string disconnect(const string& host);
void close_all();

@@ -912,7 +911,7 @@ namespace eosio {

fc::sha256 conn_node_id;
string short_conn_node_id;
string listen_address; // address sent to peer in handshake
const string listen_address; // address sent to peer in handshake
string log_p2p_address;
string log_remote_endpoint_ip;
string log_remote_endpoint_port;
@@ -2764,16 +2763,21 @@ namespace eosio {
fc_dlog( logger, "Skipping connect due to go_away reason ${r}",("r", reason_str( no_retry )));
return false;
}

if (incoming())
return false;

if( consecutive_immediate_connection_close > def_max_consecutive_immediate_connection_close || no_retry == benign_other ) {
fc::microseconds connector_period = my_impl->connections.get_connector_period();
fc::lock_guard g( conn_mtx );
if( last_close == fc::time_point() || last_close > fc::time_point::now() - connector_period ) {
return true; // true so doesn't remove from valid connections
}
}

connection_ptr c = shared_from_this();
strand.post([c]() {
my_impl->connections.connect(c);
my_impl->connections.reconnect(c);
});
return true;
}
@@ -4486,39 +4490,48 @@ namespace eosio {
return resolve_and_connect( host, p2p_address );
}

string connections_manager::resolve_and_connect( const string& peer_address, const string& listen_address ) {
string connections_manager::resolve_and_connect( const string& peer_address, const string& listen_address,
const connection_ptr& c )
{
assert(!c || (c->peer_address() == peer_address && c->listen_address == listen_address));

string::size_type colon = peer_address.find(':');
if (colon == std::string::npos || colon == 0) {
fc_elog( logger, "Invalid peer address. must be \"host:port[:<blk>|<trx>]\": ${p}", ("p", peer_address) );
return "invalid peer address";
}

std::lock_guard g( connections_mtx );
if( find_connection_i( peer_address ) )
return "already connected";
{
std::lock_guard g(connections_mtx);
if (find_connection_i(peer_address))
return "already connected";
}

auto [host, port, type] = split_host_port_type(peer_address);

auto resolver = std::make_shared<tcp::resolver>( my_impl->thread_pool.get_executor() );

resolver->async_resolve(host, port,
[resolver, host = host, port = port, peer_address = peer_address, listen_address = listen_address, this]( const boost::system::error_code& err, const tcp::resolver::results_type& results ) {
connection_ptr c = std::make_shared<connection>( peer_address, listen_address );
c->set_heartbeat_timeout( heartbeat_timeout );
std::lock_guard g( connections_mtx );
auto [it, inserted] = connections.emplace( connection_detail{
.host = peer_address,
.c = std::move(c),
.ips = results
[this, resolver, c_org{c}, host, port, peer_address, listen_address]( const boost::system::error_code& err, const tcp::resolver::results_type& results ) {
connection_ptr c = c_org ? c_org : std::make_shared<connection>( peer_address, listen_address );
c->strand.post([this, resolver, c, err, results, host, port, peer_address]() {
c->set_heartbeat_timeout( heartbeat_timeout );
{
std::lock_guard g( connections_mtx );
connections.emplace( connection_detail{
.host = peer_address,
.c = c,
});
}
if( !err ) {
c->connect( results );
} else {
fc_wlog( logger, "Unable to resolve ${host}:${port} ${error}",
Member: fwiw this is a warning log, whereas failing to connect is an info log. Not sure if you want to make these consistent or not.

Member Author: If the async_connect fails it calls c->close(false) which calls _close() which increments consecutive_immediate_connection_close.
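(For readers following along: a minimal, self-contained sketch of the failure path described in that comment, using std placeholders for fc types. The close()/_close() bodies are not part of this diff, so the member names and update points below are assumptions about the surrounding net_plugin code, not the actual implementation.)

#include <atomic>
#include <chrono>
#include <cstdint>
#include <mutex>

// Stand-in for the pieces of connection that this thread talks about.
// Assumption: _close() bumps the counter and records the close time.
struct connection_sketch {
    std::atomic<uint32_t> consecutive_immediate_connection_close{0};
    std::mutex conn_mtx;
    std::chrono::steady_clock::time_point last_close{};   // default == "never closed"

    void close( bool reconnect = true, bool shutdown = false ) {
        _close( reconnect, shutdown );                     // real code posts this onto the strand
    }

    void _close( bool /*reconnect*/, bool /*shutdown*/ ) {
        ++consecutive_immediate_connection_close;          // feeds the retry check in the @@ -2764 hunk
        std::lock_guard<std::mutex> g( conn_mtx );
        last_close = std::chrono::steady_clock::now();
    }
};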

Member Author: I think a warn is appropriate if unable to resolve.

Member:

> If the async_connect fails it calls c->close(false) which calls _close() which increments consecutive_immediate_connection_close.

There is something different between the two, though. For example, when I run nodeos with --p2p-peer-address fdsfdsfds.invalid:4444 --p2p-peer-address fddsfdsfdsfds.invalid:4444 --p2p-peer-address 127.0.0.1:2121 --p2p-peer-address www.google.com:4444, you'll see how the latter two will try to reconnect forever where the first two won't.

There is also some log oddness at shutdown:

info  2024-09-26T02:45:43.993 nodeos    net_plugin.cpp:4578           close_all            ] close all 4 connections
info  2024-09-26T02:45:43.993 net-1     net_plugin.cpp:1460           _close               ] ["127.0.0.1:2121" - 1 <unknown>:<unknown>] closing
info  2024-09-26T02:45:43.993 net-2     net_plugin.cpp:1460           _close               ] ["fddsfdsfdsfds.invalid:4444" - 2 <unknown>:<unknown>] closing
info  2024-09-26T02:45:43.993 nodeos    net_plugin.cpp:4385           plugin_shutdown      ] exit shutdown

Member Author: This difference in backoff behavior seems to match Leap 4.0 behavior.

("host", host)("port", port)( "error", err.message() ) );
c->set_state(connection::connection_state::closed);
++(c->consecutive_immediate_connection_close);
Member: What does increasing this do? I thought it was going to cause some sort of cooldown when reaching def_max_consecutive_immediate_connection_close, but it doesn't seem to matter.

Member Author: Need to look into it, but I think the change that broke this reconnect also broke the backoff on consecutive_immediate_connection_close.

Member Author: Hmm, actually this is a reason to reuse the connection object. I guess I should maintain the reuse of the connection object for this.

Member: It seems like now (on 51fd8fa), after failing to resolve a host def_max_consecutive_immediate_connection_close times, it just never tries again. But if the resolve completes and it is simply unable to connect to the remote host, it will retry indefinitely. I can't tell if this discrepancy is intentional or not. It makes me wonder if we should not be increasing this counter here, for consistency between the two, but realistically if someone has a completely bad hostname it's probably not fixing itself.
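(One way to read that discrepancy, based only on the retry check in the @@ -2764 hunk above: a failed resolve bumps the counter and sets the state to closed without going through close(), so last_close presumably never moves off its default; once the counter exceeds the maximum, the early return fires every time and the peer is never re-dialed. A failed TCP connect goes through close(), which refreshes last_close, so the early return stops firing once connector_period has elapsed. A condensed sketch of that check, with std::chrono standing in for fc::time_point:)

#include <chrono>
#include <cstdint>

// Condensed restatement of the check shown in the @@ -2764 hunk; not the literal function.
bool skip_reconnect_this_round( uint32_t consecutive_immediate_connection_close,
                                uint32_t max_consecutive_immediate_close,
                                std::chrono::steady_clock::time_point last_close,
                                std::chrono::microseconds connector_period ) {
    if( consecutive_immediate_connection_close > max_consecutive_immediate_close ) {
        // last_close at its default means close() never ran (e.g. only resolves failed),
        // so this branch keeps returning true and connections.reconnect(c) is never posted.
        if( last_close == std::chrono::steady_clock::time_point() ||
            last_close > std::chrono::steady_clock::now() - connector_period ) {
            return true;   // keep the connection in the table, but do not re-dial yet
        }
    }
    return false;          // caller falls through to posting connections.reconnect(c)
}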

}
});
if( !err ) {
it->c->connect( results );
} else {
fc_wlog( logger, "Unable to resolve ${host}:${port} ${error}",
("host", host)("port", port)( "error", err.message() ) );
it->c->set_state(connection::connection_state::closed);
++(it->c->consecutive_immediate_connection_close);
}
} );

return "added connection";
@@ -4536,13 +4549,13 @@
}
}

void connections_manager::connect(const connection_ptr& c) {
std::lock_guard g( connections_mtx );
const auto& index = connections.get<by_connection>();
const auto& it = index.find(c);
if( it != index.end() ) {
it->c->connect( it->ips );
void connections_manager::reconnect(const connection_ptr& c) {
{
std::lock_guard g( connections_mtx );
auto& index = connections.get<by_connection>();
index.erase(c);
}
resolve_and_connect(c->peer_address(), c->listen_address, c);
}
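(To make the flattened diff easier to follow, here is a rough before/after condensation of the two functions involved, abridged from the hunks above rather than copied verbatim: the removed connect() re-dialed the resolver results cached in the now-deleted ips member, while the new reconnect() drops the entry and goes back through resolve_and_connect() with the same connection object, so the hostname is re-resolved on every retry.)

// Before this PR (condensed from the removed lines): reconnects re-dialed the
// resolver results cached in connection_detail::ips when the peer was first added.
void connections_manager::connect( const connection_ptr& c ) {
    std::lock_guard g( connections_mtx );
    const auto& index = connections.get<by_connection>();
    auto it = index.find( c );
    if( it != index.end() )
        it->c->connect( it->ips );        // stale if the host's DNS entries changed
}

// After this PR (condensed from the added lines): drop the entry and go back through
// resolve_and_connect(), passing the existing connection so its retry counters and
// identity survive the round trip.
void connections_manager::reconnect( const connection_ptr& c ) {
    {
        std::lock_guard g( connections_mtx );
        connections.get<by_connection>().erase( c );
    }
    resolve_and_connect( c->peer_address(), c->listen_address, c );
}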

// called by API