Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

NBSOPSNEBIUS-25: report ExternalEndpointUnexpectedExit on vhost-server crashes #288

Merged
merged 2 commits into from
Jan 31, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions cloud/blockstore/config/server.proto
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,10 @@ message TServerConfig

// Path to vhost server executable.
optional string VhostServerPath = 108;

// Additional vhost server command line arguments
// e.g. ["--pid-file", "/tmp/test-vhost-server-pid", "--verbose", "debug"]
repeated string VhostServerExtArgs = 109;
budevg marked this conversation as resolved.
Show resolved Hide resolved
}

////////////////////////////////////////////////////////////////////////////////
Expand Down
1 change: 1 addition & 0 deletions cloud/blockstore/libs/daemon/common/bootstrap.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -407,6 +407,7 @@ void TBootstrapBase::Init()
ServerStats,
Executor,
Configs->ServerConfig->GetVhostServerPath(),
Configs->ServerConfig->GetVhostServerExtArgs(),
Configs->Options->SkipDeviceLocalityValidation
? TString {}
: FQDNHostName(),
Expand Down
1 change: 1 addition & 0 deletions cloud/blockstore/libs/diagnostics/critical_events.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ namespace NCloud::NBlockStore {
xxx(DiskRegistryDeviceNotFoundSoft) \
xxx(DiskRegistrySourceDiskNotFound) \
xxx(EndpointSwitchFailure) \
xxx(ExternalEndpointUnexpectedExit) \
xxx(DiskAgentSessionCacheUpdateError) \
xxx(DiskAgentSessionCacheRestoreError) \
// BLOCKSTORE_CRITICAL_EVENTS
Expand Down
17 changes: 15 additions & 2 deletions cloud/blockstore/libs/endpoints_vhost/external_vhost_server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#include "external_endpoint_stats.h"

#include <cloud/blockstore/libs/common/device_path.h>
#include <cloud/blockstore/libs/diagnostics/critical_events.h>
#include <cloud/blockstore/libs/diagnostics/server_stats.h>
#include <cloud/blockstore/libs/endpoints/endpoint_listener.h>

Expand Down Expand Up @@ -485,7 +486,10 @@ class TEndpoint final
break;
}

// TODO: limiter
ReportExternalEndpointUnexpectedExit(TStringBuilder()
sharpeye marked this conversation as resolved.
Show resolved Hide resolved
<< "External endpoint for a disk " << Stats.DiskId.Quote()
<< " and a client " << Stats.ClientId.Quote()
<< " unexpectedly stopped: " << FormatError(error));

auto process = RestartProcess();
if (!process) {
Expand Down Expand Up @@ -943,15 +947,24 @@ IEndpointListenerPtr CreateExternalVhostEndpointListener(
IServerStatsPtr serverStats,
TExecutorPtr executor,
TString binaryPath,
TVector<TString> extArgs,
sharpeye marked this conversation as resolved.
Show resolved Hide resolved
TString localAgentId,
IEndpointListenerPtr fallbackListener)
{
auto defaultFactory = [=] (
auto defaultFactory = [
logging,
serverStats,
executor,
binaryPath = std::move(binaryPath),
extArgs = std::move(extArgs)
] (
const TString& clientId,
const TString& diskId,
TVector<TString> args,
TVector<TString> cgroups)
{
args.insert(args.begin(), extArgs.begin(), extArgs.end());

return std::make_shared<TEndpoint>(
clientId,
logging,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ IEndpointListenerPtr CreateExternalVhostEndpointListener(
IServerStatsPtr serverStats,
TExecutorPtr executor,
TString binaryPath,
TVector<TString> extArgs,
TString localAgentId,
IEndpointListenerPtr fallbackListener);

Expand Down
9 changes: 3 additions & 6 deletions cloud/blockstore/libs/server/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ constexpr TDuration Seconds(int s)
NCloud::NProto::ENDPOINT_STORAGE_KEYRING )\
xxx(EndpointStorageDir, TString, {} )\
xxx(VhostServerPath, TString, {} )\
xxx(VhostServerExtArgs, TVector<TString>, {} )\
// BLOCKSTORE_SERVER_CONFIG

#define BLOCKSTORE_SERVER_DECLARE_CONFIG(name, type, value) \
Expand Down Expand Up @@ -116,19 +117,15 @@ template <>
// Converts a repeated protobuf string field into a TVector<TString> by
// copying every element of `value`, in order, into the result.
TVector<TString> ConvertValue(
    const google::protobuf::RepeatedPtrField<TString>& value)
{
    // Iterator-range construction replaces the manual push_back loop and
    // leaves a single return statement.
    return { value.begin(), value.end() };
}

template <>
TVector<TCertificate> ConvertValue(
const google::protobuf::RepeatedPtrField<NCloud::NProto::TCertificate>& value)
{
TVector<TCertificate> v;
for (const auto& x : value) {
for (const auto& x: value) {
v.push_back({x.GetCertFile(), x.GetCertPrivateKeyFile()});
}
return v;
Expand Down
1 change: 1 addition & 0 deletions cloud/blockstore/libs/server/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ class TServerAppConfig
NCloud::NProto::EEndpointStorageType GetEndpointStorageType() const;
TString GetEndpointStorageDir() const;
TString GetVhostServerPath() const;
TVector<TString> GetVhostServerExtArgs() const;

void Dump(IOutputStream& out) const override;
void DumpHtml(IOutputStream& out) const override;
Expand Down
2 changes: 2 additions & 0 deletions cloud/blockstore/public/sdk/python/client/base_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@
"execute_action",
"kick_endpoint",
"cms_action",
"update_disk_registry_config",
"describe_disk_registry_config",
]


Expand Down
68 changes: 68 additions & 0 deletions cloud/blockstore/public/sdk/python/client/safe_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

import cloud.blockstore.public.sdk.python.protos as protos

from google.protobuf.json_format import ParseDict

from .error import _handle_errors


Expand Down Expand Up @@ -901,3 +903,69 @@ def cms_action(
trace_id,
request_timeout)
return response

@_handle_errors
def update_disk_registry_config_async(
        self,
        config,
        idempotence_id=None,
        timestamp=None,
        trace_id=None,
        request_timeout=None):
    """Forward a disk registry config update to the wrapped client's
    `update_disk_registry_config_async`.

    `config` is converted with `ParseDict` into a
    `TUpdateDiskRegistryConfigRequest`. All other arguments are passed
    through unchanged, and the wrapped client's result is returned as is.
    """
    # ParseDict fills the message it is given and returns that same object.
    update_request = protos.TUpdateDiskRegistryConfigRequest()
    ParseDict(config, update_request)

    return self.__impl.update_disk_registry_config_async(
        update_request,
        idempotence_id,
        timestamp,
        trace_id,
        request_timeout)

@_handle_errors
def update_disk_registry_config(
        self,
        config,
        idempotence_id=None,
        timestamp=None,
        trace_id=None,
        request_timeout=None):
    """Forward a disk registry config update to the wrapped client's
    `update_disk_registry_config`.

    `config` is converted with `ParseDict` into a
    `TUpdateDiskRegistryConfigRequest`. All other arguments are passed
    through unchanged, and the wrapped client's result is returned as is.
    """
    # ParseDict fills the message it is given and returns that same object.
    update_request = protos.TUpdateDiskRegistryConfigRequest()
    ParseDict(config, update_request)

    return self.__impl.update_disk_registry_config(
        update_request,
        idempotence_id,
        timestamp,
        trace_id,
        request_timeout)

@_handle_errors
def describe_disk_registry_config_async(
        self,
        idempotence_id=None,
        timestamp=None,
        trace_id=None,
        request_timeout=None):
    """Forward a describe-disk-registry-config call to the wrapped client's
    `describe_disk_registry_config_async`.

    A default-constructed `TDescribeDiskRegistryConfigRequest` is sent. All
    arguments are passed through unchanged, and the wrapped client's result
    is returned as is.
    """
    # Bind the target first so lookup still happens before the request is
    # constructed, exactly as in a single call expression.
    describe = self.__impl.describe_disk_registry_config_async

    return describe(
        protos.TDescribeDiskRegistryConfigRequest(),
        idempotence_id,
        timestamp,
        trace_id,
        request_timeout)

@_handle_errors
def describe_disk_registry_config(
        self,
        idempotence_id=None,
        timestamp=None,
        trace_id=None,
        request_timeout=None):
    """Forward a describe-disk-registry-config call to the wrapped client's
    `describe_disk_registry_config`.

    A default-constructed `TDescribeDiskRegistryConfigRequest` is sent. All
    arguments are passed through unchanged, and the wrapped client's result
    is returned as is.
    """
    # Bind the target first so lookup still happens before the request is
    # constructed, exactly as in a single call expression.
    describe = self.__impl.describe_disk_registry_config

    return describe(
        protos.TDescribeDiskRegistryConfigRequest(),
        idempotence_id,
        timestamp,
        trace_id,
        request_timeout)
Loading
Loading