Skip to content

Commit

Permalink
NBSOPSNEBIUS-25: report ExternalEndpointUnexpectedExit on vhost-server crashes
Browse files Browse the repository at this point in the history
  • Loading branch information
sharpeye committed Jan 30, 2024
1 parent 5097d0f commit 7057fee
Show file tree
Hide file tree
Showing 14 changed files with 361 additions and 18 deletions.
4 changes: 4 additions & 0 deletions cloud/blockstore/config/server.proto
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,10 @@ message TServerConfig

// Path to vhost server executable.
optional string VhostServerPath = 108;

// Additional command-line arguments for the vhost server,
// e.g. ["--pid-file", "/tmp/test-vhost-server-pid", "--verbose", "debug"].
repeated string VhostServerExtArgs = 109;
}

////////////////////////////////////////////////////////////////////////////////
Expand Down
1 change: 1 addition & 0 deletions cloud/blockstore/libs/daemon/common/bootstrap.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -407,6 +407,7 @@ void TBootstrapBase::Init()
ServerStats,
Executor,
Configs->ServerConfig->GetVhostServerPath(),
Configs->ServerConfig->GetVhostServerExtArgs(),
Configs->Options->SkipDeviceLocalityValidation
? TString {}
: FQDNHostName(),
Expand Down
1 change: 1 addition & 0 deletions cloud/blockstore/libs/diagnostics/critical_events.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ namespace NCloud::NBlockStore {
xxx(DiskRegistryDeviceNotFoundSoft) \
xxx(DiskRegistrySourceDiskNotFound) \
xxx(EndpointSwitchFailure) \
xxx(ExternalEndpointUnexpectedExit) \
xxx(DiskAgentSessionCacheUpdateError) \
xxx(DiskAgentSessionCacheRestoreError) \
// BLOCKSTORE_CRITICAL_EVENTS
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#include "external_endpoint_stats.h"

#include <cloud/blockstore/libs/common/device_path.h>
#include <cloud/blockstore/libs/diagnostics/critical_events.h>
#include <cloud/blockstore/libs/diagnostics/server_stats.h>
#include <cloud/blockstore/libs/endpoints/endpoint_listener.h>

Expand Down Expand Up @@ -485,7 +486,9 @@ class TEndpoint final
break;
}

// TODO: limiter
ReportExternalEndpointUnexpectedExit(TStringBuilder()
<< "External endpoint " << Stats.DiskId << " " << Stats.ClientId
<< " unexpectedly stopped: " << FormatError(error));

auto process = RestartProcess();
if (!process) {
Expand Down Expand Up @@ -943,6 +946,7 @@ IEndpointListenerPtr CreateExternalVhostEndpointListener(
IServerStatsPtr serverStats,
TExecutorPtr executor,
TString binaryPath,
TVector<TString> extArgs,
TString localAgentId,
IEndpointListenerPtr fallbackListener)
{
Expand All @@ -952,6 +956,8 @@ IEndpointListenerPtr CreateExternalVhostEndpointListener(
TVector<TString> args,
TVector<TString> cgroups)
{
args.insert(args.end(), extArgs.begin(), extArgs.end());

return std::make_shared<TEndpoint>(
clientId,
logging,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ IEndpointListenerPtr CreateExternalVhostEndpointListener(
IServerStatsPtr serverStats,
TExecutorPtr executor,
TString binaryPath,
TVector<TString> extArgs,
TString localAgentId,
IEndpointListenerPtr fallbackListener);

Expand Down
9 changes: 3 additions & 6 deletions cloud/blockstore/libs/server/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ constexpr TDuration Seconds(int s)
NCloud::NProto::ENDPOINT_STORAGE_KEYRING )\
xxx(EndpointStorageDir, TString, {} )\
xxx(VhostServerPath, TString, {} )\
xxx(VhostServerExtArgs, TVector<TString>, {} )\
// BLOCKSTORE_SERVER_CONFIG

#define BLOCKSTORE_SERVER_DECLARE_CONFIG(name, type, value) \
Expand Down Expand Up @@ -116,19 +117,15 @@ template <>
TVector<TString> ConvertValue(
    const google::protobuf::RepeatedPtrField<TString>& value)
{
    // Converts a repeated protobuf string field into a TVector<TString> by
    // copying every element, in order. An empty field yields an empty vector.
    //
    // The vector is built directly from the iterator range instead of an
    // element-by-element push_back loop. Parentheses (not braces) are used so
    // the iterator-range constructor is selected unambiguously.
    return TVector<TString>(value.begin(), value.end());
}

template <>
TVector<TCertificate> ConvertValue(
const google::protobuf::RepeatedPtrField<NCloud::NProto::TCertificate>& value)
{
TVector<TCertificate> v;
for (const auto& x : value) {
for (const auto& x: value) {
v.push_back({x.GetCertFile(), x.GetCertPrivateKeyFile()});
}
return v;
Expand Down
1 change: 1 addition & 0 deletions cloud/blockstore/libs/server/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ class TServerAppConfig
NCloud::NProto::EEndpointStorageType GetEndpointStorageType() const;
TString GetEndpointStorageDir() const;
TString GetVhostServerPath() const;
TVector<TString> GetVhostServerExtArgs() const;

void Dump(IOutputStream& out) const override;
void DumpHtml(IOutputStream& out) const override;
Expand Down
2 changes: 2 additions & 0 deletions cloud/blockstore/public/sdk/python/client/base_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@
"execute_action",
"kick_endpoint",
"cms_action",
"update_disk_registry_config",
"describe_disk_registry_config",
]


Expand Down
68 changes: 68 additions & 0 deletions cloud/blockstore/public/sdk/python/client/safe_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

import cloud.blockstore.public.sdk.python.protos as protos

from google.protobuf.json_format import ParseDict

from .error import _handle_errors


Expand Down Expand Up @@ -901,3 +903,69 @@ def cms_action(
trace_id,
request_timeout)
return response

@_handle_errors
def update_disk_registry_config_async(
        self,
        config,
        idempotence_id=None,
        timestamp=None,
        trace_id=None,
        request_timeout=None):
    """Forward a disk registry config update to the wrapped client's
    ``update_disk_registry_config_async``.

    ``config`` is a dict that is converted into a
    ``TUpdateDiskRegistryConfigRequest`` message with ``ParseDict``. The
    remaining arguments are passed through unchanged, and the wrapped
    call's result is returned as is.
    """
    # ParseDict fills the message it is given and returns that same object,
    # so the populated message can be used directly.
    message = protos.TUpdateDiskRegistryConfigRequest()
    ParseDict(config, message)

    return self.__impl.update_disk_registry_config_async(
        message,
        idempotence_id,
        timestamp,
        trace_id,
        request_timeout)

@_handle_errors
def update_disk_registry_config(
        self,
        config,
        idempotence_id=None,
        timestamp=None,
        trace_id=None,
        request_timeout=None):
    """Forward a disk registry config update to the wrapped client's
    ``update_disk_registry_config``.

    ``config`` is a dict that is converted into a
    ``TUpdateDiskRegistryConfigRequest`` message with ``ParseDict``. The
    remaining arguments are passed through unchanged, and the wrapped
    call's result is returned as is.
    """
    # ParseDict fills the message it is given and returns that same object,
    # so the populated message can be used directly.
    message = protos.TUpdateDiskRegistryConfigRequest()
    ParseDict(config, message)

    return self.__impl.update_disk_registry_config(
        message,
        idempotence_id,
        timestamp,
        trace_id,
        request_timeout)

@_handle_errors
def describe_disk_registry_config_async(
        self,
        idempotence_id=None,
        timestamp=None,
        trace_id=None,
        request_timeout=None):
    """Ask the wrapped client for the disk registry config via its
    ``describe_disk_registry_config_async``.

    The request is a default-constructed
    ``TDescribeDiskRegistryConfigRequest``. All arguments are passed
    through unchanged, and the wrapped call's result is returned as is.
    """
    # The describe request is sent with no fields set.
    empty_request = protos.TDescribeDiskRegistryConfigRequest()

    return self.__impl.describe_disk_registry_config_async(
        empty_request,
        idempotence_id,
        timestamp,
        trace_id,
        request_timeout)

@_handle_errors
def describe_disk_registry_config(
        self,
        idempotence_id=None,
        timestamp=None,
        trace_id=None,
        request_timeout=None):
    """Ask the wrapped client for the disk registry config via its
    ``describe_disk_registry_config``.

    The request is a default-constructed
    ``TDescribeDiskRegistryConfigRequest``. All arguments are passed
    through unchanged, and the wrapped call's result is returned as is.
    """
    # The describe request is sent with no fields set.
    empty_request = protos.TDescribeDiskRegistryConfigRequest()

    return self.__impl.describe_disk_registry_config(
        empty_request,
        idempotence_id,
        timestamp,
        trace_id,
        request_timeout)
Loading

0 comments on commit 7057fee

Please sign in to comment.