From ccc881ecf5040b1be6c602e093c4ebb8cd7f2e50 Mon Sep 17 00:00:00 2001 From: sharpeye Date: Tue, 30 Jan 2024 16:41:45 +0000 Subject: [PATCH] NBSOPSNEBIUS-25: report ExternalEndpointUnexpectedExit on vhost-server crashes --- cloud/blockstore/config/server.proto | 4 + .../libs/daemon/common/bootstrap.cpp | 1 + .../libs/diagnostics/critical_events.h | 1 + .../endpoints_vhost/external_vhost_server.cpp | 8 +- .../endpoints_vhost/external_vhost_server.h | 1 + cloud/blockstore/libs/server/config.cpp | 9 +- cloud/blockstore/libs/server/config.h | 1 + .../public/sdk/python/client/base_client.py | 2 + .../public/sdk/python/client/safe_client.py | 68 ++++++ .../tests/external_endpoint/test.py | 229 ++++++++++++++++++ .../tests/external_endpoint/ya.make | 27 +++ cloud/blockstore/tests/ya.make | 1 + .../testing/fake-vhost-server/__main__.py | 26 +- .../tools/testing/fake-vhost-server/ya.make | 1 - 14 files changed, 361 insertions(+), 18 deletions(-) create mode 100644 cloud/blockstore/tests/external_endpoint/test.py create mode 100644 cloud/blockstore/tests/external_endpoint/ya.make diff --git a/cloud/blockstore/config/server.proto b/cloud/blockstore/config/server.proto index ddf0b0435a8..4309733e51f 100644 --- a/cloud/blockstore/config/server.proto +++ b/cloud/blockstore/config/server.proto @@ -168,6 +168,10 @@ message TServerConfig // Path to vhost server executable. optional string VhostServerPath = 108; + + // Additional vhost server command line arguments + // e.g. ["--pid-file", "/tmp/test-vhost-server-pid", "--verbose", "debug"] + repeated string VhostServerExtArgs = 109; } //////////////////////////////////////////////////////////////////////////////// diff --git a/cloud/blockstore/libs/daemon/common/bootstrap.cpp b/cloud/blockstore/libs/daemon/common/bootstrap.cpp index 8aa63c30d24..c78209ec52d 100644 --- a/cloud/blockstore/libs/daemon/common/bootstrap.cpp +++ b/cloud/blockstore/libs/daemon/common/bootstrap.cpp @@ -407,6 +407,7 @@ void TBootstrapBase::Init() ServerStats, Executor, Configs->ServerConfig->GetVhostServerPath(), + Configs->ServerConfig->GetVhostServerExtArgs(), Configs->Options->SkipDeviceLocalityValidation ? TString {} : FQDNHostName(), diff --git a/cloud/blockstore/libs/diagnostics/critical_events.h b/cloud/blockstore/libs/diagnostics/critical_events.h index 194bda70a35..0ed1658d66d 100644 --- a/cloud/blockstore/libs/diagnostics/critical_events.h +++ b/cloud/blockstore/libs/diagnostics/critical_events.h @@ -51,6 +51,7 @@ namespace NCloud::NBlockStore { xxx(DiskRegistryDeviceNotFoundSoft) \ xxx(DiskRegistrySourceDiskNotFound) \ xxx(EndpointSwitchFailure) \ + xxx(ExternalEndpointUnexpectedExit) \ xxx(DiskAgentSessionCacheUpdateError) \ xxx(DiskAgentSessionCacheRestoreError) \ // BLOCKSTORE_CRITICAL_EVENTS diff --git a/cloud/blockstore/libs/endpoints_vhost/external_vhost_server.cpp b/cloud/blockstore/libs/endpoints_vhost/external_vhost_server.cpp index 29a89779599..63bd67d834e 100644 --- a/cloud/blockstore/libs/endpoints_vhost/external_vhost_server.cpp +++ b/cloud/blockstore/libs/endpoints_vhost/external_vhost_server.cpp @@ -2,6 +2,7 @@ #include "external_endpoint_stats.h" #include +#include #include #include @@ -485,7 +486,9 @@ class TEndpoint final break; } - // TODO: limiter + ReportExternalEndpointUnexpectedExit(TStringBuilder() + << "External endpoint " << Stats.DiskId << " " << Stats.ClientId + << " unexpectedly stopped: " << FormatError(error)); auto process = RestartProcess(); if (!process) { @@ -943,6 +946,7 @@ IEndpointListenerPtr CreateExternalVhostEndpointListener( IServerStatsPtr serverStats, TExecutorPtr executor, TString binaryPath, + TVector extArgs, TString localAgentId, IEndpointListenerPtr fallbackListener) { @@ -952,6 +956,8 @@ IEndpointListenerPtr CreateExternalVhostEndpointListener( TVector args, TVector cgroups) { + args.insert(args.end(), extArgs.begin(), extArgs.end()); + return std::make_shared( clientId, logging, diff --git a/cloud/blockstore/libs/endpoints_vhost/external_vhost_server.h b/cloud/blockstore/libs/endpoints_vhost/external_vhost_server.h index a2cfb184cb9..407c4c9f806 100644 --- a/cloud/blockstore/libs/endpoints_vhost/external_vhost_server.h +++ b/cloud/blockstore/libs/endpoints_vhost/external_vhost_server.h @@ -37,6 +37,7 @@ IEndpointListenerPtr CreateExternalVhostEndpointListener( IServerStatsPtr serverStats, TExecutorPtr executor, TString binaryPath, + TVector extArgs, TString localAgentId, IEndpointListenerPtr fallbackListener); diff --git a/cloud/blockstore/libs/server/config.cpp b/cloud/blockstore/libs/server/config.cpp index 06f2522d8d8..1ebbfe70c24 100644 --- a/cloud/blockstore/libs/server/config.cpp +++ b/cloud/blockstore/libs/server/config.cpp @@ -88,6 +88,7 @@ constexpr TDuration Seconds(int s) NCloud::NProto::ENDPOINT_STORAGE_KEYRING )\ xxx(EndpointStorageDir, TString, {} )\ xxx(VhostServerPath, TString, {} )\ + xxx(VhostServerExtArgs, TVector, {} )\ // BLOCKSTORE_SERVER_CONFIG #define BLOCKSTORE_SERVER_DECLARE_CONFIG(name, type, value) \ @@ -116,11 +117,7 @@ template <> TVector ConvertValue( const google::protobuf::RepeatedPtrField& value) { - TVector v; - for (const auto& x : value) { - v.push_back(x); - } - return v; + return { value.begin(), value.end() }; } template <> @@ -128,7 +125,7 @@ TVector ConvertValue( const google::protobuf::RepeatedPtrField& value) { TVector v; - for (const auto& x : value) { + for (const auto& x: value) { v.push_back({x.GetCertFile(), x.GetCertPrivateKeyFile()}); } return v; diff --git a/cloud/blockstore/libs/server/config.h b/cloud/blockstore/libs/server/config.h index 49575d54189..d32a1bba71a 100644 --- a/cloud/blockstore/libs/server/config.h +++ b/cloud/blockstore/libs/server/config.h @@ -124,6 +124,7 @@ class TServerAppConfig NCloud::NProto::EEndpointStorageType GetEndpointStorageType() const; TString GetEndpointStorageDir() const; TString GetVhostServerPath() const; + TVector GetVhostServerExtArgs() const; void Dump(IOutputStream& out) const override; void DumpHtml(IOutputStream& out) const override; diff --git a/cloud/blockstore/public/sdk/python/client/base_client.py b/cloud/blockstore/public/sdk/python/client/base_client.py index 5f4563d535d..0bf4cd8fa20 100644 --- a/cloud/blockstore/public/sdk/python/client/base_client.py +++ b/cloud/blockstore/public/sdk/python/client/base_client.py @@ -41,6 +41,8 @@ "execute_action", "kick_endpoint", "cms_action", + "update_disk_registry_config", + "describe_disk_registry_config", ] diff --git a/cloud/blockstore/public/sdk/python/client/safe_client.py b/cloud/blockstore/public/sdk/python/client/safe_client.py index fde8fd79122..226b8946cc4 100644 --- a/cloud/blockstore/public/sdk/python/client/safe_client.py +++ b/cloud/blockstore/public/sdk/python/client/safe_client.py @@ -2,6 +2,8 @@ import cloud.blockstore.public.sdk.python.protos as protos +from google.protobuf.json_format import ParseDict + from .error import _handle_errors @@ -901,3 +903,69 @@ def cms_action( trace_id, request_timeout) return response + + @_handle_errors + def update_disk_registry_config_async( + self, + config, + idempotence_id=None, + timestamp=None, + trace_id=None, + request_timeout=None): + + request = ParseDict(config, protos.TUpdateDiskRegistryConfigRequest()) + + return self.__impl.update_disk_registry_config_async( + request, + idempotence_id, + timestamp, + trace_id, + request_timeout) + + @_handle_errors + def update_disk_registry_config( + self, + config, + idempotence_id=None, + timestamp=None, + trace_id=None, + request_timeout=None): + + request = ParseDict(config, protos.TUpdateDiskRegistryConfigRequest()) + + return self.__impl.update_disk_registry_config( + request, + idempotence_id, + timestamp, + trace_id, + request_timeout) + + @_handle_errors + def describe_disk_registry_config_async( + self, + idempotence_id=None, + timestamp=None, + trace_id=None, + request_timeout=None): + + return self.__impl.describe_disk_registry_config_async( + protos.TDescribeDiskRegistryConfigRequest(), + idempotence_id, + timestamp, + trace_id, + request_timeout) + + @_handle_errors + def describe_disk_registry_config( + self, + idempotence_id=None, + timestamp=None, + trace_id=None, + request_timeout=None): + + return self.__impl.describe_disk_registry_config( + protos.TDescribeDiskRegistryConfigRequest(), + idempotence_id, + timestamp, + trace_id, + request_timeout) diff --git a/cloud/blockstore/tests/external_endpoint/test.py b/cloud/blockstore/tests/external_endpoint/test.py new file mode 100644 index 00000000000..6699e801a82 --- /dev/null +++ b/cloud/blockstore/tests/external_endpoint/test.py @@ -0,0 +1,229 @@ +import os +import pytest +import requests +import tempfile +import time + +from cloud.blockstore.public.sdk.python.client import CreateClient, \ + ClientError +from cloud.blockstore.public.sdk.python.protos import TCmsActionRequest, \ + TAction, STORAGE_MEDIA_SSD_LOCAL, IPC_VHOST + +from cloud.blockstore.tests.python.lib.config import NbsConfigurator, \ + generate_disk_agent_txt +from cloud.blockstore.tests.python.lib.daemon import start_ydb, start_nbs, \ + start_disk_agent, get_fqdn + +import yatest.common as yatest_common + +from contrib.ydb.tests.library.common.yatest_common import PortManager +from contrib.ydb.tests.library.harness.kikimr_runner import get_unique_path_for_current_test, \ + ensure_path_exists + +from library.python.retry import retry + + +DEVICE_SIZE = 1024**2 +KNOWN_DEVICE_POOLS = { + "KnownDevicePools": [ + {"Name": "1Mb", "Kind": "DEVICE_POOL_KIND_LOCAL", "AllocationUnit": DEVICE_SIZE}, + ]} + + +@pytest.fixture(name='data_path') +def create_data_path(): + + p = get_unique_path_for_current_test( + output_path=yatest_common.output_path(), + sub_folder="data") + + p = os.path.join(p, "dev", "disk", "by-partlabel") + ensure_path_exists(p) + + return p + + +@pytest.fixture(autouse=True) +def create_device_files(data_path): + + for i in range(6): + with open(os.path.join(data_path, f"NVMENBS{i + 1:02}"), 'wb') as f: + f.seek(DEVICE_SIZE-1) + f.write(b'\0') + f.flush() + + +@pytest.fixture(name='disk_agent_configurator') +def create_disk_agent_configurator(ydb, data_path): + + configurator = NbsConfigurator(ydb, 'disk-agent') + configurator.generate_default_nbs_configs() + + disk_agent_config = generate_disk_agent_txt( + agent_id='', + device_erase_method='DEVICE_ERASE_METHOD_NONE', # speed up tests + storage_discovery_config={ + "PathConfigs": [{ + "PathRegExp": f"{data_path}/NVMENBS([0-9]+)", + "PoolConfigs": [{ + "PoolName": "1Mb", + "MinSize": DEVICE_SIZE, + "MaxSize": DEVICE_SIZE + }]}] + }) + + disk_agent_config.CachedSessionsPath = os.path.join( + get_unique_path_for_current_test(yatest_common.output_path(), ''), + "nbs-disk-agent-sessions.txt") + + configurator.files["disk-agent"] = disk_agent_config + + return configurator + + +@pytest.fixture(name='ydb') +def start_ydb_cluster(): + + ydb_cluster = start_ydb() + + yield ydb_cluster + + ydb_cluster.stop() + + +@pytest.fixture(name='vhost_server_port') +def setup_fake_vhost_server_port(): + return PortManager().get_port() + + +@pytest.fixture(name='nbs') +def start_nbs_daemon(ydb, vhost_server_port): + + cfg = NbsConfigurator(ydb) + cfg.generate_default_nbs_configs() + + server = cfg.files["server"].ServerConfig + server.VhostEnabled = True + server.VhostServerPath = yatest_common.binary_path( + "cloud/blockstore/tools/testing/fake-vhost-server/fake-vhost-server") + server.VhostServerExtArgs.extend(["--port", str(vhost_server_port)]) + + daemon = start_nbs(cfg) + + yield daemon + + daemon.kill() + + +@pytest.fixture(name='disk_agent') +def start_disk_agent_daemon(ydb, disk_agent_configurator): + + daemon = start_disk_agent(disk_agent_configurator) + + yield daemon + + daemon.kill() + + +@pytest.fixture(autouse=True) +def setup_env(nbs, disk_agent, data_path): + + client = CreateClient(f"localhost:{nbs.port}") + client.execute_action( + action="DiskRegistrySetWritableState", + input_bytes=str.encode('{"State": true}')) + + client.update_disk_registry_config(KNOWN_DEVICE_POOLS) + + disk_agent.wait_for_registration() + + request = TCmsActionRequest() + + for i in range(2): + action = request.Actions.add() + action.Type = TAction.ADD_DEVICE + action.Host = get_fqdn() + action.Device = os.path.join(data_path, f"NVMENBS{i + 1:02}") + + response = client.cms_action(request) + + assert len(response.ActionResults) == 2 + for r in response.ActionResults: + assert r.Result.Code == 0, r + + +def test_external_endpoint(nbs, vhost_server_port): + + client = CreateClient(f"localhost:{nbs.port}") + + @retry(max_times=10, exception=ClientError) + def create_vol0(): + client.create_volume( + disk_id="vol0", + block_size=4096, + blocks_count=2 * DEVICE_SIZE//4096, + storage_media_kind=STORAGE_MEDIA_SSD_LOCAL, + storage_pool_name="1Mb") + + vhost_server_url = f"http://localhost:{vhost_server_port}" + + @retry(max_times=10, exception=ClientError) + def wait_for_vhost_server(): + r = requests.get(f"{vhost_server_url}/ping").text + assert r == "pong" + + create_vol0() + + with tempfile.NamedTemporaryFile() as unix_socket: + r = client.start_endpoint( + unix_socket_path=unix_socket.name, + disk_id="vol0", + ipc_type=IPC_VHOST, + client_id="test" + ) + + assert r["Volume"].DiskId == "vol0" + + wait_for_vhost_server() + + r = requests.get(f"{vhost_server_url}/describe").json() + devices = r["devices"] + assert len(devices) == 2 + + for d in devices: + assert d[0].find("NVMENBS") != -1 + assert d[1] == DEVICE_SIZE + assert d[2] == 0 + + crit = nbs.counters.find({ + 'sensor': 'AppCriticalEvents/ExternalEndpointUnexpectedExit' + }) + assert crit is not None + assert crit['value'] == 0 + + try: + _ = requests.post(f"{vhost_server_url}/terminate", data='{"exit_code": 42}') + except Exception: + pass + + # wait for the counters to be updated + time.sleep(15) + + crit = nbs.counters.find({ + 'sensor': 'AppCriticalEvents/ExternalEndpointUnexpectedExit' + }) + assert crit is not None + assert crit['value'] == 1 + + wait_for_vhost_server() + + client.stop_endpoint(unix_socket.name) + + # wait for the counters to be updated + time.sleep(15) + + crit = nbs.counters.find({ + 'sensor': 'AppCriticalEvents/ExternalEndpointUnexpectedExit' + }) + assert crit is not None + assert crit['value'] == 1 # the same value diff --git a/cloud/blockstore/tests/external_endpoint/ya.make b/cloud/blockstore/tests/external_endpoint/ya.make new file mode 100644 index 00000000000..7f8d99fb188 --- /dev/null +++ b/cloud/blockstore/tests/external_endpoint/ya.make @@ -0,0 +1,27 @@ +PY3TEST() + +INCLUDE(${ARCADIA_ROOT}/cloud/storage/core/tests/recipes/medium.inc) + +TEST_SRCS(test.py) + +DEPENDS( + cloud/blockstore/apps/client + cloud/blockstore/apps/disk_agent + cloud/blockstore/apps/server + cloud/blockstore/tools/testing/fake-vhost-server + contrib/ydb/apps/ydbd +) + +DATA( + arcadia/cloud/blockstore/tests/certs/server.crt + arcadia/cloud/blockstore/tests/certs/server.key +) + +PEERDIR( + cloud/blockstore/tests/python/lib + contrib/python/requests/py3 + + library/python/retry +) + +END() diff --git a/cloud/blockstore/tests/ya.make b/cloud/blockstore/tests/ya.make index aaffe07e88e..d6b8f0bbc64 100644 --- a/cloud/blockstore/tests/ya.make +++ b/cloud/blockstore/tests/ya.make @@ -8,6 +8,7 @@ ENDIF() RECURSE( client cms + external_endpoint fio functional fuzzing diff --git a/cloud/blockstore/tools/testing/fake-vhost-server/__main__.py b/cloud/blockstore/tools/testing/fake-vhost-server/__main__.py index 4403a66751f..24740524e3c 100644 --- a/cloud/blockstore/tools/testing/fake-vhost-server/__main__.py +++ b/cloud/blockstore/tools/testing/fake-vhost-server/__main__.py @@ -38,9 +38,6 @@ def __init__(self, s: str): self.size = int(s[j+1:i]) self.path = s[:j] - def __str__(self): - return f"[{self.path} {self.size} {self.offset}]" - class _App: @@ -66,6 +63,15 @@ def do_GET(self): self.wfile.write('pong'.encode('utf-8')) return + if self.path == '/describe': + self.send_response(200) + self.send_header("Content-Type", "application/json") + self.end_headers() + self.wfile.write(json.dumps({ + "devices": [[d.path, d.size, d.offset] for d in args.device] + }).encode('utf-8')) + return + self.send_error(400) def do_POST(self): @@ -74,15 +80,15 @@ def do_POST(self): length = int(self.headers['content-length']) req = self.rfile.read(length).decode('utf-8') - if self.path != '/crash': - self.send_error(400) - return + if self.path == '/terminate': + data = json.loads(req) - data = json.loads(req) + exit_code = data.get('exit_code', -1) + logging.info(f'exit with {exit_code}...') + app.terminate(exit_code) + return - exit_code = data.get('exit_code', -1) - logging.info(f'exit with {exit_code}...') - app.terminate(exit_code) + self.send_error(400) return Handler diff --git a/cloud/blockstore/tools/testing/fake-vhost-server/ya.make b/cloud/blockstore/tools/testing/fake-vhost-server/ya.make index 586eb64696a..d3481ed7712 100644 --- a/cloud/blockstore/tools/testing/fake-vhost-server/ya.make +++ b/cloud/blockstore/tools/testing/fake-vhost-server/ya.make @@ -1,7 +1,6 @@ PY3_PROGRAM() PEERDIR( - # contrib/python/requests/py3 ) PY_SRCS(