Skip to content

Commit

Permalink
added test_disable_io_for_broken_devices
Browse files Browse the repository at this point in the history
  • Loading branch information
sharpeye committed Nov 6, 2024
1 parent 2b6bf5b commit 1e1a3b6
Show file tree
Hide file tree
Showing 5 changed files with 123 additions and 4 deletions.
9 changes: 9 additions & 0 deletions cloud/blockstore/config/disk.proto
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,15 @@ message TDiskAgentConfig

// Disable devices that have been recognized as broken by the DR
optional bool DisableBrokenDevices = 36;

// Path to serial number mapping. For testing purposes only.
message TPathToSerialNumber
{
// Device path; TInitializer uses it as the key when filling PathToSerial.
required string Path = 1;
// Serial number stored for Path (the mapped value in PathToSerial).
required string SerialNumber = 2;
}

repeated TPathToSerialNumber PathToSerialNumberMapping = 37;
}

////////////////////////////////////////////////////////////////////////////////
Expand Down
5 changes: 5 additions & 0 deletions cloud/blockstore/libs/storage/disk_agent/model/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,11 @@ class TDiskAgentConfig
return Config.GetDevicesWithSuspendedIO();
}

// Forwards to Config.GetPathToSerialNumberMapping() and returns the result
// by const reference. The proto field is documented as test-only.
const auto& GetPathToSerialNumberMapping() const
{
return Config.GetPathToSerialNumberMapping();
}

void Dump(IOutputStream& out) const;
void DumpHtml(IOutputStream& out) const;
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,10 @@ TInitializer::TInitializer(
, StorageProvider(std::move(storageProvider))
, NvmeManager(std::move(nvmeManager))
{
for (const auto& m: AgentConfig->GetPathToSerialNumberMapping()) {
PathToSerial.emplace(m.GetPath(), m.GetSerialNumber());
}

auto fileDevices = AgentConfig->GetFileDevices();

FileDevices.assign(
Expand Down
101 changes: 98 additions & 3 deletions cloud/blockstore/tests/disk_agent_config/test.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
import json
import logging
import os
import pytest
import requests
import time

from cloud.blockstore.public.sdk.python.client import CreateClient, Session
from cloud.blockstore.public.sdk.python.client.error import ClientError
from cloud.blockstore.public.sdk.python.client.error_codes import EResult
from cloud.blockstore.public.sdk.python.protos import TCmsActionRequest, \
TAction, STORAGE_MEDIA_SSD_NONREPLICATED
from cloud.blockstore.config.disk_pb2 import DISK_AGENT_BACKEND_NULL
Expand All @@ -13,14 +16,18 @@
generate_disk_agent_txt
from cloud.blockstore.tests.python.lib.daemon import start_ydb, start_nbs, \
start_disk_agent
from cloud.blockstore.config.disk_pb2 import TDiskAgentConfig

import yatest.common as yatest_common

from contrib.ydb.tests.library.harness.kikimr_runner import \
get_unique_path_for_current_test, ensure_path_exists

from google.protobuf.text_format import MessageToString, Parse


DEVICE_SIZE = 1024 ** 3 # 1 GiB
DEVICES_PER_PATH = 6

KNOWN_DEVICE_POOLS = {
"KnownDevicePools": [
Expand Down Expand Up @@ -91,7 +98,7 @@ def create_device_files(data_path, agent_ids):
for agent_id in agent_ids:
p = _get_agent_data_path(agent_id, data_path)
with open(os.path.join(p, 'NVMENBS01'), 'wb') as f:
os.truncate(f.fileno(), 6 * (DEVICE_SIZE + 4096))
os.truncate(f.fileno(), DEVICES_PER_PATH * (DEVICE_SIZE + 4096))


def _create_disk_agent_configurator(ydb, agent_id, data_path):
Expand All @@ -116,6 +123,16 @@ def _create_disk_agent_configurator(ydb, agent_id, data_path):
}]}
]})

caches = os.path.join(
get_unique_path_for_current_test(
output_path=yatest_common.output_path(),
sub_folder="caches"),
agent_id)
ensure_path_exists(caches)

disk_agent_config.CachedConfigPath = os.path.join(caches, 'config.txt')
disk_agent_config.DisableBrokenDevices = True

configurator.files["disk-agent"] = disk_agent_config
configurator.files["location"].Rack = 'c:RACK'

Expand Down Expand Up @@ -170,7 +187,7 @@ def test_change_rack(nbs, agent_ids, disk_agent_configurators):

assert len(bkp['Agents']) == len(agent_ids)
for agent in bkp['Agents']:
assert len(agent['Devices']) == 6
assert len(agent['Devices']) == DEVICES_PER_PATH
assert agent.get('State') is None # online
for d in agent['Devices']:
assert d.get('State') is None # online
Expand Down Expand Up @@ -224,7 +241,7 @@ def test_change_rack(nbs, agent_ids, disk_agent_configurators):

for agent_id, agent in zip(agent_ids, bkp['Agents']):
assert agent_id == agent['AgentId']
assert len(agent['Devices']) == 6
assert len(agent['Devices']) == DEVICES_PER_PATH
assert agent.get('State') is None # online

for d in agent['Devices']:
Expand Down Expand Up @@ -309,3 +326,81 @@ def test_disable_node_broker_registration(nbs, agent_ids, disk_agent_configurato

for agent in agents:
agent.kill()


def test_disable_io_for_broken_devices(
        nbs,
        data_path,
        agent_ids,
        disk_agent_configurators):
    """Check that IO fails with E_IO_SILENT after a device's serial number changes.

    Scenario:
      1. Start the first agent with NVMENBS01 mapped to serial number 'SN'
         and register it via _add_host.
      2. Create a nonreplicated volume, mount it and verify that plain
         write/read requests work.
      3. Kill the agent and issue an async read, which must stay pending.
      4. Change the mapped serial number to 'XXX' and start the agent again.
      5. The pending read must fail with E_IO_SILENT and the backup must
         report the new serial number for every device of the agent.
    """

    agent_id = agent_ids[0]
    configurator = disk_agent_configurators[0]

    # Map the device file of the first agent to a fake serial number.
    m = configurator.files["disk-agent"].PathToSerialNumberMapping.add()
    m.Path = os.path.join(_get_agent_data_path(agent_id, data_path), 'NVMENBS01')
    m.SerialNumber = 'SN'

    logger = logging.getLogger("client")
    logger.setLevel(logging.DEBUG)

    client = CreateClient(f"localhost:{nbs.port}", log=logger)

    # run an agent
    agent = start_disk_agent(configurator, name=agent_id)

    agent.wait_for_registration()
    r = _add_host(client, agent_id)
    assert len(r.ActionResults) == 1
    assert r.ActionResults[0].Result.Code == 0

    # create a volume
    client.create_volume(
        disk_id="vol1",
        block_size=4096,
        blocks_count=DEVICE_SIZE//4096,
        storage_media_kind=STORAGE_MEDIA_SSD_NONREPLICATED,
        cloud_id="test")

    bkp = _backup(client)
    assert len(bkp['Disks']) == 1
    assert len(bkp['Disks'][0]['DeviceUUIDs']) == 1
    assert len(bkp['Agents']) == 1
    for d in bkp['Agents'][0]['Devices']:
        assert d.get('SerialNumber') == 'SN'

    session = Session(client, "vol1", "")
    session.mount_volume()
    session.write_blocks(0, [b'\1' * 4096])
    blocks = session.read_blocks(0, 1, checkpoint_id="")
    assert len(blocks) == 1

    # stop the agent
    agent.kill()

    # start an IO operation; it must stay pending while the agent is down
    future = session.read_blocks_async(0, 1, checkpoint_id="")

    assert not future.done()
    time.sleep(5)
    assert not future.done()

    # change the serial number of NVMENBS01
    m.SerialNumber = 'XXX'

    # restart the agent. It resets the serial number of uuid-1
    agent = start_disk_agent(configurator, name=agent_id+'.new')
    agent.wait_for_registration()

    # IOs should result in E_IO_SILENT
    with pytest.raises(ClientError) as exc_info:
        future.result()
    assert exc_info.value.code == EResult.E_IO_SILENT.value

    session.unmount_volume()

    bkp = _backup(client)
    for d in bkp['Agents'][0]['Devices']:
        assert d.get('SerialNumber') == 'XXX'

    # Do not leave the restarted agent running after the test.
    agent.kill()
8 changes: 7 additions & 1 deletion cloud/blockstore/tests/python/lib/daemon.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,9 +96,10 @@ def counters(self):

class DiskAgent(Daemon):

def __init__(self, mon_port, server_port, commands, cwd):
def __init__(self, mon_port, server_port, commands, cwd, config_path):
self.__mon_port = mon_port
self.__port = server_port
self.__config_path = config_path

super(DiskAgent, self).__init__(
commands=commands,
Expand All @@ -116,6 +117,10 @@ def port(self):
def mon_port(self):
return self.__mon_port

@property
def config_path(self):
return self.__config_path

@property
def counters(self):
return _get_counters(self.mon_port)
Expand Down Expand Up @@ -194,6 +199,7 @@ def start_disk_agent(config: NbsConfigurator, name='disk-agent'):
server_port=config.server_port,
commands=[commands],
cwd=cwd,
config_path=config_path,
)

agent.start()
Expand Down

0 comments on commit 1e1a3b6

Please sign in to comment.