Skip to content

Commit

Permalink
upon device allocation DiskRegistry should prefer agents and racks wi…
Browse files Browse the repository at this point in the history
…th the least amount of occupied space and only after that should take free space into account

[NBS] При аллокации места в DiskRegistry предпочитать стойки и ноды с меньшим кол-вом занятого места

Сейчас выбираем стойки и ноды с наибольшим количеством свободного места. Это плохо работает в ситуации, когда ёмкость нод неравномерна: есть ноды с 3 ТБ места, есть с 12 ТБ. В итоге мы битком забиваем ноды с 12 ТБ, и на них льётся непропорционально бОльшая нагрузка. Сначала надо выбирать по min occupied space, потом по max free space.
  • Loading branch information
qkrorlqr committed Dec 19, 2023
1 parent d65d595 commit 4f47a6c
Show file tree
Hide file tree
Showing 4 changed files with 173 additions and 50 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -410,9 +410,9 @@ Y_UNIT_TEST_SUITE(TDiskRegistryStateMirroredDisksTest)
UNIT_ASSERT_VALUES_EQUAL("dev-1", devices[0].GetDeviceName());
UNIT_ASSERT_VALUES_EQUAL("dev-2", devices[1].GetDeviceName());
UNIT_ASSERT_VALUES_EQUAL("dev-3", devices[2].GetDeviceName());
UNIT_ASSERT_VALUES_EQUAL("dev-4", devices[3].GetDeviceName());
UNIT_ASSERT_VALUES_EQUAL("dev-5", devices[4].GetDeviceName());
UNIT_ASSERT_VALUES_EQUAL("dev-6", devices[5].GetDeviceName());
UNIT_ASSERT_VALUES_EQUAL("dev-10", devices[3].GetDeviceName());
UNIT_ASSERT_VALUES_EQUAL("dev-11", devices[4].GetDeviceName());
UNIT_ASSERT_VALUES_EQUAL("dev-12", devices[5].GetDeviceName());
UNIT_ASSERT_VALUES_EQUAL(1, replicas.size());
UNIT_ASSERT_VALUES_EQUAL(6, replicas[0].size());
UNIT_ASSERT_VALUES_EQUAL("dev-7", replicas[0][0].GetDeviceName());
Expand Down
101 changes: 62 additions & 39 deletions cloud/blockstore/libs/storage/disk_registry/model/device_list.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ TDeviceList::TDeviceList(

void TDeviceList::UpdateDevices(const NProto::TAgentConfig& agent, TNodeId prevNodeId)
{
FreeDevices.erase(prevNodeId);
NodeDevices.erase(prevNodeId);
UpdateDevices(agent);
}

Expand All @@ -113,15 +113,16 @@ void TDeviceList::UpdateDevices(const NProto::TAgentConfig& agent)
return;
}

auto& freeDevices = FreeDevices[agent.GetNodeId()];
freeDevices.Devices.clear();
freeDevices.Rack.clear();
auto& nodeDevices = NodeDevices[agent.GetNodeId()];
nodeDevices.FreeDevices.clear();
nodeDevices.Rack.clear();
nodeDevices.TotalSize = 0;

for (const auto& device: agent.GetDevices()) {
if (device.GetState() == NProto::DEVICE_STATE_ONLINE
&& !device.GetRack().empty())
{
freeDevices.Rack = device.GetRack();
nodeDevices.Rack = device.GetRack();
break;
}
}
Expand All @@ -140,7 +141,7 @@ void TDeviceList::UpdateDevices(const NProto::TAgentConfig& agent)
const auto& uuid = device.GetDeviceUUID();
UpdateInAllDevices(uuid, device);

if (device.GetRack() != freeDevices.Rack) {
if (device.GetRack() != nodeDevices.Rack) {
continue;
}

Expand All @@ -150,7 +151,7 @@ void TDeviceList::UpdateDevices(const NProto::TAgentConfig& agent)
!DirtyDevices.contains(uuid) &&
!SuspendedDevices.contains(uuid))
{
freeDevices.Devices.push_back(device);
nodeDevices.FreeDevices.push_back(device);
}

auto& poolNames = PoolKind2PoolNames[device.GetPoolKind()];
Expand All @@ -159,14 +160,17 @@ void TDeviceList::UpdateDevices(const NProto::TAgentConfig& agent)
if (it == poolNames.end()) {
poolNames.push_back(device.GetPoolName());
}

nodeDevices.TotalSize +=
device.GetBlockSize() * device.GetBlocksCount();
}

SortBy(freeDevices.Devices, TBySortQueryKey());
SortBy(nodeDevices.FreeDevices, TBySortQueryKey());
}

void TDeviceList::RemoveDevices(const NProto::TAgentConfig& agent)
{
FreeDevices.erase(agent.GetNodeId());
NodeDevices.erase(agent.GetNodeId());

for (const auto& device: agent.GetDevices()) {
const auto& uuid = device.GetDeviceUUID();
Expand Down Expand Up @@ -220,16 +224,16 @@ NProto::TDeviceConfig TDeviceList::AllocateDevice(
const TDiskId& diskId,
const TAllocationQuery& query)
{
for (auto& kv: FreeDevices) {
for (auto& kv: NodeDevices) {
if (!query.NodeIds.empty() && !query.NodeIds.contains(kv.first)) {
continue;
}

const ui32 nodeId = kv.first;
auto& freeDevices = kv.second;
auto& nodeDevices = kv.second;

const auto& currentRack = freeDevices.Rack;
auto& devices = freeDevices.Devices;
const auto& currentRack = nodeDevices.Rack;
auto& devices = nodeDevices.FreeDevices;

if (devices.empty() || query.ForbiddenRacks.contains(currentRack)) {
continue;
Expand Down Expand Up @@ -336,24 +340,24 @@ bool TDeviceList::ValidateAllocationQuery(
return false;
}

const auto freeItr = FreeDevices.find(node);
if (freeItr == FreeDevices.end()) {
const auto nodeItr = NodeDevices.find(node);
if (nodeItr == NodeDevices.end()) {
return false;
}

const TFreeDevices& freeDevices = freeItr->second;
const TNodeDevices& nodeDevices = nodeItr->second;

if (query.ForbiddenRacks.contains(freeDevices.Rack)) {
if (query.ForbiddenRacks.contains(nodeDevices.Rack)) {
return false;
}

const auto freeDeviceItr = FindIf(
freeDevices.Devices,
nodeDevices.FreeDevices,
[&targetDeviceId] (const NProto::TDeviceConfig& device) {
return device.GetDeviceUUID() == targetDeviceId;
});

if (freeDeviceItr == freeDevices.Devices.end()) {
if (freeDeviceItr == nodeDevices.FreeDevices.end()) {
return false;
}

Expand All @@ -373,8 +377,10 @@ void TDeviceList::MarkDeviceAllocated(const TDiskId& diskId, const TDeviceId& id
AllocatedDevices.emplace(id, diskId);
}

// returns a list of racks sorted by preference and then by free space
// the nodes in each rack are sorted by free space
// returns a list of racks sorted by preference and then by occupied space ASC
// then by free space DESC
// the nodes in each rack are sorted by occupied space ASC then by free space
// DESC
auto TDeviceList::SelectRacks(
const TAllocationQuery& query,
const TString& poolName) const -> TVector<TRack>
Expand All @@ -388,39 +394,53 @@ auto TDeviceList::SelectRacks(

auto& rack = racks[currentRack];
rack.Id = currentRack;
rack.Nodes.push_back({nodeId, 0});
rack.Nodes.push_back({nodeId, 0, 0});
rack.Preferred = query.PreferredRacks.contains(currentRack);
};

if (!query.NodeIds.empty()) {
for (ui32 id: query.NodeIds) {
if (auto* freeDevices = FreeDevices.FindPtr(id)) {
appendNode(freeDevices->Rack, id);
if (auto* nodeDevices = NodeDevices.FindPtr(id)) {
appendNode(nodeDevices->Rack, id);
}
}
} else {
for (auto& [nodeId, freeDevices]: FreeDevices) {
appendNode(freeDevices.Rack, nodeId);
for (auto& [nodeId, nodeDevices]: NodeDevices) {
appendNode(nodeDevices.Rack, nodeId);
}
}

for (auto& [id, rack]: racks) {
ui64 rackTotalSpace = 0;

for (auto& node: rack.Nodes) {
const auto* freeDevices = FreeDevices.FindPtr(node.NodeId);
Y_ABORT_UNLESS(freeDevices);
const auto* nodeDevices = NodeDevices.FindPtr(node.NodeId);
Y_ABORT_UNLESS(nodeDevices);

auto r = FindDeviceRange(query, poolName, freeDevices->Devices);
auto r = FindDeviceRange(query, poolName, nodeDevices->FreeDevices);
node.OccupiedSpace = nodeDevices->TotalSize;
rackTotalSpace += nodeDevices->TotalSize;

for (const auto& device: MakeIteratorRange(r)) {
const auto s = device.GetBlockSize() * device.GetBlocksCount();
rack.FreeSpace += s;
Y_DEBUG_ABORT_UNLESS(node.OccupiedSpace >= s);
node.OccupiedSpace -= s;
node.FreeSpace += s;
}
}

SortBy(rack.Nodes, [] (const TNodeInfo& node) {
return Max<ui64>() - node.FreeSpace;
});
Sort(
rack.Nodes,
[] (const TNodeInfo& lhs, const TNodeInfo& rhs) {
if (lhs.OccupiedSpace != rhs.OccupiedSpace) {
return lhs.OccupiedSpace < rhs.OccupiedSpace;
}

return lhs.FreeSpace > rhs.FreeSpace;
});

rack.OccupiedSpace = rackTotalSpace - rack.FreeSpace;
}

TVector<TRack*> bySpace;
Expand All @@ -436,6 +456,9 @@ auto TDeviceList::SelectRacks(
if (lhs->Preferred != rhs->Preferred) {
return lhs->Preferred > rhs->Preferred;
}
if (lhs->OccupiedSpace != rhs->OccupiedSpace) {
return lhs->OccupiedSpace < rhs->OccupiedSpace;
}
if (lhs->FreeSpace != rhs->FreeSpace) {
return lhs->FreeSpace > rhs->FreeSpace;
}
Expand Down Expand Up @@ -465,13 +488,13 @@ TVector<TDeviceList::TDeviceRange> TDeviceList::CollectDevices(

for (const auto& rack: SelectRacks(query, poolName)) {
for (const auto& node: rack.Nodes) {
const auto* freeDevices = FreeDevices.FindPtr(node.NodeId);
Y_ABORT_UNLESS(freeDevices);
const auto* nodeDevices = NodeDevices.FindPtr(node.NodeId);
Y_ABORT_UNLESS(nodeDevices);

// finding free devices belonging to this node that match our
// query
auto [begin, end] =
FindDeviceRange(query, poolName, freeDevices->Devices);
FindDeviceRange(query, poolName, nodeDevices->FreeDevices);

using TDeviceIter = decltype(begin);
struct TDeviceInfo
Expand Down Expand Up @@ -513,7 +536,7 @@ TVector<TDeviceList::TDeviceRange> TDeviceList::CollectDevices(
for (; it != deviceInfo.Range.second; ++it) {
const auto& device = *it;

Y_DEBUG_ABORT_UNLESS(device.GetRack() == freeDevices->Rack);
Y_DEBUG_ABORT_UNLESS(device.GetRack() == nodeDevices->Rack);

const ui64 size = device.GetBlockSize() * device.GetBlocksCount();

Expand Down Expand Up @@ -595,10 +618,10 @@ TVector<NProto::TDeviceConfig> TDeviceList::AllocateDevices(
return l.first > r.first;
});

auto& freeDevices = FreeDevices[nodeId];
auto& nodeDevices = NodeDevices[nodeId];

for (const auto& arange: aranges) {
freeDevices.Devices.erase(arange.first, arange.second);
nodeDevices.FreeDevices.erase(arange.first, arange.second);
}
}

Expand Down Expand Up @@ -644,7 +667,7 @@ void TDeviceList::RemoveDeviceFromFreeList(const TDeviceId& id)
auto nodeId = FindNodeId(id);

if (nodeId) {
auto& devices = FreeDevices[nodeId].Devices;
auto& devices = NodeDevices[nodeId].FreeDevices;

auto it = FindIf(devices, [&] (const auto& x) {
return x.GetDeviceUUID() == id;
Expand Down
10 changes: 7 additions & 3 deletions cloud/blockstore/libs/storage/disk_registry/model/device_list.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,22 +25,26 @@ class TDeviceList
{
TNodeId NodeId = 0;
ui64 FreeSpace = 0;
ui64 OccupiedSpace = 0;
};

// Per-rack summary assembled by SelectRacks; the resulting list of racks is
// ordered by Preferred first, then by OccupiedSpace ascending, then by
// FreeSpace descending.
struct TRack
{
// rack name, taken from the Rack recorded for the rack's nodes
TString Id;
// nodes that belong to this rack; SelectRacks sorts them by OccupiedSpace
// ascending and then by FreeSpace descending
TVector<TNodeInfo> Nodes;
// sum of BlockSize * BlocksCount over the free devices of all nodes in the
// rack that match the allocation query and pool name
ui64 FreeSpace = 0;
// sum of the nodes' TotalSize minus FreeSpace
ui64 OccupiedSpace = 0;
// true if the rack is listed in the query's PreferredRacks
bool Preferred = false;
};

struct TFreeDevices
struct TNodeDevices
{
TString Rack;

// sorted by {PoolKind, BlockSize}
TVector<NProto::TDeviceConfig> Devices;
TVector<NProto::TDeviceConfig> FreeDevices;

ui64 TotalSize = 0;
};

using TDeviceRange = std::tuple<
Expand All @@ -50,7 +54,7 @@ class TDeviceList

private:
THashMap<TDeviceId, NProto::TDeviceConfig> AllDevices;
THashMap<TNodeId, TFreeDevices> FreeDevices;
THashMap<TNodeId, TNodeDevices> NodeDevices;
THashMap<TDeviceId, TDiskId> AllocatedDevices;
THashSet<TDeviceId> DirtyDevices;
THashMap<TDeviceId, NProto::TSuspendedDevice> SuspendedDevices;
Expand Down
Loading

0 comments on commit 4f47a6c

Please sign in to comment.