Skip to content

Commit

Permalink
NBS-4681: cache DA sessions in a file (#41)
Browse files Browse the repository at this point in the history
  • Loading branch information
sharpeye committed Jan 9, 2024
1 parent f799963 commit a3c1cb3
Show file tree
Hide file tree
Showing 17 changed files with 828 additions and 28 deletions.
34 changes: 34 additions & 0 deletions cloud/blockstore/config/disk.proto
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,40 @@ message TDiskAgentConfig

// The path where Disk Agent will store the cached config.
optional string CachedConfigPath = 27;

// The path where Disk Agent will store active sessions.
optional string CachedSessionsPath = 28;
}

////////////////////////////////////////////////////////////////////////////////

// A single Disk Agent device session as stored in the session cache
// (see TDiskAgentDeviceSessionCache). When the cache is restored, the fields
// below are passed back to the device client to re-acquire the devices.
message TDiskAgentDeviceSession
{
// Owner of the session (the client id the devices were acquired for).
optional string ClientId = 1;

// List of device UUIDs that belong to the session.
repeated string DeviceIds = 2;

// Access mode. On restore, true is mapped to VOLUME_ACCESS_READ_ONLY and
// false (or unset) to VOLUME_ACCESS_READ_WRITE.
optional bool ReadOnly = 3;

// MountSeqNumber from volume state.
optional uint64 MountSeqNumber = 4;

// Disk id.
optional string DiskId = 5;

// Volume generation.
optional uint32 VolumeGeneration = 6;

// Last activity timestamp of the session (in microseconds).
// On restore it is used as the acquire time of the session.
optional uint64 LastActivityTs = 7;
};

// Top-level message of the session cache file (TDiskAgentConfig's
// CachedSessionsPath). Disk Agent serializes it in protobuf text format.
message TDiskAgentDeviceSessionCache
{
// All sessions known to the device client at the moment of the last update.
repeated TDiskAgentDeviceSession Sessions = 1;
}

////////////////////////////////////////////////////////////////////////////////
Expand Down
3 changes: 3 additions & 0 deletions cloud/blockstore/libs/diagnostics/critical_events.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,9 @@ namespace NCloud::NBlockStore {
xxx(DiskAgentConfigMismatch) \
xxx(DiskRegistryDeviceNotFoundSoft) \
xxx(DiskRegistrySourceDiskNotFound) \
xxx(EndpointSwitchFailure) \
xxx(DiskAgentSessionCacheUpdateError) \
xxx(DiskAgentSessionCacheRestoreError) \
// BLOCKSTORE_CRITICAL_EVENTS

#define BLOCKSTORE_IMPOSSIBLE_EVENTS(xxx) \
Expand Down
123 changes: 113 additions & 10 deletions cloud/blockstore/libs/storage/disk_agent/disk_agent_state.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,16 @@

#include <cloud/storage/core/libs/common/error.h>
#include <cloud/storage/core/libs/common/format.h>
#include <cloud/storage/core/libs/common/proto_helpers.h>
#include <cloud/storage/core/libs/common/sglist.h>
#include <cloud/storage/core/libs/common/thread.h>
#include <cloud/storage/core/libs/common/verify.h>

#include <library/cpp/monlib/service/pages/templates.h>
#include <library/cpp/protobuf/util/pb_io.h>

#include <util/string/join.h>
#include <util/system/fs.h>
#include <util/system/hostname.h>

namespace NCloud::NBlockStore::NStorage {
Expand Down Expand Up @@ -252,6 +256,7 @@ TDiskAgentState::TDiskAgentState(
, ProfileLog(std::move(profileLog))
, BlockDigestGenerator(std::move(blockDigestGenerator))
, Logging(std::move(logging))
, Log(Logging->CreateLog("BLOCKSTORE_DISK_AGENT"))
, RdmaServer(std::move(rdmaServer))
, NvmeManager(std::move(nvmeManager))
{
Expand Down Expand Up @@ -455,6 +460,8 @@ TFuture<TInitializeResult> TDiskAgentState::Initialize(
Logging->CreateLog("BLOCKSTORE_DISK_AGENT"));

InitRdmaTarget(std::move(rdmaTargetConfig));

RestoreSessions(*DeviceClient);
});
}

Expand Down Expand Up @@ -751,18 +758,16 @@ void TDiskAgentState::AcquireDevices(
const TString& diskId,
ui32 volumeGeneration)
{
auto error = DeviceClient->AcquireDevices(
CheckError(DeviceClient->AcquireDevices(
uuids,
clientId,
now,
accessMode,
mountSeqNumber,
diskId,
volumeGeneration);
volumeGeneration));

if (HasError(error)) {
ythrow TServiceError(error);
}
UpdateSessionCache(*DeviceClient);
}

void TDiskAgentState::ReleaseDevices(
Expand All @@ -771,15 +776,13 @@ void TDiskAgentState::ReleaseDevices(
const TString& diskId,
ui32 volumeGeneration)
{
auto error = DeviceClient->ReleaseDevices(
CheckError(DeviceClient->ReleaseDevices(
uuids,
clientId,
diskId,
volumeGeneration);
volumeGeneration));

if (HasError(error)) {
ythrow TServiceError(error);
}
UpdateSessionCache(*DeviceClient);
}

void TDiskAgentState::DisableDevice(const TString& uuid)
Expand Down Expand Up @@ -815,4 +818,104 @@ void TDiskAgentState::StopTarget()
}
}

// Dumps the sessions currently held by `client` into the file configured via
// CachedSessionsPath.
//
// Does nothing (apart from an info log line) when no path is configured.
// Any exception raised while collecting, serializing or renaming is caught
// here: it is logged, reported through the
// DiskAgentSessionCacheUpdateError critical event, and not rethrown.
void TDiskAgentState::UpdateSessionCache(TDeviceClient& client) const
{
    const auto cachePath = AgentConfig->GetCachedSessionsPath();
    if (cachePath.empty()) {
        STORAGE_INFO("Session cache is not configured.");
        return;
    }

    try {
        auto activeSessions = client.GetSessions();

        // Move the collected sessions into the proto instead of copying them.
        NProto::TDiskAgentDeviceSessionCache cache;
        cache.MutableSessions()->Assign(
            std::make_move_iterator(activeSessions.begin()),
            std::make_move_iterator(activeSessions.end()));

        // The cache is first written to a sibling ".tmp" file and then renamed
        // over the real path.
        // NOTE(review): presumably this keeps readers from observing a
        // half-written cache - confirm the replace semantics of NFs::Rename.
        const TString stagingPath {cachePath + ".tmp"};
        SerializeToTextFormat(cache, stagingPath);

        const bool renamed = NFs::Rename(stagingPath, cachePath);
        if (!renamed) {
            // Capture errno right away, before any other call can change it.
            const auto ec = errno;
            ythrow TServiceError {MAKE_SYSTEM_ERROR(ec)} << strerror(ec);
        }
    } catch (...) {
        STORAGE_ERROR("Can't update session cache: " << CurrentExceptionMessage());
        ReportDiskAgentSessionCacheUpdateError();
    }
}

// Reads the session cache file (CachedSessionsPath) and re-acquires every
// cached session on `client`.
//
// Returns early, with an info log line, when no path is configured or the
// file does not exist. A session that fails to be acquired is logged and
// released again; after the loop a single
// DiskAgentSessionCacheRestoreError critical event is reported if at least
// one session failed. Exceptions (e.g. from parsing the file) are caught
// here, logged, reported through the same critical event, and not rethrown.
void TDiskAgentState::RestoreSessions(TDeviceClient& client) const
{
    const auto cachePath = AgentConfig->GetCachedSessionsPath();
    if (cachePath.empty()) {
        STORAGE_INFO("Session cache is not configured.");
        return;
    }

    if (!NFs::Exists(cachePath)) {
        STORAGE_INFO("Session cache is empty.");
        return;
    }

    try {
        NProto::TDiskAgentDeviceSessionCache cache;
        ParseProtoTextFromFileRobust(cachePath, cache);

        auto& cachedSessions = *cache.MutableSessions();

        // Log the sessions before their device ids are moved out below.
        STORAGE_INFO("Found " << cachedSessions.size()
            << " sessions in the session cache: " << JoinSeq(" ", cachedSessions));

        int failedCount = 0;

        for (auto& cached: cachedSessions) {
            // The proto is local to this function, so the device ids can be
            // moved out of it rather than copied.
            auto* deviceIds = cached.MutableDeviceIds();
            TVector<TString> uuids(
                std::make_move_iterator(deviceIds->begin()),
                std::make_move_iterator(deviceIds->end()));

            // The cached timestamp is stored in microseconds.
            const auto lastActivity =
                TInstant::MicroSeconds(cached.GetLastActivityTs());

            const auto accessMode = cached.GetReadOnly()
                ? NProto::VOLUME_ACCESS_READ_ONLY
                : NProto::VOLUME_ACCESS_READ_WRITE;

            const auto error = client.AcquireDevices(
                uuids,
                cached.GetClientId(),
                lastActivity,
                accessMode,
                cached.GetMountSeqNumber(),
                cached.GetDiskId(),
                cached.GetVolumeGeneration());

            if (!HasError(error)) {
                continue;
            }

            ++failedCount;

            STORAGE_ERROR("Can't restore session "
                << cached.GetClientId().Quote() << " from the cache: "
                << FormatError(error));

            // Release whatever the failed acquire may have left behind.
            // The result of this call is intentionally not inspected.
            client.ReleaseDevices(
                uuids,
                cached.GetClientId(),
                cached.GetDiskId(),
                cached.GetVolumeGeneration());
        }

        if (failedCount) {
            ReportDiskAgentSessionCacheRestoreError(
                "some sessions have not recovered");
        }
    } catch (...) {
        STORAGE_ERROR("Can't restore sessions from the cache: "
            << CurrentExceptionMessage());
        ReportDiskAgentSessionCacheRestoreError();
    }
}

} // namespace NCloud::NBlockStore::NStorage
4 changes: 4 additions & 0 deletions cloud/blockstore/libs/storage/disk_agent/disk_agent_state.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ class TDiskAgentState
const IBlockDigestGeneratorPtr BlockDigestGenerator;

ILoggingServicePtr Logging;
TLog Log;
NSpdk::ISpdkTargetPtr SpdkTarget;
NRdma::IServerPtr RdmaServer;
IRdmaTargetPtr RdmaTarget;
Expand Down Expand Up @@ -156,6 +157,9 @@ class TDiskAgentState
NThreading::TFuture<TInitializeResult> InitAioStorage();

void InitRdmaTarget(TRdmaTargetConfig rdmaTargetConfig);

void UpdateSessionCache(TDeviceClient& client) const;
void RestoreSessions(TDeviceClient& client) const;
};

} // namespace NCloud::NBlockStore::NStorage
Loading

0 comments on commit a3c1cb3

Please sign in to comment.