diff --git a/cloud/filestore/libs/storage/tablet/tablet_actor_cleanup.cpp b/cloud/filestore/libs/storage/tablet/tablet_actor_cleanup.cpp index 40958a9cbbe..30dad1a6304 100644 --- a/cloud/filestore/libs/storage/tablet/tablet_actor_cleanup.cpp +++ b/cloud/filestore/libs/storage/tablet/tablet_actor_cleanup.cpp @@ -41,6 +41,10 @@ void TIndexTabletActor::HandleCleanup( }; if (!CompactionStateLoadStatus.Finished) { + if (BlobIndexOpState.GetOperationState() == EOperationState::Enqueued) { + BlobIndexOpState.Complete(); + } + replyError(MakeError(E_TRY_AGAIN, "compaction state not loaded yet")); return; } diff --git a/cloud/filestore/libs/storage/tablet/tablet_actor_compaction.cpp b/cloud/filestore/libs/storage/tablet/tablet_actor_compaction.cpp index 87d950b5ac8..f6c8e4e6554 100644 --- a/cloud/filestore/libs/storage/tablet/tablet_actor_compaction.cpp +++ b/cloud/filestore/libs/storage/tablet/tablet_actor_compaction.cpp @@ -411,6 +411,10 @@ void TIndexTabletActor::HandleCompaction( }; if (!CompactionStateLoadStatus.Finished) { + if (BlobIndexOpState.GetOperationState() == EOperationState::Enqueued) { + BlobIndexOpState.Complete(); + } + replyError(MakeError(E_TRY_AGAIN, "compaction state not loaded yet")); return; } diff --git a/cloud/filestore/libs/storage/tablet/tablet_actor_counters.cpp b/cloud/filestore/libs/storage/tablet/tablet_actor_counters.cpp index 1bb30424a65..bd2198997b7 100644 --- a/cloud/filestore/libs/storage/tablet/tablet_actor_counters.cpp +++ b/cloud/filestore/libs/storage/tablet/tablet_actor_counters.cpp @@ -352,6 +352,12 @@ void TIndexTabletActor::HandleGetStorageStats( } } + stats->SetFlushState(static_cast(FlushState.GetOperationState())); + stats->SetBlobIndexOpState(static_cast( + BlobIndexOpState.GetOperationState())); + stats->SetCollectGarbageState(static_cast( + CollectGarbageState.GetOperationState())); + NCloud::Reply(ctx, *ev, std::move(response)); } diff --git a/cloud/filestore/libs/storage/tablet/tablet_actor_flush.cpp b/cloud/filestore/libs/storage/tablet/tablet_actor_flush.cpp index baf5ff40950..913f71c205b 100644 --- a/cloud/filestore/libs/storage/tablet/tablet_actor_flush.cpp +++ b/cloud/filestore/libs/storage/tablet/tablet_actor_flush.cpp @@ -271,6 +271,10 @@ void TIndexTabletActor::HandleFlush( }; if (!CompactionStateLoadStatus.Finished) { + if (FlushState.GetOperationState() == EOperationState::Enqueued) { + FlushState.Complete(); + } + replyError( ctx, *ev, diff --git a/cloud/filestore/libs/storage/tablet/tablet_actor_flush_bytes.cpp b/cloud/filestore/libs/storage/tablet/tablet_actor_flush_bytes.cpp index 59a3bdd106b..6740e521cbc 100644 --- a/cloud/filestore/libs/storage/tablet/tablet_actor_flush_bytes.cpp +++ b/cloud/filestore/libs/storage/tablet/tablet_actor_flush_bytes.cpp @@ -490,6 +490,10 @@ void TIndexTabletActor::HandleFlushBytes( }; if (!CompactionStateLoadStatus.Finished) { + if (BlobIndexOpState.GetOperationState() == EOperationState::Enqueued) { + BlobIndexOpState.Complete(); + } + replyError( ctx, *ev, diff --git a/cloud/filestore/libs/storage/tablet/tablet_ut_data.cpp b/cloud/filestore/libs/storage/tablet/tablet_ut_data.cpp index f7c3133ec3a..85214e8961f 100644 --- a/cloud/filestore/libs/storage/tablet/tablet_ut_data.cpp +++ b/cloud/filestore/libs/storage/tablet/tablet_ut_data.cpp @@ -2,6 +2,7 @@ #include "tablet_schema.h" #include +#include #include #include @@ -2882,6 +2883,126 @@ Y_UNIT_TEST_SUITE(TIndexTabletTest_Data) tablet.DestroyHandle(handle); } + TABLET_TEST(BackgroundOperationsShouldNotGetStuckForeverDuringCompactionMapLoading) + { + const auto block = tabletConfig.BlockSize; + + NProto::TStorageConfig storageConfig; + // hard to test anything apart from Compaction - it shares + // EOperationState with Cleanup and FlushBytes + storageConfig.SetCompactionThreshold(2); + // Flush has a separate EOperationState + storageConfig.SetFlushThreshold(1); + storageConfig.SetLoadedCompactionRangesPerTx(2); + storageConfig.SetWriteBlobThreshold(2 * block); + + TTestEnv env({}, std::move(storageConfig)); + + env.CreateSubDomain("nfs"); + + ui32 nodeIdx = env.CreateNode("nfs"); + ui64 tabletId = env.BootIndexTablet(nodeIdx); + + TIndexTabletClient tablet( + env.GetRuntime(), + nodeIdx, + tabletId, + tabletConfig); + tablet.InitSession("client", "session"); + + auto id = CreateNode(tablet, TCreateNodeArgs::File(RootNodeId, "test")); + auto handle = CreateHandle(tablet, id); + + // generating at least one compaction range + tablet.WriteData(handle, 0, block, 'a'); + + TAutoPtr loadChunk; + ui32 loadChunkCount = 0; + ui32 flushCount = 0; + ui32 compactionCount = 0; + env.GetRuntime().SetEventFilter([&] (auto& runtime, auto& event) { + Y_UNUSED(runtime); + + switch (event->GetTypeRewrite()) { + case TEvIndexTabletPrivate::EvFlushRequest: { + ++flushCount; + break; + } + + case TEvIndexTabletPrivate::EvCompactionRequest: { + ++compactionCount; + break; + } + + case TEvIndexTabletPrivate::EvLoadCompactionMapChunkRequest: { + ++loadChunkCount; + + // catching the second chunk - first one should be loaded + // so that we are able to write (and thus trigger our + // background ops) + if (loadChunkCount == 2) { + loadChunk = event.Release(); + return true; + } + } + } + + return false; + }); + + // rebooting to trigger compaction map reloading + tablet.RebootTablet(); + tablet.RecoverSession(); + + handle = CreateHandle(tablet, id); + + env.GetRuntime().DispatchEvents({}, TDuration::Seconds(1)); + UNIT_ASSERT(loadChunk); + UNIT_ASSERT_VALUES_EQUAL(2, loadChunkCount); + + // this write should succeed - it targets the range that should be + // loaded at this point of time + tablet.SendWriteDataRequest(handle, 0, block, 'a'); + { + auto response = tablet.RecvWriteDataResponse(); + UNIT_ASSERT_VALUES_EQUAL(S_OK, response->GetStatus()); + } + + // Flush should've been triggered and its operation state should've + // been reset to Idle + UNIT_ASSERT_VALUES_EQUAL(1, flushCount); + + { + auto response = tablet.GetStorageStats(); + const auto& stats = response->Record.GetStats(); + UNIT_ASSERT_VALUES_EQUAL( + static_cast(EOperationState::Idle), + static_cast(stats.GetFlushState())); + } + + // this write should succeed - it targets the range that should be + // loaded at this point of time + tablet.SendWriteDataRequest(handle, 0, 2 * block, 'a'); + { + auto response = tablet.RecvWriteDataResponse(); + UNIT_ASSERT_VALUES_EQUAL(S_OK, response->GetStatus()); + } + + // Compaction should've been triggered and its operation state should've + // been reset to Idle + UNIT_ASSERT_VALUES_EQUAL(1, compactionCount); + + { + auto response = tablet.GetStorageStats(); + const auto& stats = response->Record.GetStats(); + UNIT_ASSERT_VALUES_EQUAL( + static_cast(EOperationState::Idle), + static_cast(stats.GetBlobIndexOpState())); + } + + tablet.DestroyHandle(handle); + } + #undef TABLET_TEST } diff --git a/cloud/filestore/private/api/protos/tablet.proto b/cloud/filestore/private/api/protos/tablet.proto index 94e196e571e..1cb002e7150 100644 --- a/cloud/filestore/private/api/protos/tablet.proto +++ b/cloud/filestore/private/api/protos/tablet.proto @@ -125,6 +125,11 @@ message TStorageStats // compaction map range stats repeated TCompactionRangeStats CompactionRangeStats = 3000; + + // background operation states + uint32 FlushState = 4001; + uint32 BlobIndexOpState = 4002; + uint32 CollectGarbageState = 4003; } message TGetStorageStatsRequest