From f5e8ffb4528844ba8983e44c7ab5a7bc8d9c4aa1 Mon Sep 17 00:00:00 2001 From: qkrorlqr Date: Thu, 26 Sep 2024 19:09:06 +0000 Subject: [PATCH] issue-1795: backpressure for LargeDeletionMarkers --- cloud/filestore/config/storage.proto | 4 + .../libs/diagnostics/critical_events.h | 1 + cloud/filestore/libs/storage/core/config.cpp | 11 +- cloud/filestore/libs/storage/core/config.h | 1 + .../libs/storage/tablet/tablet_actor.h | 5 +- .../tablet/tablet_actor_allocatedata.cpp | 6 +- .../storage/tablet/tablet_actor_counters.cpp | 13 ++- .../tablet/tablet_actor_createhandle.cpp | 6 +- .../tablet/tablet_actor_destroyhandle.cpp | 10 +- .../tablet/tablet_actor_destroysession.cpp | 9 +- .../storage/tablet/tablet_actor_loadstate.cpp | 15 ++- .../tablet/tablet_actor_renamenode.cpp | 11 +- .../tablet/tablet_actor_resetsession.cpp | 9 +- .../tablet/tablet_actor_setnodeattr.cpp | 7 +- .../storage/tablet/tablet_actor_truncate.cpp | 12 +- .../tablet/tablet_actor_unlinknode.cpp | 7 +- .../storage/tablet/tablet_actor_zerorange.cpp | 11 +- .../libs/storage/tablet/tablet_database.cpp | 43 +++++++ .../libs/storage/tablet/tablet_database.h | 8 ++ .../storage/tablet/tablet_database_ut.cpp | 35 ++++++ .../libs/storage/tablet/tablet_schema.h | 16 ++- .../libs/storage/tablet/tablet_state.cpp | 12 ++ .../libs/storage/tablet/tablet_state.h | 26 ++++- .../libs/storage/tablet/tablet_state_data.cpp | 37 ++++-- .../libs/storage/tablet/tablet_state_impl.h | 1 + .../storage/tablet/tablet_state_nodes.cpp | 50 +++++--- .../filestore/libs/storage/tablet/tablet_tx.h | 11 +- .../libs/storage/tablet/tablet_ut_data.cpp | 108 ++++++++++++++++++ 28 files changed, 427 insertions(+), 58 deletions(-) diff --git a/cloud/filestore/config/storage.proto b/cloud/filestore/config/storage.proto index e761ecbef03..eb461370300 100644 --- a/cloud/filestore/config/storage.proto +++ b/cloud/filestore/config/storage.proto @@ -420,4 +420,8 @@ message TStorageConfig // settings for ydb config dispatcher service. 
optional NCloud.NProto.TConfigDispatcherSettings ConfigDispatcherSettings = 393; + + // If the number of blocks marked for deletion via large deletion markers + // exceeds this threshold, large truncate-like operations will be rejected. + optional uint64 LargeDeletionMarkersThresholdForBackpressure = 394; } diff --git a/cloud/filestore/libs/diagnostics/critical_events.h b/cloud/filestore/libs/diagnostics/critical_events.h index cbf49802045..cc00fb6706a 100644 --- a/cloud/filestore/libs/diagnostics/critical_events.h +++ b/cloud/filestore/libs/diagnostics/critical_events.h @@ -22,6 +22,7 @@ namespace NCloud::NFileStore{ xxx(AsyncDestroyHandleFailed) \ xxx(DuplicateRequestId) \ xxx(InvalidDupCacheEntry) \ + xxx(GeneratedOrphanNode) \ // FILESTORE_CRITICAL_EVENTS #define FILESTORE_IMPOSSIBLE_EVENTS(xxx) \ diff --git a/cloud/filestore/libs/storage/core/config.cpp b/cloud/filestore/libs/storage/core/config.cpp index 495f8e4a273..567e6ae79d1 100644 --- a/cloud/filestore/libs/storage/core/config.cpp +++ b/cloud/filestore/libs/storage/core/config.cpp @@ -50,11 +50,12 @@ using TAliases = NProto::TStorageConfig::TFilestoreAliases; xxx(MaxBlocksPerTruncateTx, ui32, 0 /*TODO: 32GiB/4KiB*/ )\ xxx(MaxTruncateTxInflight, ui32, 10 )\ \ - xxx(MaxFileBlocks, ui32, 300_GB / 4_KB )\ - xxx(LargeDeletionMarkersEnabled, bool, false )\ - xxx(LargeDeletionMarkerBlocks, ui64, 1_GB / 4_KB )\ - xxx(LargeDeletionMarkersThreshold, ui64, 128_GB / 4_KB )\ - xxx(LargeDeletionMarkersCleanupThreshold, ui64, 1_TB / 4_KB )\ + xxx(MaxFileBlocks, ui32, 300_GB / 4_KB )\ + xxx(LargeDeletionMarkersEnabled, bool, false )\ + xxx(LargeDeletionMarkerBlocks, ui64, 1_GB / 4_KB )\ + xxx(LargeDeletionMarkersThreshold, ui64, 128_GB / 4_KB )\ + xxx(LargeDeletionMarkersCleanupThreshold, ui64, 1_TB / 4_KB )\ + xxx(LargeDeletionMarkersThresholdForBackpressure, ui64, 10_TB / 4_KB )\ \ xxx(CompactionRetryTimeout, TDuration, TDuration::Seconds(1) )\ xxx(BlobIndexOpsPriority, \ diff --git 
a/cloud/filestore/libs/storage/core/config.h b/cloud/filestore/libs/storage/core/config.h index b6404d94e2b..0d788485514 100644 --- a/cloud/filestore/libs/storage/core/config.h +++ b/cloud/filestore/libs/storage/core/config.h @@ -267,6 +267,7 @@ class TStorageConfig ui64 GetLargeDeletionMarkerBlocks() const; ui64 GetLargeDeletionMarkersThreshold() const; ui64 GetLargeDeletionMarkersCleanupThreshold() const; + ui64 GetLargeDeletionMarkersThresholdForBackpressure() const; bool GetMultipleStageRequestThrottlingEnabled() const; diff --git a/cloud/filestore/libs/storage/tablet/tablet_actor.h b/cloud/filestore/libs/storage/tablet/tablet_actor.h index 3686a3de5ce..cfd56ed595c 100644 --- a/cloud/filestore/libs/storage/tablet/tablet_actor.h +++ b/cloud/filestore/libs/storage/tablet/tablet_actor.h @@ -151,6 +151,8 @@ class TIndexTabletActor final std::atomic NodesOpenForReadingBySingleSession{0}; std::atomic NodesOpenForReadingByMultipleSessions{0}; + std::atomic OrphanNodesCount{0}; + NMetrics::TDefaultWindowCalculator MaxUsedQuota{0}; using TLatHistogram = NMetrics::THistogram; @@ -216,7 +218,8 @@ class TIndexTabletActor final const TChannelsStats& channelsStats, const TReadAheadCacheStats& readAheadStats, const TNodeIndexCacheStats& nodeIndexCacheStats, - const TNodeToSessionCounters& nodeToSessionCounters); + const TNodeToSessionCounters& nodeToSessionCounters, + const TMiscNodeStats& miscNodeStats); } Metrics; const IProfileLogPtr ProfileLog; diff --git a/cloud/filestore/libs/storage/tablet/tablet_actor_allocatedata.cpp b/cloud/filestore/libs/storage/tablet/tablet_actor_allocatedata.cpp index 8e398582b10..0762da3c111 100644 --- a/cloud/filestore/libs/storage/tablet/tablet_actor_allocatedata.cpp +++ b/cloud/filestore/libs/storage/tablet/tablet_actor_allocatedata.cpp @@ -199,11 +199,15 @@ void TIndexTabletActor::ExecuteTx_AllocateData( if (args.CommitId == InvalidCommitId) { return RebootTabletOnCommitOverflow(ctx, "AllocateData"); } - ZeroRange( + auto e = ZeroRange( db, 
args.NodeId, args.CommitId, TByteRange(args.Offset, minBorder - args.Offset, GetBlockSize())); + if (HasError(e)) { + args.Error = std::move(e); + return; + } } if (!needExtend) { diff --git a/cloud/filestore/libs/storage/tablet/tablet_actor_counters.cpp b/cloud/filestore/libs/storage/tablet/tablet_actor_counters.cpp index 1ebe52099f3..4db8cc72d1a 100644 --- a/cloud/filestore/libs/storage/tablet/tablet_actor_counters.cpp +++ b/cloud/filestore/libs/storage/tablet/tablet_actor_counters.cpp @@ -322,6 +322,8 @@ void TIndexTabletActor::TMetrics::Register( NodesOpenForReadingByMultipleSessions, EMetricType::MT_ABSOLUTE); + REGISTER_AGGREGATABLE_SUM(OrphanNodesCount, EMetricType::MT_ABSOLUTE); + // Throttling REGISTER_LOCAL(MaxReadBandwidth, EMetricType::MT_ABSOLUTE); REGISTER_LOCAL(MaxWriteBandwidth, EMetricType::MT_ABSOLUTE); @@ -400,7 +402,8 @@ void TIndexTabletActor::TMetrics::Update( const TChannelsStats& channelsStats, const TReadAheadCacheStats& readAheadStats, const TNodeIndexCacheStats& nodeIndexCacheStats, - const TNodeToSessionCounters& nodeToSessionCounters) + const TNodeToSessionCounters& nodeToSessionCounters, + const TMiscNodeStats& miscNodeStats) { const ui32 blockSize = fileSystem.GetBlockSize(); @@ -472,6 +475,8 @@ void TIndexTabletActor::TMetrics::Update( NodesOpenForReadingByMultipleSessions, nodeToSessionCounters.NodesOpenForReadingByMultipleSessions); + Store(OrphanNodesCount, miscNodeStats.OrphanNodesCount); + BusyIdleCalc.OnUpdateStats(); } @@ -520,7 +525,8 @@ void TIndexTabletActor::RegisterStatCounters() CalculateChannelsStats(), CalculateReadAheadCacheStats(), CalculateNodeIndexCacheStats(), - GetNodeToSessionCounters()); + GetNodeToSessionCounters(), + GetMiscNodeStats()); Metrics.Register(fsId, storageMediaKind); } @@ -566,7 +572,8 @@ void TIndexTabletActor::HandleUpdateCounters( CalculateChannelsStats(), CalculateReadAheadCacheStats(), CalculateNodeIndexCacheStats(), - GetNodeToSessionCounters()); + GetNodeToSessionCounters(), + 
GetMiscNodeStats()); SendMetricsToExecutor(ctx); UpdateCountersScheduled = false; diff --git a/cloud/filestore/libs/storage/tablet/tablet_actor_createhandle.cpp b/cloud/filestore/libs/storage/tablet/tablet_actor_createhandle.cpp index 12652022067..ef74c8a47b4 100644 --- a/cloud/filestore/libs/storage/tablet/tablet_actor_createhandle.cpp +++ b/cloud/filestore/libs/storage/tablet/tablet_actor_createhandle.cpp @@ -309,12 +309,16 @@ void TIndexTabletActor::ExecuteTx_CreateHandle( } else if (args.FollowerId.Empty() && HasFlag(args.Flags, NProto::TCreateHandleRequest::E_TRUNCATE)) { - Truncate( + auto e = Truncate( db, args.TargetNodeId, args.WriteCommitId, args.TargetNode->Attrs.GetSize(), 0); + if (HasError(e)) { + args.Error = std::move(e); + return; + } auto attrs = CopyAttrs(args.TargetNode->Attrs, E_CM_CMTIME); attrs.SetSize(0); diff --git a/cloud/filestore/libs/storage/tablet/tablet_actor_destroyhandle.cpp b/cloud/filestore/libs/storage/tablet/tablet_actor_destroyhandle.cpp index e88ea160713..a4395fca245 100644 --- a/cloud/filestore/libs/storage/tablet/tablet_actor_destroyhandle.cpp +++ b/cloud/filestore/libs/storage/tablet/tablet_actor_destroyhandle.cpp @@ -104,11 +104,19 @@ void TIndexTabletActor::ExecuteTx_DestroyHandle( if (args.Node->Attrs.GetLinks() == 0 && !HasOpenHandles(args.Node->NodeId)) { - RemoveNode( + auto e = RemoveNode( db, *args.Node, args.Node->MinCommitId, commitId); + + if (HasError(e)) { + WriteOrphanNode(db, TStringBuilder() + << "DestroyHandle: " << args.SessionId + << ", Handle: " << args.Request.GetHandle() + << ", RemoveNode: " << args.Node->NodeId + << ", Error: " << FormatError(e), args.Node->NodeId); + } } EnqueueTruncateIfNeeded(ctx); diff --git a/cloud/filestore/libs/storage/tablet/tablet_actor_destroysession.cpp b/cloud/filestore/libs/storage/tablet/tablet_actor_destroysession.cpp index c79608df8a2..1e46210ead7 100644 --- a/cloud/filestore/libs/storage/tablet/tablet_actor_destroysession.cpp +++ 
b/cloud/filestore/libs/storage/tablet/tablet_actor_destroysession.cpp @@ -308,11 +308,18 @@ void TIndexTabletActor::ExecuteTx_DestroySession( auto it = args.Nodes.find(nodeId); if (it != args.Nodes.end() && !HasOpenHandles(nodeId)) { - RemoveNode( + auto e = RemoveNode( db, *it, it->MinCommitId, commitId); + + if (HasError(e)) { + WriteOrphanNode(db, TStringBuilder() + << "DestroySession: " << args.SessionId + << ", RemoveNode: " << nodeId + << ", Error: " << FormatError(e), nodeId); + } } } diff --git a/cloud/filestore/libs/storage/tablet/tablet_actor_loadstate.cpp b/cloud/filestore/libs/storage/tablet/tablet_actor_loadstate.cpp index 576b5c1e6b1..4c53ec35f09 100644 --- a/cloud/filestore/libs/storage/tablet/tablet_actor_loadstate.cpp +++ b/cloud/filestore/libs/storage/tablet/tablet_actor_loadstate.cpp @@ -112,6 +112,7 @@ bool TIndexTabletActor::PrepareTx_LoadState( db.ReadSessionHistoryEntries(args.SessionHistory), db.ReadOpLog(args.OpLog), db.ReadLargeDeletionMarkers(args.LargeDeletionMarkers), + db.ReadOrphanNodes(args.OrphanNodeIds), }; bool ready = std::accumulate( @@ -231,9 +232,16 @@ void TIndexTabletActor::CompleteTx_LoadState( LOG_INFO_S(ctx, TFileStoreComponents::TABLET, LogTag << " Initializing tablet state"); - LOG_INFO_S(ctx, TFileStoreComponents::TABLET, - LogTag << " Read " << args.LargeDeletionMarkers.size() - << " large deletion markers"); + if (args.LargeDeletionMarkers) { + LOG_INFO_S(ctx, TFileStoreComponents::TABLET, + LogTag << " Read " << args.LargeDeletionMarkers.size() + << " large deletion markers"); + } + if (args.OrphanNodeIds) { + LOG_INFO_S(ctx, TFileStoreComponents::TABLET, + LogTag << " Read " << args.OrphanNodeIds.size() + << " orphan nodes"); + } LoadState( Executor()->Generation(), @@ -242,6 +250,7 @@ void TIndexTabletActor::CompleteTx_LoadState( args.FileSystemStats, args.TabletStorageInfo, args.LargeDeletionMarkers, + args.OrphanNodeIds, config); UpdateLogTag(); diff --git 
a/cloud/filestore/libs/storage/tablet/tablet_actor_renamenode.cpp b/cloud/filestore/libs/storage/tablet/tablet_actor_renamenode.cpp index b4a16465a75..451adf5f318 100644 --- a/cloud/filestore/libs/storage/tablet/tablet_actor_renamenode.cpp +++ b/cloud/filestore/libs/storage/tablet/tablet_actor_renamenode.cpp @@ -322,13 +322,22 @@ void TIndexTabletActor::ExecuteTx_RenameNode( } // remove target ref and unlink target node - UnlinkNode( + auto e = UnlinkNode( db, args.NewParentNode->NodeId, args.NewName, *args.NewChildNode, args.NewChildRef->MinCommitId, args.CommitId); + + if (HasError(e)) { + const auto nodeId = args.NewChildNode->NodeId; + WriteOrphanNode(db, TStringBuilder() + << "RenameNode: " << args.SessionId + << ", ParentNodeId: " << args.NewParentNode->NodeId + << ", NodeId: " << nodeId + << ", Error: " << FormatError(e), nodeId); + } } else { // remove target ref UnlinkExternalNode( diff --git a/cloud/filestore/libs/storage/tablet/tablet_actor_resetsession.cpp b/cloud/filestore/libs/storage/tablet/tablet_actor_resetsession.cpp index 4445cde0808..21d6cf98c4d 100644 --- a/cloud/filestore/libs/storage/tablet/tablet_actor_resetsession.cpp +++ b/cloud/filestore/libs/storage/tablet/tablet_actor_resetsession.cpp @@ -142,11 +142,18 @@ void TIndexTabletActor::ExecuteTx_ResetSession( nodeId, it->Attrs.GetSize()); - RemoveNode( + auto e = RemoveNode( db, *it, it->MinCommitId, commitId); + + if (HasError(e)) { + WriteOrphanNode(db, TStringBuilder() + << "ResetSession: " << args.SessionId + << ", RemoveNode: " << nodeId + << ", Error: " << FormatError(e), nodeId); + } } } diff --git a/cloud/filestore/libs/storage/tablet/tablet_actor_setnodeattr.cpp b/cloud/filestore/libs/storage/tablet/tablet_actor_setnodeattr.cpp index d1b49ddbfe5..bc7db996d84 100644 --- a/cloud/filestore/libs/storage/tablet/tablet_actor_setnodeattr.cpp +++ b/cloud/filestore/libs/storage/tablet/tablet_actor_setnodeattr.cpp @@ -167,13 +167,18 @@ void TIndexTabletActor::ExecuteTx_SetNodeAttr( 
attrs.SetCTime(update.GetCTime()); } if (HasFlag(flags, NProto::TSetNodeAttrRequest::F_SET_ATTR_SIZE)) { - Truncate( + auto e = Truncate( db, args.NodeId, args.CommitId, attrs.GetSize(), update.GetSize()); + if (HasError(e)) { + args.Error = e; + return; + } + attrs.SetSize(update.GetSize()); } diff --git a/cloud/filestore/libs/storage/tablet/tablet_actor_truncate.cpp b/cloud/filestore/libs/storage/tablet/tablet_actor_truncate.cpp index 7a2f116cb75..c97e78dcbec 100644 --- a/cloud/filestore/libs/storage/tablet/tablet_actor_truncate.cpp +++ b/cloud/filestore/libs/storage/tablet/tablet_actor_truncate.cpp @@ -316,13 +316,17 @@ void TIndexTabletActor::ExecuteTx_TruncateRange( return RebootTabletOnCommitOverflow(ctx, "TruncateRange"); } + auto e = TruncateRange(db, args.NodeId, commitId, args.Range); + if (HasError(e)) { + args.Error = std::move(e); + return; + } + AddRange( args.NodeId, args.Range.Offset, args.Range.Length, args.ProfileLogRequest); - - TruncateRange(db, args.NodeId, commitId, args.Range); } void TIndexTabletActor::CompleteTx_TruncateRange( @@ -343,7 +347,9 @@ void TIndexTabletActor::CompleteTx_TruncateRange( args.NodeId, args.Range.Describe().c_str()); - auto response = std::make_unique(); + auto response = + std::make_unique( + std::move(args.Error)); NCloud::Reply(ctx, *args.RequestInfo, std::move(response)); EnqueueCollectGarbageIfNeeded(ctx); diff --git a/cloud/filestore/libs/storage/tablet/tablet_actor_unlinknode.cpp b/cloud/filestore/libs/storage/tablet/tablet_actor_unlinknode.cpp index a03677707ab..27ac03b21e5 100644 --- a/cloud/filestore/libs/storage/tablet/tablet_actor_unlinknode.cpp +++ b/cloud/filestore/libs/storage/tablet/tablet_actor_unlinknode.cpp @@ -383,13 +383,18 @@ void TIndexTabletActor::ExecuteTx_UnlinkNode( db.WriteOpLogEntry(args.OpLogEntry); } else { - UnlinkNode( + auto e = UnlinkNode( db, args.ParentNodeId, args.Name, *args.ChildNode, args.ChildRef->MinCommitId, args.CommitId); + + if (HasError(e)) { + args.Error = std::move(e); 
+ return; + } } auto* session = FindSession(args.SessionId); diff --git a/cloud/filestore/libs/storage/tablet/tablet_actor_zerorange.cpp b/cloud/filestore/libs/storage/tablet/tablet_actor_zerorange.cpp index f0a6baf1c87..f296b96ff6a 100644 --- a/cloud/filestore/libs/storage/tablet/tablet_actor_zerorange.cpp +++ b/cloud/filestore/libs/storage/tablet/tablet_actor_zerorange.cpp @@ -84,7 +84,7 @@ void TIndexTabletActor::ExecuteTx_ZeroRange( args.Range.Length, args.ProfileLogRequest); - ZeroRange(db, args.NodeId, commitId, args.Range); + args.Error = ZeroRange(db, args.NodeId, commitId, args.Range); } void TIndexTabletActor::CompleteTx_ZeroRange( @@ -100,12 +100,15 @@ void TIndexTabletActor::CompleteTx_ZeroRange( ProfileLog); LOG_DEBUG(ctx, TFileStoreComponents::TABLET, - "%s ZeroRange %lu %s completed", + "%s ZeroRange %lu %s completed: %s", LogTag.c_str(), args.NodeId, - args.Range.Describe().c_str()); + args.Range.Describe().c_str(), + FormatError(args.Error).Quote().c_str()); - auto response = std::make_unique(); + auto response = + std::make_unique( + std::move(args.Error)); NCloud::Reply(ctx, *args.RequestInfo, std::move(response)); EnqueueCollectGarbageIfNeeded(ctx); diff --git a/cloud/filestore/libs/storage/tablet/tablet_database.cpp b/cloud/filestore/libs/storage/tablet/tablet_database.cpp index 27bcece8ec2..cc5f9318869 100644 --- a/cloud/filestore/libs/storage/tablet/tablet_database.cpp +++ b/cloud/filestore/libs/storage/tablet/tablet_database.cpp @@ -1404,6 +1404,49 @@ bool TIndexTabletDatabase::ReadLargeDeletionMarkers( return true; } +//////////////////////////////////////////////////////////////////////////////// +// OrphanNodes + +void TIndexTabletDatabase::WriteOrphanNode(ui64 nodeId) +{ + using TTable = TIndexTabletSchema::OrphanNodes; + + Table() + .Key(nodeId) + .Update(); +} + +void TIndexTabletDatabase::DeleteOrphanNode(ui64 nodeId) +{ + using TTable = TIndexTabletSchema::OrphanNodes; + + Table() + .Key(nodeId) + .Delete(); +} + +bool 
TIndexTabletDatabase::ReadOrphanNodes(TVector& nodeIds) +{ + using TTable = TIndexTabletSchema::OrphanNodes; + + auto it = Table() + .Select(); + + if (!it.IsReady()) { + return false; // not ready + } + + while (it.IsValid()) { + nodeIds.emplace_back(it.GetValue()); + + if (!it.Next()) { + return false; // not ready + } + } + + return true; +} + //////////////////////////////////////////////////////////////////////////////// // NewBlobs diff --git a/cloud/filestore/libs/storage/tablet/tablet_database.h b/cloud/filestore/libs/storage/tablet/tablet_database.h index edd3f95dd8a..af7a9ae5cb7 100644 --- a/cloud/filestore/libs/storage/tablet/tablet_database.h +++ b/cloud/filestore/libs/storage/tablet/tablet_database.h @@ -427,6 +427,14 @@ FILESTORE_FILESYSTEM_STATS(FILESTORE_DECLARE_STATS) bool ReadLargeDeletionMarkers(TVector& deletionMarkers); + // + // OrphanNodes + // + + void WriteOrphanNode(ui64 nodeId); + void DeleteOrphanNode(ui64 nodeId); + bool ReadOrphanNodes(TVector& nodeIds); + // // NewBlobs // diff --git a/cloud/filestore/libs/storage/tablet/tablet_database_ut.cpp b/cloud/filestore/libs/storage/tablet/tablet_database_ut.cpp index 66f064c7047..5020a24f845 100644 --- a/cloud/filestore/libs/storage/tablet/tablet_database_ut.cpp +++ b/cloud/filestore/libs/storage/tablet/tablet_database_ut.cpp @@ -511,6 +511,41 @@ Y_UNIT_TEST_SUITE(TIndexTabletDatabaseTest) UNIT_ASSERT_VALUES_EQUAL(toString(entries), toString(markers)); }); } + + Y_UNIT_TEST(ShouldStoreOrphanNodes) + { + TTestExecutor executor; + executor.WriteTx([&] (TIndexTabletDatabase db) { + db.InitSchema(false); + }); + + executor.WriteTx([&] (TIndexTabletDatabase db) { + db.WriteOrphanNode(111); + db.WriteOrphanNode(222); + db.WriteOrphanNode(333); + }); + + executor.ReadTx([&] (TIndexTabletDatabase db) { + TVector nodeIds; + UNIT_ASSERT(db.ReadOrphanNodes(nodeIds)); + UNIT_ASSERT_VALUES_EQUAL(3, nodeIds.size()); + UNIT_ASSERT_VALUES_EQUAL(111, nodeIds[0]); + UNIT_ASSERT_VALUES_EQUAL(222, nodeIds[1]); + 
UNIT_ASSERT_VALUES_EQUAL(333, nodeIds[2]); + }); + + executor.WriteTx([&] (TIndexTabletDatabase db) { + db.DeleteOrphanNode(222); + }); + + executor.ReadTx([&] (TIndexTabletDatabase db) { + TVector nodeIds; + UNIT_ASSERT(db.ReadOrphanNodes(nodeIds)); + UNIT_ASSERT_VALUES_EQUAL(2, nodeIds.size()); + UNIT_ASSERT_VALUES_EQUAL(111, nodeIds[0]); + UNIT_ASSERT_VALUES_EQUAL(333, nodeIds[1]); + }); + } } } // namespace NCloud::NFileStore::NStorage diff --git a/cloud/filestore/libs/storage/tablet/tablet_schema.h b/cloud/filestore/libs/storage/tablet/tablet_schema.h index 74a67747f99..a0fb668dad3 100644 --- a/cloud/filestore/libs/storage/tablet/tablet_schema.h +++ b/cloud/filestore/libs/storage/tablet/tablet_schema.h @@ -533,6 +533,19 @@ struct TIndexTabletSchema using CompactionPolicy = TCompactionPolicy; }; + struct OrphanNodes: TTableSchema<27> + { + struct NodeId : Column<1, NKikimr::NScheme::NTypeIds::Uint64> {}; + + using TKey = TableKey; + + using TColumns = TableColumns< + NodeId + >; + + using StoragePolicy = TStoragePolicy; + }; + using TTables = SchemaTables< FileSystem, Sessions, @@ -559,7 +572,8 @@ struct TIndexTabletSchema TruncateQueue, SessionHistory, OpLog, - LargeDeletionMarkers + LargeDeletionMarkers, + OrphanNodes >; using TSettings = SchemaSettings< diff --git a/cloud/filestore/libs/storage/tablet/tablet_state.cpp b/cloud/filestore/libs/storage/tablet/tablet_state.cpp index bda81044bcf..bedcc447452 100644 --- a/cloud/filestore/libs/storage/tablet/tablet_state.cpp +++ b/cloud/filestore/libs/storage/tablet/tablet_state.cpp @@ -83,6 +83,7 @@ void TIndexTabletState::LoadState( const NProto::TFileSystemStats& fileSystemStats, const NCloud::NProto::TTabletStorageInfo& tabletStorageInfo, const TVector& largeDeletionMarkers, + const TVector& orphanNodeIds, const TThrottlerConfig& throttlerConfig) { Generation = generation; @@ -102,6 +103,8 @@ void TIndexTabletState::LoadState( LargeDeletionMarkersThreshold = config.GetLargeDeletionMarkersThreshold(); 
LargeDeletionMarkersCleanupThreshold = config.GetLargeDeletionMarkersCleanupThreshold(); + LargeDeletionMarkersThresholdForBackpressure = + config.GetLargeDeletionMarkersThresholdForBackpressure(); FileSystem.CopyFrom(fileSystem); FileSystemStats.CopyFrom(fileSystemStats); @@ -130,6 +133,8 @@ void TIndexTabletState::LoadState( for (const auto& deletionMarker: largeDeletionMarkers) { Impl->LargeBlocks.AddDeletionMarker(deletionMarker); } + + Impl->OrphanNodeIds.insert(orphanNodeIds.begin(), orphanNodeIds.end()); } void TIndexTabletState::UpdateConfig( @@ -170,4 +175,11 @@ void TIndexTabletState::DumpStats(IOutputStream& os) const ); } +TMiscNodeStats TIndexTabletState::GetMiscNodeStats() const +{ + return { + .OrphanNodesCount = static_cast(Impl->OrphanNodeIds.size()), + }; +} + } // namespace NCloud::NFileStore::NStorage diff --git a/cloud/filestore/libs/storage/tablet/tablet_state.h b/cloud/filestore/libs/storage/tablet/tablet_state.h index 3ec6e556176..f4ee8a954e6 100644 --- a/cloud/filestore/libs/storage/tablet/tablet_state.h +++ b/cloud/filestore/libs/storage/tablet/tablet_state.h @@ -143,6 +143,11 @@ struct TNodeToSessionCounters i64 NodesOpenForReadingByMultipleSessions{0}; }; +struct TMiscNodeStats +{ + i64 OrphanNodesCount{0}; +}; + //////////////////////////////////////////////////////////////////////////////// class TIndexTabletState @@ -171,6 +176,7 @@ class TIndexTabletState /*const*/ ui64 LargeDeletionMarkerBlocks = 0; /*const*/ ui64 LargeDeletionMarkersThreshold = 0; /*const*/ ui64 LargeDeletionMarkersCleanupThreshold = 0; + /*const*/ ui64 LargeDeletionMarkersThresholdForBackpressure = 0; bool StateLoaded = false; @@ -190,6 +196,7 @@ class TIndexTabletState const NProto::TFileSystemStats& fileSystemStats, const NCloud::NProto::TTabletStorageInfo& tabletStorageInfo, const TVector& largeDeletionMarkers, + const TVector& orphanNodeIds, const TThrottlerConfig& throttlerConfig); bool IsStateLoaded() const @@ -275,6 +282,8 @@ class TIndexTabletState return 
NodeToSessionCounters; } + TMiscNodeStats GetMiscNodeStats() const; + const NProto::TFileStorePerformanceProfile& GetPerformanceProfile() const; const TFileStoreAllocRegistry& GetFileStoreProfilingRegistry() const @@ -380,13 +389,13 @@ FILESTORE_FILESYSTEM_STATS(FILESTORE_DECLARE_COUNTER) const NProto::TNode& attrs, const NProto::TNode& prevAttrs); - void RemoveNode( + [[nodiscard]] NProto::TError RemoveNode( TIndexTabletDatabase& db, const IIndexTabletDatabase::TNode& node, ui64 minCommitId, ui64 maxCommitId); - void UnlinkNode( + [[nodiscard]] NProto::TError UnlinkNode( TIndexTabletDatabase& db, ui64 parentNodeId, const TString& name, @@ -423,6 +432,11 @@ FILESTORE_FILESYSTEM_STATS(FILESTORE_DECLARE_COUNTER) bool HasBlocksLeft( ui32 blocks) const; + void WriteOrphanNode( + TIndexTabletDatabase& db, + const TString& message, + ui64 nodeId); + private: void UpdateUsedBlocksCount( TIndexTabletDatabase& db, @@ -1197,7 +1211,7 @@ FILESTORE_DUPCACHE_REQUESTS(FILESTORE_DECLARE_DUPCACHE) void AddTruncate(TIndexTabletDatabase& db, ui64 nodeId, TByteRange range); void DeleteTruncate(TIndexTabletDatabase& db, ui64 nodeId); - void Truncate( + [[nodiscard]] NProto::TError Truncate( TIndexTabletDatabase& db, ui64 nodeId, ui64 commitId, @@ -1210,7 +1224,7 @@ FILESTORE_DUPCACHE_REQUESTS(FILESTORE_DECLARE_DUPCACHE) // - aligns up range in the tail; // - deletes all blocks in NEW range; // - writes fresh bytes (zeroes) on unaligned head, if range.Offset != 0. - void TruncateRange( + [[nodiscard]] NProto::TError TruncateRange( TIndexTabletDatabase& db, ui64 nodeId, ui64 commitId, @@ -1220,14 +1234,14 @@ FILESTORE_DUPCACHE_REQUESTS(FILESTORE_DECLARE_DUPCACHE) // resizing the node. This function: // - writes fresh bytes (zeroes) on unaligned head, if any; // - writes fresh bytes (zeroes) on unaligned tail, if any. 
- void ZeroRange( + [[nodiscard]] NProto::TError ZeroRange( TIndexTabletDatabase& db, ui64 nodeId, ui64 commitId, TByteRange range); private: - void DeleteRange( + [[nodiscard]] NProto::TError DeleteRange( TIndexTabletDatabase& db, ui64 nodeId, ui64 commitId, diff --git a/cloud/filestore/libs/storage/tablet/tablet_state_data.cpp b/cloud/filestore/libs/storage/tablet/tablet_state_data.cpp index 1f6f3dd48c0..d8acdc4d9de 100644 --- a/cloud/filestore/libs/storage/tablet/tablet_state_data.cpp +++ b/cloud/filestore/libs/storage/tablet/tablet_state_data.cpp @@ -74,7 +74,7 @@ bool TIndexTabletState::GenerateBlobId( return true; } -void TIndexTabletState::Truncate( +NProto::TError TIndexTabletState::Truncate( TIndexTabletDatabase& db, ui64 nodeId, ui64 commitId, @@ -82,20 +82,20 @@ void TIndexTabletState::Truncate( ui64 targetSize) { if (currentSize <= targetSize) { - return; + return {}; } TByteRange range(targetSize, currentSize - targetSize, GetBlockSize()); if (TruncateBlocksThreshold && range.BlockCount() > TruncateBlocksThreshold) { EnqueueTruncateOp(nodeId, range); - return; + return {}; } - TruncateRange(db, nodeId, commitId, range); + return TruncateRange(db, nodeId, commitId, range); } -void TIndexTabletState::TruncateRange( +NProto::TError TIndexTabletState::TruncateRange( TIndexTabletDatabase& db, ui64 nodeId, ui64 commitId, @@ -106,7 +106,10 @@ void TIndexTabletState::TruncateRange( AlignUp(range.End(), range.BlockSize) - range.Offset, range.BlockSize); - DeleteRange(db, nodeId, commitId, tailAlignedRange); + auto e = DeleteRange(db, nodeId, commitId, tailAlignedRange); + if (HasError(e)) { + return e; + } const TByteRange headBound( range.Offset, @@ -123,15 +126,20 @@ void TIndexTabletState::TruncateRange( } InvalidateReadAheadCache(nodeId); + + return {}; } -void TIndexTabletState::ZeroRange( +NProto::TError TIndexTabletState::ZeroRange( TIndexTabletDatabase& db, ui64 nodeId, ui64 commitId, TByteRange range) { - DeleteRange(db, nodeId, commitId, range); + 
auto e = DeleteRange(db, nodeId, commitId, range); + if (HasError(e)) { + return e; + } const TByteRange headBound( range.Offset, @@ -160,9 +168,11 @@ void TIndexTabletState::ZeroRange( // FIXME: do not allocate each time TString(tailBound.Length, 0)); } + + return {}; } -void TIndexTabletState::DeleteRange( +NProto::TError TIndexTabletState::DeleteRange( TIndexTabletDatabase& db, ui64 nodeId, ui64 commitId, @@ -180,6 +190,13 @@ void TIndexTabletState::DeleteRange( const bool useLargeDeletionMarkers = LargeDeletionMarkersEnabled && deletedBlockCount >= LargeDeletionMarkersThreshold; if (useLargeDeletionMarkers) { + const auto t = LargeDeletionMarkersThresholdForBackpressure; + if (GetLargeDeletionMarkersCount() >= t) { + return MakeError(E_REJECTED, TStringBuilder() + << "too many large deletion markers: " + << GetLargeDeletionMarkersCount() << " >= " << t); + } + SplitRange( range.FirstAlignedBlock(), deletedBlockCount, @@ -221,6 +238,8 @@ void TIndexTabletState::DeleteRange( commitId, range.Offset, range.Length); + + return {}; } void TIndexTabletState::EnqueueTruncateOp(ui64 nodeId, TByteRange range) diff --git a/cloud/filestore/libs/storage/tablet/tablet_state_impl.h b/cloud/filestore/libs/storage/tablet/tablet_state_impl.h index bfdf07c22ab..e4847015859 100644 --- a/cloud/filestore/libs/storage/tablet/tablet_state_impl.h +++ b/cloud/filestore/libs/storage/tablet/tablet_state_impl.h @@ -59,6 +59,7 @@ struct TIndexTabletState::TImpl TReadAheadCache ReadAheadCache; TNodeIndexCache NodeIndexCache; TInMemoryIndexState InMemoryIndexState; + TSet OrphanNodeIds; TCheckpointStore Checkpoints; TChannels Channels; diff --git a/cloud/filestore/libs/storage/tablet/tablet_state_nodes.cpp b/cloud/filestore/libs/storage/tablet/tablet_state_nodes.cpp index 331d739b53e..10bc8b89276 100644 --- a/cloud/filestore/libs/storage/tablet/tablet_state_nodes.cpp +++ b/cloud/filestore/libs/storage/tablet/tablet_state_nodes.cpp @@ -98,12 +98,27 @@ void TIndexTabletState::UpdateNode( 
InvalidateNodeIndexCache(nodeId); } -void TIndexTabletState::RemoveNode( +NProto::TError TIndexTabletState::RemoveNode( TIndexTabletDatabase& db, const IIndexTabletDatabase::TNode& node, ui64 minCommitId, ui64 maxCommitId) { + // SymLinks have size (equal to TargetPath) but store no real data so there + // is no need to write deletion markers upon SymLink removal + if (!node.Attrs.GetSymLink()) { + auto e = Truncate( + db, + node.NodeId, + maxCommitId, + node.Attrs.GetSize(), + 0); + + if (HasError(e)) { + return e; + } + } + db.DeleteNode(node.NodeId); DecrementUsedNodesCount(db); @@ -116,21 +131,12 @@ void TIndexTabletState::RemoveNode( AddCheckpointNode(db, checkpointId, node.NodeId); } - // SymLinks have size (equal to TargetPath) but store no real data so there - // is no need to write deletion markers upon SymLink removal - if (!node.Attrs.GetSymLink()) { - Truncate( - db, - node.NodeId, - maxCommitId, - node.Attrs.GetSize(), - 0); - } - InvalidateNodeIndexCache(node.NodeId); + + return {}; } -void TIndexTabletState::UnlinkNode( +NProto::TError TIndexTabletState::UnlinkNode( TIndexTabletDatabase& db, ui64 parentNodeId, const TString& name, @@ -148,11 +154,15 @@ void TIndexTabletState::UnlinkNode( attrs, node.Attrs); } else { - RemoveNode( + auto e = RemoveNode( db, node, minCommitId, maxCommitId); + + if (HasError(e)) { + return e; + } } RemoveNodeRef( @@ -165,6 +175,8 @@ void TIndexTabletState::UnlinkNode( "", // followerId "" // followerName ); + + return {}; } void TIndexTabletState::UnlinkExternalNode( @@ -232,6 +244,16 @@ void TIndexTabletState::RewriteNode( InvalidateNodeIndexCache(nodeId); } +void TIndexTabletState::WriteOrphanNode( + TIndexTabletDatabase& db, + const TString& message, + ui64 nodeId) +{ + ReportGeneratedOrphanNode(message); + db.WriteOrphanNode(nodeId); + Impl->OrphanNodeIds.insert(nodeId); +} + //////////////////////////////////////////////////////////////////////////////// // NodeAttrs diff --git 
a/cloud/filestore/libs/storage/tablet/tablet_tx.h b/cloud/filestore/libs/storage/tablet/tablet_tx.h index 240cd262f64..ef24699378b 100644 --- a/cloud/filestore/libs/storage/tablet/tablet_tx.h +++ b/cloud/filestore/libs/storage/tablet/tablet_tx.h @@ -170,7 +170,8 @@ struct TIndexStateNodeUpdates TVector NodeUpdates; }; -struct TProfileAware { +struct TProfileAware +{ NProto::TProfileLogRequestInfo ProfileLogRequest; explicit TProfileAware(EFileStoreSystemRequest requestType) noexcept @@ -308,6 +309,7 @@ struct TTxIndexTablet TVector SessionHistory; TVector OpLog; TVector LargeDeletionMarkers; + TVector OrphanNodeIds; NProto::TError Error; @@ -331,6 +333,7 @@ struct TTxIndexTablet SessionHistory.clear(); OpLog.clear(); LargeDeletionMarkers.clear(); + OrphanNodeIds.clear(); } }; @@ -1778,6 +1781,8 @@ struct TTxIndexTablet const ui64 NodeId; const TByteRange Range; + NProto::TError Error; + TTruncateRange( TRequestInfoPtr requestInfo, ui64 nodeId, @@ -1791,6 +1796,7 @@ struct TTxIndexTablet void Clear() { TProfileAware::Clear(); + Error.Clear(); } }; @@ -1826,6 +1832,8 @@ struct TTxIndexTablet const ui64 NodeId; const TByteRange Range; + NProto::TError Error; + TZeroRange( TRequestInfoPtr requestInfo, ui64 nodeId, @@ -1839,6 +1847,7 @@ struct TTxIndexTablet void Clear() { TProfileAware::Clear(); + Error.Clear(); } }; diff --git a/cloud/filestore/libs/storage/tablet/tablet_ut_data.cpp b/cloud/filestore/libs/storage/tablet/tablet_ut_data.cpp index 490a7dc6eae..6c8682b89fa 100644 --- a/cloud/filestore/libs/storage/tablet/tablet_ut_data.cpp +++ b/cloud/filestore/libs/storage/tablet/tablet_ut_data.cpp @@ -6086,6 +6086,8 @@ Y_UNIT_TEST_SUITE(TIndexTabletTest_Data) storageConfig.SetLargeDeletionMarkerBlocks(1_GB / block); storageConfig.SetLargeDeletionMarkersThreshold(128_GB / block); storageConfig.SetLargeDeletionMarkersCleanupThreshold(3_TB / block); + storageConfig.SetLargeDeletionMarkersThresholdForBackpressure( + 10_TB / block); const auto blobSize = 2 * block; 
storageConfig.SetWriteBlobThreshold(blobSize); @@ -6217,6 +6219,112 @@ Y_UNIT_TEST_SUITE(TIndexTabletTest_Data) } } + TABLET_TEST(ShouldRejectLargeFileTruncationIfLargeDeletionMarkerCountIsTooHigh) + { + const auto block = tabletConfig.BlockSize; + + NProto::TStorageConfig storageConfig; + storageConfig.SetMaxFileBlocks(5_TB / block); + storageConfig.SetLargeDeletionMarkersEnabled(true); + storageConfig.SetLargeDeletionMarkerBlocks(1_GB / block); + storageConfig.SetLargeDeletionMarkersThreshold(128_GB / block); + storageConfig.SetLargeDeletionMarkersCleanupThreshold(20_TB / block); + storageConfig.SetLargeDeletionMarkersThresholdForBackpressure( + 4_TB / block); + const auto blobSize = 2 * block; + storageConfig.SetWriteBlobThreshold(blobSize); + + TTestEnv env({}, storageConfig); + env.CreateSubDomain("nfs"); + + ui32 nodeIdx = env.CreateNode("nfs"); + ui64 tabletId = env.BootIndexTablet(nodeIdx); + + tabletConfig.BlockCount = 10_TB / block; + + TIndexTabletClient tablet( + env.GetRuntime(), + nodeIdx, + tabletId, + tabletConfig); + tablet.InitSession("client", "session"); + + auto id1 = + CreateNode(tablet, TCreateNodeArgs::File(RootNodeId, "test1")); + auto id2 = + CreateNode(tablet, TCreateNodeArgs::File(RootNodeId, "test2")); + CreateNode(tablet, TCreateNodeArgs::File(RootNodeId, "test3")); + + for (const auto id: {id1, id2}) { + TSetNodeAttrArgs args(id); + args.SetFlag(NProto::TSetNodeAttrRequest::F_SET_ATTR_SIZE); + args.SetSize(5_TB); + tablet.SetNodeAttr(args); + UNIT_ASSERT_VALUES_EQUAL(5_TB, GetNodeAttrs(tablet, id).GetSize()); + } + + { + tablet.SendUnlinkNodeRequest(RootNodeId, "test1", false); + auto response = tablet.RecvUnlinkNodeResponse(); + UNIT_ASSERT_VALUES_EQUAL_C( + S_OK, + response->GetStatus(), + response->GetErrorReason()); + } + + { + tablet.SendUnlinkNodeRequest(RootNodeId, "test2", false); + auto response = tablet.RecvUnlinkNodeResponse(); + UNIT_ASSERT_VALUES_EQUAL_C( + E_REJECTED, + response->GetStatus(), + 
response->GetErrorReason()); + } + + { + auto response = tablet.GetStorageStats(); + const auto& stats = response->Record.GetStats(); + UNIT_ASSERT_VALUES_EQUAL( + 5_TB / block, + stats.GetLargeDeletionMarkersCount()); + UNIT_ASSERT_VALUES_EQUAL(2, stats.GetUsedNodesCount()); + } + + tablet.AdvanceTime(TDuration::Seconds(15)); + env.GetRuntime().DispatchEvents({}, TDuration::Seconds(5)); + + auto registry = env.GetRegistry(); + { + TTestRegistryVisitor visitor; + registry->Visit(TInstant::Zero(), visitor); + visitor.ValidateExpectedCounters({ + {{{"sensor", "OrphanNodesCount"}, {"filesystem", "test"}}, 0}, + }); + } + + tablet.RenameNode(RootNodeId, "test3", RootNodeId, "test2"); + + tablet.AdvanceTime(TDuration::Seconds(15)); + env.GetRuntime().DispatchEvents({}, TDuration::Seconds(5)); + + { + TTestRegistryVisitor visitor; + registry->Visit(TInstant::Zero(), visitor); + visitor.ValidateExpectedCounters({ + {{{"sensor", "OrphanNodesCount"}, {"filesystem", "test"}}, 1}, + }); + } + + { + auto response = tablet.GetStorageStats(); + const auto& stats = response->Record.GetStats(); + UNIT_ASSERT_VALUES_EQUAL( + 5_TB / block, + stats.GetLargeDeletionMarkersCount()); + UNIT_ASSERT_VALUES_EQUAL(2, stats.GetUsedNodesCount()); + } + } + TABLET_TEST_4K_ONLY(ShouldHandleRangeIdCollisionsInCompactionMapStats) { const auto block = tabletConfig.BlockSize;