From 609bd0150fcdcef275aa45536d3b65950798188c Mon Sep 17 00:00:00 2001 From: Sachin Kale Date: Mon, 16 Sep 2024 15:50:11 +0530 Subject: [PATCH] Optimize GC flow with pinned timestamps Signed-off-by: Sachin Kale --- ...rePinnedTimestampsGarbageCollectionIT.java | 435 ++++++++++++++++++ .../snapshots/DeleteSnapshotV2IT.java | 242 ++++++++-- .../store/RemoteSegmentStoreDirectory.java | 23 +- .../RemoteFsTimestampAwareTranslog.java | 162 ++++--- .../transfer/TranslogTransferMetadata.java | 10 + .../RemoteStorePinnedTimestampService.java | 56 ++- .../blobstore/BlobStoreRepository.java | 69 ++- .../snapshots/SnapshotsService.java | 10 +- .../RemoteSegmentStoreDirectoryTests.java | 4 +- .../RemoteFsTimestampAwareTranslogTests.java | 6 +- 10 files changed, 870 insertions(+), 147 deletions(-) create mode 100644 server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteStorePinnedTimestampsGarbageCollectionIT.java diff --git a/server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteStorePinnedTimestampsGarbageCollectionIT.java b/server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteStorePinnedTimestampsGarbageCollectionIT.java new file mode 100644 index 0000000000000..718c966619247 --- /dev/null +++ b/server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteStorePinnedTimestampsGarbageCollectionIT.java @@ -0,0 +1,435 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.remotestore; + +import org.opensearch.common.blobstore.BlobPath; +import org.opensearch.common.collect.Tuple; +import org.opensearch.common.settings.Settings; +import org.opensearch.common.unit.TimeValue; +import org.opensearch.core.action.ActionListener; +import org.opensearch.index.translog.transfer.TranslogTransferMetadata; +import org.opensearch.indices.RemoteStoreSettings; +import org.opensearch.node.remotestore.RemoteStorePinnedTimestampService; +import org.opensearch.test.OpenSearchIntegTestCase; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.concurrent.TimeUnit; +import java.util.stream.Collectors; +import java.util.stream.LongStream; + +import static org.opensearch.index.IndexSettings.INDEX_REMOTE_TRANSLOG_KEEP_EXTRA_GEN_SETTING; +import static org.opensearch.index.remote.RemoteStoreEnums.DataCategory.TRANSLOG; +import static org.opensearch.index.remote.RemoteStoreEnums.DataType.DATA; +import static org.opensearch.index.remote.RemoteStoreEnums.DataType.METADATA; + +@OpenSearchIntegTestCase.ClusterScope(scope = OpenSearchIntegTestCase.Scope.TEST, numDataNodes = 0) +public class RemoteStorePinnedTimestampsGarbageCollectionIT extends RemoteStoreBaseIntegTestCase { + static final String INDEX_NAME = "remote-store-test-idx-1"; + + @Override + protected Settings nodeSettings(int nodeOrdinal) { + return Settings.builder() + .put(super.nodeSettings(nodeOrdinal)) + .put(RemoteStoreSettings.CLUSTER_REMOTE_STORE_PINNED_TIMESTAMP_ENABLED.getKey(), true) + .put(RemoteStoreSettings.CLUSTER_REMOTE_STORE_TRANSLOG_METADATA.getKey(), false) + .build(); + } + + private void keepPinnedTimestampSchedulerUpdated() throws InterruptedException { + long currentTime = System.currentTimeMillis(); + int maxRetry = 10; + while (maxRetry > 0 && RemoteStorePinnedTimestampService.getPinnedTimestamps().v1() <= currentTime) { + Thread.sleep(1000); + maxRetry--; + } + } + + ActionListener noOpActionListener = new ActionListener<>() { + @Override + public void onResponse(Void unused) {} + + @Override + public void onFailure(Exception e) {} + }; + + public void testLiveIndexNoPinnedTimestamps() throws Exception { + prepareCluster(1, 1, Settings.EMPTY); + Settings indexSettings = Settings.builder() + .put(remoteStoreIndexSettings(0, 1)) + .put(INDEX_REMOTE_TRANSLOG_KEEP_EXTRA_GEN_SETTING.getKey(), 0) + .build(); + createIndex(INDEX_NAME, indexSettings); + ensureYellowAndNoInitializingShards(INDEX_NAME); + ensureGreen(INDEX_NAME); + + RemoteStoreSettings.setPinnedTimestampsLookbackInterval(TimeValue.ZERO); + + RemoteStorePinnedTimestampService remoteStorePinnedTimestampService = internalCluster().getInstance( + RemoteStorePinnedTimestampService.class, + primaryNodeName(INDEX_NAME) + ); + + remoteStorePinnedTimestampService.rescheduleAsyncUpdatePinnedTimestampTask(TimeValue.timeValueSeconds(1)); + + int numDocs = randomIntBetween(5, 10); + for (int i = 0; i < numDocs; i++) { + keepPinnedTimestampSchedulerUpdated(); + indexSingleDoc(INDEX_NAME, true); + } + + String translogPathFixedPrefix = RemoteStoreSettings.CLUSTER_REMOTE_STORE_TRANSLOG_PATH_PREFIX.get(getNodeSettings()); + String shardDataPath = getShardLevelBlobPath( + client(), + INDEX_NAME, + BlobPath.cleanPath(), + "0", + TRANSLOG, + DATA, + translogPathFixedPrefix + ).buildAsString(); + Path translogDataPath = Path.of(translogRepoPath + "/" + shardDataPath + "/1"); + String shardMetadataPath = getShardLevelBlobPath( + client(), + INDEX_NAME, + BlobPath.cleanPath(), + "0", + TRANSLOG, + METADATA, + translogPathFixedPrefix + ).buildAsString(); + Path translogMetadataPath = Path.of(translogRepoPath + "/" + shardMetadataPath); + + assertBusy(() -> { + List metadataFiles = Files.list(translogMetadataPath).collect(Collectors.toList()); + assertEquals(1, metadataFiles.size()); + + verifyTranslogDataFileCount(metadataFiles, translogDataPath); + }); + } + + public void testLiveIndexNoPinnedTimestampsWithExtraGenSettingWithinLimit() throws Exception { + prepareCluster(1, 1, Settings.EMPTY); + Settings indexSettings = Settings.builder() + .put(remoteStoreIndexSettings(0, 1)) + .put(INDEX_REMOTE_TRANSLOG_KEEP_EXTRA_GEN_SETTING.getKey(), 10) + .build(); + createIndex(INDEX_NAME, indexSettings); + ensureYellowAndNoInitializingShards(INDEX_NAME); + ensureGreen(INDEX_NAME); + + RemoteStoreSettings.setPinnedTimestampsLookbackInterval(TimeValue.ZERO); + + RemoteStorePinnedTimestampService remoteStorePinnedTimestampService = internalCluster().getInstance( + RemoteStorePinnedTimestampService.class, + primaryNodeName(INDEX_NAME) + ); + + remoteStorePinnedTimestampService.rescheduleAsyncUpdatePinnedTimestampTask(TimeValue.timeValueSeconds(1)); + + int numDocs = randomIntBetween(5, 9); + for (int i = 0; i < numDocs; i++) { + keepPinnedTimestampSchedulerUpdated(); + indexSingleDoc(INDEX_NAME, true); + } + + String translogPathFixedPrefix = RemoteStoreSettings.CLUSTER_REMOTE_STORE_TRANSLOG_PATH_PREFIX.get(getNodeSettings()); + String shardDataPath = getShardLevelBlobPath( + client(), + INDEX_NAME, + BlobPath.cleanPath(), + "0", + TRANSLOG, + DATA, + translogPathFixedPrefix + ).buildAsString(); + Path translogDataPath = Path.of(translogRepoPath + "/" + shardDataPath + "/1"); + String shardMetadataPath = getShardLevelBlobPath( + client(), + INDEX_NAME, + BlobPath.cleanPath(), + "0", + TRANSLOG, + METADATA, + translogPathFixedPrefix + ).buildAsString(); + Path translogMetadataPath = Path.of(translogRepoPath + "/" + shardMetadataPath); + + assertBusy(() -> { + List metadataFiles = Files.list(translogMetadataPath).collect(Collectors.toList()); + assertEquals(numDocs + 1, metadataFiles.size()); + + verifyTranslogDataFileCount(metadataFiles, translogDataPath); + }); + } + + public void testLiveIndexNoPinnedTimestampsWithExtraGenSetting() throws Exception { + prepareCluster(1, 1, Settings.EMPTY); + Settings indexSettings = Settings.builder() + .put(remoteStoreIndexSettings(0, 1)) + .put(INDEX_REMOTE_TRANSLOG_KEEP_EXTRA_GEN_SETTING.getKey(), 3) + .build(); + createIndex(INDEX_NAME, indexSettings); + ensureYellowAndNoInitializingShards(INDEX_NAME); + ensureGreen(INDEX_NAME); + + RemoteStoreSettings.setPinnedTimestampsLookbackInterval(TimeValue.ZERO); + + RemoteStorePinnedTimestampService remoteStorePinnedTimestampService = internalCluster().getInstance( + RemoteStorePinnedTimestampService.class, + primaryNodeName(INDEX_NAME) + ); + + remoteStorePinnedTimestampService.rescheduleAsyncUpdatePinnedTimestampTask(TimeValue.timeValueSeconds(1)); + + int numDocs = 5; + for (int i = 0; i < numDocs; i++) { + keepPinnedTimestampSchedulerUpdated(); + indexSingleDoc(INDEX_NAME, true); + } + + String translogPathFixedPrefix = RemoteStoreSettings.CLUSTER_REMOTE_STORE_TRANSLOG_PATH_PREFIX.get(getNodeSettings()); + String shardDataPath = getShardLevelBlobPath( + client(), + INDEX_NAME, + BlobPath.cleanPath(), + "0", + TRANSLOG, + DATA, + translogPathFixedPrefix + ).buildAsString(); + Path translogDataPath = Path.of(translogRepoPath + "/" + shardDataPath + "/1"); + String shardMetadataPath = getShardLevelBlobPath( + client(), + INDEX_NAME, + BlobPath.cleanPath(), + "0", + TRANSLOG, + METADATA, + translogPathFixedPrefix + ).buildAsString(); + Path translogMetadataPath = Path.of(translogRepoPath + "/" + shardMetadataPath); + + assertBusy(() -> { + List metadataFiles = Files.list(translogMetadataPath).collect(Collectors.toList()); + assertEquals(3, metadataFiles.size()); + + verifyTranslogDataFileCount(metadataFiles, translogDataPath); + }); + } + + public void testLiveIndexWithPinnedTimestamps() throws Exception { + prepareCluster(1, 1, Settings.EMPTY); + Settings indexSettings = Settings.builder() + .put(remoteStoreIndexSettings(0, 1)) + .put(INDEX_REMOTE_TRANSLOG_KEEP_EXTRA_GEN_SETTING.getKey(), 0) + .build(); + createIndex(INDEX_NAME, indexSettings); + ensureYellowAndNoInitializingShards(INDEX_NAME); + ensureGreen(INDEX_NAME); + + RemoteStoreSettings.setPinnedTimestampsLookbackInterval(TimeValue.ZERO); + + RemoteStorePinnedTimestampService remoteStorePinnedTimestampService = internalCluster().getInstance( + RemoteStorePinnedTimestampService.class, + primaryNodeName(INDEX_NAME) + ); + + remoteStorePinnedTimestampService.rescheduleAsyncUpdatePinnedTimestampTask(TimeValue.timeValueSeconds(1)); + + int numDocs = randomIntBetween(5, 10); + for (int i = 0; i < numDocs; i++) { + keepPinnedTimestampSchedulerUpdated(); + indexSingleDoc(INDEX_NAME, true); + if (i == 2) { + RemoteStoreSettings.setPinnedTimestampsLookbackInterval(TimeValue.timeValueMinutes(1)); + remoteStorePinnedTimestampService.pinTimestamp(System.currentTimeMillis(), "xyz", noOpActionListener); + RemoteStoreSettings.setPinnedTimestampsLookbackInterval(TimeValue.ZERO); + } + } + + String translogPathFixedPrefix = RemoteStoreSettings.CLUSTER_REMOTE_STORE_TRANSLOG_PATH_PREFIX.get(getNodeSettings()); + String shardDataPath = getShardLevelBlobPath( + client(), + INDEX_NAME, + BlobPath.cleanPath(), + "0", + TRANSLOG, + DATA, + translogPathFixedPrefix + ).buildAsString(); + Path translogDataPath = Path.of(translogRepoPath + "/" + shardDataPath + "/1"); + String shardMetadataPath = getShardLevelBlobPath( + client(), + INDEX_NAME, + BlobPath.cleanPath(), + "0", + TRANSLOG, + METADATA, + translogPathFixedPrefix + ).buildAsString(); + Path translogMetadataPath = Path.of(translogRepoPath + "/" + shardMetadataPath); + + assertBusy(() -> { + List metadataFiles = Files.list(translogMetadataPath).collect(Collectors.toList()); + assertEquals(2, metadataFiles.size()); + + verifyTranslogDataFileCount(metadataFiles, translogDataPath); + }); + } + + public void testIndexDeletionNoPinnedTimestamps() throws Exception { + prepareCluster(1, 1, Settings.EMPTY); + Settings indexSettings = Settings.builder() + .put(remoteStoreIndexSettings(0, 1)) + .put(INDEX_REMOTE_TRANSLOG_KEEP_EXTRA_GEN_SETTING.getKey(), 0) + .build(); + createIndex(INDEX_NAME, indexSettings); + ensureYellowAndNoInitializingShards(INDEX_NAME); + ensureGreen(INDEX_NAME); + + RemoteStoreSettings.setPinnedTimestampsLookbackInterval(TimeValue.ZERO); + + RemoteStorePinnedTimestampService remoteStorePinnedTimestampService = internalCluster().getInstance( + RemoteStorePinnedTimestampService.class, + primaryNodeName(INDEX_NAME) + ); + + remoteStorePinnedTimestampService.rescheduleAsyncUpdatePinnedTimestampTask(TimeValue.timeValueSeconds(1)); + + int numDocs = randomIntBetween(5, 10); + for (int i = 0; i < numDocs; i++) { + keepPinnedTimestampSchedulerUpdated(); + indexSingleDoc(INDEX_NAME, true); + } + + String translogPathFixedPrefix = RemoteStoreSettings.CLUSTER_REMOTE_STORE_TRANSLOG_PATH_PREFIX.get(getNodeSettings()); + String shardDataPath = getShardLevelBlobPath( + client(), + INDEX_NAME, + BlobPath.cleanPath(), + "0", + TRANSLOG, + DATA, + translogPathFixedPrefix + ).buildAsString(); + Path translogDataPath = Path.of(translogRepoPath + "/" + shardDataPath + "/1"); + String shardMetadataPath = getShardLevelBlobPath( + client(), + INDEX_NAME, + BlobPath.cleanPath(), + "0", + TRANSLOG, + METADATA, + translogPathFixedPrefix + ).buildAsString(); + Path translogMetadataPath = Path.of(translogRepoPath + "/" + shardMetadataPath); + + assertBusy(() -> { + List metadataFiles = Files.list(translogMetadataPath).collect(Collectors.toList()); + assertEquals(1, metadataFiles.size()); + + verifyTranslogDataFileCount(metadataFiles, translogDataPath); + }); + + keepPinnedTimestampSchedulerUpdated(); + client().admin().indices().prepareDelete(INDEX_NAME).get(); + + assertBusy(() -> { + List metadataFiles = Files.list(translogMetadataPath).collect(Collectors.toList()); + assertEquals(1, metadataFiles.size()); + + verifyTranslogDataFileCount(metadataFiles, translogDataPath); + }); + } + + public void testIndexDeletionWithPinnedTimestamps() throws Exception { + prepareCluster(1, 1, Settings.EMPTY); + Settings indexSettings = Settings.builder() + .put(remoteStoreIndexSettings(0, 1)) + .put(INDEX_REMOTE_TRANSLOG_KEEP_EXTRA_GEN_SETTING.getKey(), 0) + .build(); + createIndex(INDEX_NAME, indexSettings); + ensureYellowAndNoInitializingShards(INDEX_NAME); + ensureGreen(INDEX_NAME); + + RemoteStoreSettings.setPinnedTimestampsLookbackInterval(TimeValue.ZERO); + + RemoteStorePinnedTimestampService remoteStorePinnedTimestampService = internalCluster().getInstance( + RemoteStorePinnedTimestampService.class, + primaryNodeName(INDEX_NAME) + ); + + remoteStorePinnedTimestampService.rescheduleAsyncUpdatePinnedTimestampTask(TimeValue.timeValueSeconds(1)); + + int numDocs = randomIntBetween(5, 10); + for (int i = 0; i < numDocs; i++) { + keepPinnedTimestampSchedulerUpdated(); + indexSingleDoc(INDEX_NAME, true); + if (i == 2) { + RemoteStoreSettings.setPinnedTimestampsLookbackInterval(TimeValue.timeValueMinutes(1)); + remoteStorePinnedTimestampService.pinTimestamp(System.currentTimeMillis(), "xyz", noOpActionListener); + RemoteStoreSettings.setPinnedTimestampsLookbackInterval(TimeValue.ZERO); + } + } + + String translogPathFixedPrefix = RemoteStoreSettings.CLUSTER_REMOTE_STORE_TRANSLOG_PATH_PREFIX.get(getNodeSettings()); + String shardDataPath = getShardLevelBlobPath( + client(), + INDEX_NAME, + BlobPath.cleanPath(), + "0", + TRANSLOG, + DATA, + translogPathFixedPrefix + ).buildAsString(); + Path translogDataPath = Path.of(translogRepoPath + "/" + shardDataPath + "/1"); + String shardMetadataPath = getShardLevelBlobPath( + client(), + INDEX_NAME, + BlobPath.cleanPath(), + "0", + TRANSLOG, + METADATA, + translogPathFixedPrefix + ).buildAsString(); + Path translogMetadataPath = Path.of(translogRepoPath + "/" + shardMetadataPath); + + assertBusy(() -> { + List metadataFiles = Files.list(translogMetadataPath).collect(Collectors.toList()); + assertEquals(2, metadataFiles.size()); + + verifyTranslogDataFileCount(metadataFiles, translogDataPath); + }, 30, TimeUnit.SECONDS); + + keepPinnedTimestampSchedulerUpdated(); + client().admin().indices().prepareDelete(INDEX_NAME).get(); + + assertBusy(() -> { + List metadataFiles = Files.list(translogMetadataPath).collect(Collectors.toList()); + assertEquals(2, metadataFiles.size()); + + verifyTranslogDataFileCount(metadataFiles, translogDataPath); + }); + } + + private void verifyTranslogDataFileCount(List metadataFiles, Path translogDataPath) throws IOException { + List mdFiles = metadataFiles.stream().map(p -> p.getFileName().toString()).collect(Collectors.toList()); + Set generations = new HashSet<>(); + for (String mdFile : mdFiles) { + Tuple minMaxGen = TranslogTransferMetadata.getMinMaxTranslogGenerationFromFilename(mdFile); + generations.addAll(LongStream.rangeClosed(minMaxGen.v1(), minMaxGen.v2()).boxed().collect(Collectors.toList())); + } + assertEquals(generations.size() * 2, Files.list(translogDataPath).collect(Collectors.toList()).size()); + } +} diff --git a/server/src/internalClusterTest/java/org/opensearch/snapshots/DeleteSnapshotV2IT.java b/server/src/internalClusterTest/java/org/opensearch/snapshots/DeleteSnapshotV2IT.java index 1d7a58384c0be..5a7e536215397 100644 --- a/server/src/internalClusterTest/java/org/opensearch/snapshots/DeleteSnapshotV2IT.java +++ b/server/src/internalClusterTest/java/org/opensearch/snapshots/DeleteSnapshotV2IT.java @@ -23,8 +23,12 @@ import org.opensearch.repositories.fs.FsRepository; import org.opensearch.test.OpenSearchIntegTestCase; +import java.nio.file.Files; +import java.nio.file.NoSuchFileException; import java.nio.file.Path; +import java.util.List; import java.util.concurrent.TimeUnit; +import java.util.stream.Collectors; import static org.opensearch.test.hamcrest.OpenSearchAssertions.assertAcked; import static org.hamcrest.Matchers.equalTo; @@ -36,6 +40,15 @@ public class DeleteSnapshotV2IT extends AbstractSnapshotIntegTestCase { private static final String REMOTE_REPO_NAME = "remote-store-repo-name"; + private void keepPinnedTimestampSchedulerUpdated() throws InterruptedException { + long currentTime = System.currentTimeMillis(); + int maxRetry = 10; + while (maxRetry > 0 && RemoteStorePinnedTimestampService.getPinnedTimestamps().v1() <= currentTime) { + Thread.sleep(1000); + maxRetry--; + } + } + public void testDeleteShallowCopyV2() throws Exception { disableRepoConsistencyCheck("Remote store repository is being used in the test"); @@ -206,7 +219,6 @@ public void testDeleteShallowCopyV2MultipleSnapshots() throws Exception { } - @AwaitsFix(bugUrl = "https://github.com/opensearch-project/OpenSearch/issues/15692") public void testRemoteStoreCleanupForDeletedIndexForSnapshotV2() throws Exception { disableRepoConsistencyCheck("Remote store repository is being used in the test"); final Path remoteStoreRepoPath = randomRepoPath(); @@ -225,7 +237,6 @@ public void testRemoteStoreCleanupForDeletedIndexForSnapshotV2() throws Exceptio RemoteStorePinnedTimestampService.class, clusterManagerName ); - remoteStorePinnedTimestampService.rescheduleAsyncUpdatePinnedTimestampTask(TimeValue.timeValueSeconds(1)); final String snapshotRepoName = "snapshot-repo-name"; final Path snapshotRepoPath = randomRepoPath(); @@ -234,7 +245,7 @@ public void testRemoteStoreCleanupForDeletedIndexForSnapshotV2() throws Exceptio final String remoteStoreEnabledIndexName = "remote-index-1"; final Settings remoteStoreEnabledIndexSettings = getRemoteStoreBackedIndexSettings(); createIndex(remoteStoreEnabledIndexName, remoteStoreEnabledIndexSettings); - indexRandomDocs(remoteStoreEnabledIndexName, 5); + indexRandomDocs(remoteStoreEnabledIndexName, 25); String indexUUID = client().admin() .indices() @@ -242,11 +253,96 @@ public void testRemoteStoreCleanupForDeletedIndexForSnapshotV2() throws Exceptio .get() .getSetting(remoteStoreEnabledIndexName, IndexMetadata.SETTING_INDEX_UUID); - String numShards = client().admin() + logger.info("--> create two remote index shallow snapshots"); + CreateSnapshotResponse createSnapshotResponse = client().admin() + .cluster() + .prepareCreateSnapshot(snapshotRepoName, "snap1") + .setWaitForCompletion(true) + .get(); + SnapshotInfo snapshotInfo1 = createSnapshotResponse.getSnapshotInfo(); + + Path indexPath = Path.of(String.valueOf(remoteStoreRepoPath), indexUUID); + Path shardPath = Path.of(String.valueOf(indexPath), "0"); + + // delete remote store index + assertAcked(client().admin().indices().prepareDelete(remoteStoreEnabledIndexName)); + + logger.info("--> delete snapshot 1"); + + Path segmentsPath = Path.of(String.valueOf(shardPath), "segments"); + Path translogPath = Path.of(String.valueOf(shardPath), "translog"); + + RemoteStoreSettings.setPinnedTimestampsLookbackInterval(TimeValue.ZERO); + + remoteStorePinnedTimestampService.rescheduleAsyncUpdatePinnedTimestampTask(TimeValue.timeValueSeconds(1)); + + keepPinnedTimestampSchedulerUpdated(); + + AcknowledgedResponse deleteSnapshotResponse = clusterManagerClient.admin() + .cluster() + .prepareDeleteSnapshot(snapshotRepoName, snapshotInfo1.snapshotId().getName()) + .get(); + assertAcked(deleteSnapshotResponse); + + // Delete is async. Give time for it + assertBusy(() -> { + try { + assertEquals(0, RemoteStoreBaseIntegTestCase.getFileCount(segmentsPath)); + } catch (NoSuchFileException e) { + fail(); + } + }, 60, TimeUnit.SECONDS); + + assertBusy(() -> { + try { + assertEquals(0, RemoteStoreBaseIntegTestCase.getFileCount(translogPath)); + } catch (NoSuchFileException e) { + fail(); + } + }, 60, TimeUnit.SECONDS); + } + + public void testRemoteStoreCleanupForDeletedIndexForSnapshotV2MultipleSnapshots() throws Exception { + disableRepoConsistencyCheck("Remote store repository is being used in the test"); + final Path remoteStoreRepoPath = randomRepoPath(); + Settings settings = remoteStoreClusterSettings(REMOTE_REPO_NAME, remoteStoreRepoPath); + settings = Settings.builder() + .put(settings) + .put(RemoteStoreSettings.CLUSTER_REMOTE_STORE_PINNED_TIMESTAMP_ENABLED.getKey(), true) + .put(RemoteStoreSettings.CLUSTER_REMOTE_STORE_PATH_TYPE_SETTING.getKey(), RemoteStoreEnums.PathType.FIXED.toString()) + .build(); + String clusterManagerName = internalCluster().startClusterManagerOnlyNode(settings); + internalCluster().startDataOnlyNode(settings); + final Client clusterManagerClient = internalCluster().clusterManagerClient(); + ensureStableCluster(2); + + RemoteStorePinnedTimestampService remoteStorePinnedTimestampService = internalCluster().getInstance( + RemoteStorePinnedTimestampService.class, + clusterManagerName + ); + remoteStorePinnedTimestampService.rescheduleAsyncUpdatePinnedTimestampTask(TimeValue.timeValueSeconds(1)); + + final String snapshotRepoName = "snapshot-repo-name"; + final Path snapshotRepoPath = randomRepoPath(); + createRepository(snapshotRepoName, "mock", snapshotRepoSettingsForShallowV2(snapshotRepoPath)); + + final String remoteStoreEnabledIndexName = "remote-index-1"; + final Settings remoteStoreEnabledIndexSettings = getRemoteStoreBackedIndexSettings(); + createIndex(remoteStoreEnabledIndexName, remoteStoreEnabledIndexSettings); + indexRandomDocs(remoteStoreEnabledIndexName, 5); + + String indexUUID = client().admin() .indices() .prepareGetSettings(remoteStoreEnabledIndexName) .get() - .getSetting(remoteStoreEnabledIndexName, IndexMetadata.SETTING_NUMBER_OF_SHARDS); + .getSetting(remoteStoreEnabledIndexName, IndexMetadata.SETTING_INDEX_UUID); + + Path indexPath = Path.of(String.valueOf(remoteStoreRepoPath), indexUUID); + Path shardPath = Path.of(String.valueOf(indexPath), "0"); + + Path segmentsPath = Path.of(String.valueOf(shardPath), "segments", "data"); + Path translogPath = Path.of(String.valueOf(shardPath), "translog", "data", "1"); + logger.info("--> create two remote index shallow snapshots"); CreateSnapshotResponse createSnapshotResponse = client().admin() @@ -256,6 +352,10 @@ public void testRemoteStoreCleanupForDeletedIndexForSnapshotV2() throws Exceptio .get(); SnapshotInfo snapshotInfo1 = createSnapshotResponse.getSnapshotInfo(); + List segmentsPostSnapshot1 = Files.list(segmentsPath).collect(Collectors.toList()); + List translogPostSnapshot1 = Files.list(translogPath).collect(Collectors.toList()); + + forceMerge(1); indexRandomDocs(remoteStoreEnabledIndexName, 25); CreateSnapshotResponse createSnapshotResponse2 = client().admin() @@ -264,64 +364,132 @@ public void testRemoteStoreCleanupForDeletedIndexForSnapshotV2() throws Exceptio .setWaitForCompletion(true) .get(); SnapshotInfo snapshotInfo2 = createSnapshotResponse2.getSnapshotInfo(); + + List segmentsPostSnapshot2 = Files.list(segmentsPath).collect(Collectors.toList()); + List translogPostSnapshot2 = Files.list(translogPath).collect(Collectors.toList()); + assertThat(snapshotInfo2.state(), equalTo(SnapshotState.SUCCESS)); assertThat(snapshotInfo2.successfulShards(), greaterThan(0)); assertThat(snapshotInfo2.successfulShards(), equalTo(snapshotInfo2.totalShards())); assertThat(snapshotInfo2.snapshotId().getName(), equalTo("snap2")); + remoteStorePinnedTimestampService.rescheduleAsyncUpdatePinnedTimestampTask(TimeValue.timeValueSeconds(1)); + keepPinnedTimestampSchedulerUpdated(); + // delete remote store index assertAcked(client().admin().indices().prepareDelete(remoteStoreEnabledIndexName)); - logger.info("--> delete snapshot 2"); + logger.info("--> delete snapshot 1"); + RemoteStoreSettings.setPinnedTimestampsLookbackInterval(TimeValue.ZERO); + keepPinnedTimestampSchedulerUpdated(); + // on snapshot deletion, remote store segment files should get cleaned up for deleted index - `remote-index-1` + AcknowledgedResponse deleteSnapshotResponse = clusterManagerClient.admin() + .cluster() + .prepareDeleteSnapshot(snapshotRepoName, snapshotInfo1.snapshotId().getName()) + .get(); + assertAcked(deleteSnapshotResponse); + + List segmentsPostDeletionOfSnapshot1 = Files.list(segmentsPath).collect(Collectors.toList()); + List translogPostDeletionOfSnapshot1 = Files.list(translogPath).collect(Collectors.toList()); + + // Delete is async. Give time for it + assertBusy(() -> assertEquals(translogPostSnapshot2.size(), translogPostDeletionOfSnapshot1.size()), 60, TimeUnit.SECONDS); + assertBusy(() -> assertEquals(segmentsPostSnapshot2.size(), segmentsPostDeletionOfSnapshot1.size()), 60, TimeUnit.SECONDS); + } + + public void testRemoteStoreCleanupForDeletedIndexForSnapshotV2MultipleSnapshots2() throws Exception { + disableRepoConsistencyCheck("Remote store repository is being used in the test"); + final Path remoteStoreRepoPath = randomRepoPath(); + Settings settings = remoteStoreClusterSettings(REMOTE_REPO_NAME, remoteStoreRepoPath); + settings = Settings.builder() + .put(settings) + .put(RemoteStoreSettings.CLUSTER_REMOTE_STORE_PINNED_TIMESTAMP_ENABLED.getKey(), true) + .put(RemoteStoreSettings.CLUSTER_REMOTE_STORE_PATH_TYPE_SETTING.getKey(), RemoteStoreEnums.PathType.FIXED.toString()) + .build(); + String clusterManagerName = internalCluster().startClusterManagerOnlyNode(settings); + internalCluster().startDataOnlyNode(settings); + final Client clusterManagerClient = internalCluster().clusterManagerClient(); + ensureStableCluster(2); + + RemoteStorePinnedTimestampService remoteStorePinnedTimestampService = internalCluster().getInstance( + RemoteStorePinnedTimestampService.class, + clusterManagerName + ); + remoteStorePinnedTimestampService.rescheduleAsyncUpdatePinnedTimestampTask(TimeValue.timeValueSeconds(1)); + + final String snapshotRepoName = "snapshot-repo-name"; + final Path snapshotRepoPath = randomRepoPath(); + createRepository(snapshotRepoName, "mock", snapshotRepoSettingsForShallowV2(snapshotRepoPath)); + + final String remoteStoreEnabledIndexName = "remote-index-1"; + final Settings remoteStoreEnabledIndexSettings = getRemoteStoreBackedIndexSettings(); + createIndex(remoteStoreEnabledIndexName, remoteStoreEnabledIndexSettings); + indexRandomDocs(remoteStoreEnabledIndexName, 5); + + String indexUUID = client().admin() + .indices() + .prepareGetSettings(remoteStoreEnabledIndexName) + .get() + .getSetting(remoteStoreEnabledIndexName, IndexMetadata.SETTING_INDEX_UUID); Path indexPath = Path.of(String.valueOf(remoteStoreRepoPath), indexUUID); Path shardPath = Path.of(String.valueOf(indexPath), "0"); - Path segmentsPath = Path.of(String.valueOf(shardPath), "segments"); - Path translogPath = Path.of(String.valueOf(shardPath), "translog"); - // Get total segments remote store directory file count for deleted index and shard 0 - int segmentFilesCountBeforeDeletingSnapshot1 = RemoteStoreBaseIntegTestCase.getFileCount(segmentsPath); - int translogFilesCountBeforeDeletingSnapshot1 = RemoteStoreBaseIntegTestCase.getFileCount(translogPath); + Path segmentsPath = Path.of(String.valueOf(shardPath), "segments", "data"); + Path translogPath = Path.of(String.valueOf(shardPath), "translog", "data", "1"); - RemoteStoreSettings.setPinnedTimestampsLookbackInterval(TimeValue.ZERO); - AcknowledgedResponse deleteSnapshotResponse = clusterManagerClient.admin() + logger.info("--> create two remote index shallow snapshots"); + CreateSnapshotResponse createSnapshotResponse = client().admin() .cluster() - .prepareDeleteSnapshot(snapshotRepoName, snapshotInfo2.snapshotId().getName()) + .prepareCreateSnapshot(snapshotRepoName, "snap1") + .setWaitForCompletion(true) .get(); - assertAcked(deleteSnapshotResponse); + SnapshotInfo snapshotInfo1 = createSnapshotResponse.getSnapshotInfo(); - Thread.sleep(5000); + List segmentsPostSnapshot1 = Files.list(segmentsPath).collect(Collectors.toList()); + List translogPostSnapshot1 = Files.list(translogPath).collect(Collectors.toList()); - assertBusy(() -> { - try { - assertThat(RemoteStoreBaseIntegTestCase.getFileCount(segmentsPath), lessThan(segmentFilesCountBeforeDeletingSnapshot1)); - } catch (Exception e) {} - }, 30, TimeUnit.SECONDS); - int segmentFilesCountAfterDeletingSnapshot1 = RemoteStoreBaseIntegTestCase.getFileCount(segmentsPath); + forceMerge(1); + indexRandomDocs(remoteStoreEnabledIndexName, 25); - logger.info("--> delete snapshot 1"); + CreateSnapshotResponse createSnapshotResponse2 = client().admin() + .cluster() + .prepareCreateSnapshot(snapshotRepoName, "snap2") + .setWaitForCompletion(true) + .get(); + SnapshotInfo snapshotInfo2 = createSnapshotResponse2.getSnapshotInfo(); + + List segmentsPostSnapshot2 = Files.list(segmentsPath).collect(Collectors.toList()); + List translogPostSnapshot2 = Files.list(translogPath).collect(Collectors.toList()); + + assertThat(snapshotInfo2.state(), equalTo(SnapshotState.SUCCESS)); + assertThat(snapshotInfo2.successfulShards(), greaterThan(0)); + assertThat(snapshotInfo2.successfulShards(), equalTo(snapshotInfo2.totalShards())); + assertThat(snapshotInfo2.snapshotId().getName(), equalTo("snap2")); + + remoteStorePinnedTimestampService.rescheduleAsyncUpdatePinnedTimestampTask(TimeValue.timeValueSeconds(1)); + keepPinnedTimestampSchedulerUpdated(); + + // delete remote store index + assertAcked(client().admin().indices().prepareDelete(remoteStoreEnabledIndexName)); + + logger.info("--> delete snapshot 2"); + keepPinnedTimestampSchedulerUpdated(); RemoteStoreSettings.setPinnedTimestampsLookbackInterval(TimeValue.ZERO); // on snapshot deletion, remote store segment files should get cleaned up for deleted index - `remote-index-1` - deleteSnapshotResponse = clusterManagerClient.admin() + AcknowledgedResponse deleteSnapshotResponse = clusterManagerClient.admin() .cluster() - .prepareDeleteSnapshot(snapshotRepoName, snapshotInfo1.snapshotId().getName()) + .prepareDeleteSnapshot(snapshotRepoName, snapshotInfo2.snapshotId().getName()) .get(); assertAcked(deleteSnapshotResponse); - // Delete is async. Give time for it - assertBusy(() -> { - try { - assertThat(RemoteStoreBaseIntegTestCase.getFileCount(segmentsPath), lessThan(segmentFilesCountAfterDeletingSnapshot1)); - } catch (Exception e) {} - }, 60, TimeUnit.SECONDS); - - assertBusy(() -> { - try { - assertThat(RemoteStoreBaseIntegTestCase.getFileCount(translogPath), lessThan(translogFilesCountBeforeDeletingSnapshot1)); - } catch (Exception e) {} - }, 60, TimeUnit.SECONDS); + List segmentsPostDeletionOfSnapshot2 = Files.list(segmentsPath).collect(Collectors.toList()); + List translogPostDeletionOfSnapshot2 = Files.list(translogPath).collect(Collectors.toList()); + // Delete is async. Give time for it + assertBusy(() -> assertEquals(translogPostSnapshot1.size(), translogPostDeletionOfSnapshot2.size()), 60, TimeUnit.SECONDS); + assertBusy(() -> assertEquals(segmentsPostSnapshot1.size(), segmentsPostDeletionOfSnapshot2.size()), 60, TimeUnit.SECONDS); } private Settings snapshotV2Settings(Path remoteStoreRepoPath) { diff --git a/server/src/main/java/org/opensearch/index/store/RemoteSegmentStoreDirectory.java b/server/src/main/java/org/opensearch/index/store/RemoteSegmentStoreDirectory.java index 72bf07d4b03b2..f1c5c62aa909f 100644 --- a/server/src/main/java/org/opensearch/index/store/RemoteSegmentStoreDirectory.java +++ b/server/src/main/java/org/opensearch/index/store/RemoteSegmentStoreDirectory.java @@ -817,6 +817,10 @@ Set getMetadataFilesToFilterActiveSegments( return metadataFilesToFilterActiveSegments; } + public void deleteStaleSegments(int lastNMetadataFilesToKeep) throws IOException { + deleteStaleSegments(lastNMetadataFilesToKeep, Map.of()); + } + /** * Delete stale segment and metadata files * One metadata file is kept per commit (refresh updates the same file). To read segments uploaded to remote store, @@ -832,7 +836,7 @@ Set getMetadataFilesToFilterActiveSegments( * @param lastNMetadataFilesToKeep number of metadata files to keep * @throws IOException in case of I/O error while reading from / writing to remote segment store */ - public void deleteStaleSegments(int lastNMetadataFilesToKeep) throws IOException { + private void deleteStaleSegments(int lastNMetadataFilesToKeep, Map pinnedTimestampsToSkip) throws IOException { if (lastNMetadataFilesToKeep == -1) { logger.info( "Stale segment deletion is disabled if cluster.remote_store.index.segment_metadata.retention.max_count is set to -1" @@ -859,7 +863,7 @@ public void deleteStaleSegments(int lastNMetadataFilesToKeep) throws IOException return; } - Tuple> pinnedTimestampsState = RemoteStorePinnedTimestampService.getPinnedTimestamps(); + Tuple> pinnedTimestampsState = RemoteStorePinnedTimestampService.getPinnedTimestamps(pinnedTimestampsToSkip); Set implicitLockedFiles = RemoteStoreUtils.getPinnedTimestampLockedFiles( sortedMetadataFileList, @@ -994,7 +998,9 @@ public static void remoteDirectoryCleanup( String remoteStoreRepoForIndex, String indexUUID, ShardId shardId, - RemoteStorePathStrategy pathStrategy + RemoteStorePathStrategy pathStrategy, + boolean forceClean, + Map pinnedTimestampsToSkip ) { try { RemoteSegmentStoreDirectory remoteSegmentStoreDirectory = (RemoteSegmentStoreDirectory) remoteDirectoryFactory.newDirectory( @@ -1003,8 +1009,12 @@ public static void remoteDirectoryCleanup( shardId, pathStrategy ); - remoteSegmentStoreDirectory.deleteStaleSegments(0); - remoteSegmentStoreDirectory.deleteIfEmpty(); + if (forceClean) { + remoteSegmentStoreDirectory.delete(); + } else { + remoteSegmentStoreDirectory.deleteStaleSegments(0, pinnedTimestampsToSkip); + remoteSegmentStoreDirectory.deleteIfEmpty(); + } } catch (Exception e) { staticLogger.error("Exception occurred while deleting directory", e); } @@ -1023,7 +1033,10 @@ private boolean deleteIfEmpty() throws IOException { logger.info("Remote directory still has files, not deleting the path"); return false; } + return delete(); + } + private boolean delete() { try { remoteDataDirectory.delete(); remoteMetadataDirectory.delete(); diff --git a/server/src/main/java/org/opensearch/index/translog/RemoteFsTimestampAwareTranslog.java b/server/src/main/java/org/opensearch/index/translog/RemoteFsTimestampAwareTranslog.java index 27d34ec0d05af..ebd6d39cf85dd 100644 --- a/server/src/main/java/org/opensearch/index/translog/RemoteFsTimestampAwareTranslog.java +++ b/server/src/main/java/org/opensearch/index/translog/RemoteFsTimestampAwareTranslog.java @@ -61,6 +61,7 @@ public class RemoteFsTimestampAwareTranslog extends RemoteFsTranslog { private final Map> oldFormatMetadataFileGenerationMap; private final Map> oldFormatMetadataFilePrimaryTermMap; private final AtomicLong minPrimaryTermInRemote = new AtomicLong(Long.MAX_VALUE); + private long maxDeletedGenerationOnRemote = 0; public RemoteFsTimestampAwareTranslog( TranslogConfig config, @@ -135,13 +136,20 @@ protected void trimUnreferencedReaders(boolean indexDeleted, boolean trimLocal) // This is to ensure that after the permits are acquired during primary relocation, there are no further modification on remote // store. - if (startedPrimarySupplier.getAsBoolean() == false || pauseSync.get()) { + if ((indexDeleted == false && startedPrimarySupplier.getAsBoolean() == false) || pauseSync.get()) { return; } // This is to fail fast and avoid listing md files un-necessarily. if (indexDeleted == false && RemoteStoreUtils.isPinnedTimestampStateStale()) { - logger.warn("Skipping remote segment store garbage collection as last fetch of pinned timestamp is stale"); + logger.warn("Skipping remote translog garbage collection as last fetch of pinned timestamp is stale"); + return; + } + + // This code block ensures parity with RemoteFsTranslog. Without this, we will end up making list translog metadata + // call in each invocation of trimUnreferencedReaders + long minGenerationToKeep = minRemoteGenReferenced - indexSettings().getRemoteTranslogExtraKeep(); + if (indexDeleted == false && (minGenerationToKeep <= maxDeletedGenerationOnRemote)) { return; } @@ -158,7 +166,7 @@ public void onResponse(List blobMetadata) { List metadataFiles = blobMetadata.stream().map(BlobMetadata::name).collect(Collectors.toList()); try { - if (metadataFiles.size() <= 1) { + if (indexDeleted == false && metadataFiles.size() <= 1) { logger.debug("No stale translog metadata files found"); remoteGenerationDeletionPermits.release(REMOTE_DELETION_PERMITS); return; @@ -166,16 +174,12 @@ public void onResponse(List blobMetadata) { // Check last fetch status of pinned timestamps. If stale, return. if (indexDeleted == false && RemoteStoreUtils.isPinnedTimestampStateStale()) { - logger.warn("Skipping remote segment store garbage collection as last fetch of pinned timestamp is stale"); + logger.warn("Skipping remote translog garbage collection as last fetch of pinned timestamp is stale"); remoteGenerationDeletionPermits.release(REMOTE_DELETION_PERMITS); return; } - List metadataFilesToBeDeleted = getMetadataFilesToBeDeleted( - metadataFiles, - metadataFilePinnedTimestampMap, - logger - ); + List metadataFilesToBeDeleted = getMetadataFilesToBeDeleted(metadataFiles); // If index is not deleted, make sure to keep latest metadata file if (indexDeleted == false) { @@ -194,14 +198,13 @@ public void onResponse(List blobMetadata) { metadataFilesNotToBeDeleted.removeAll(metadataFilesToBeDeleted); logger.debug(() -> "metadataFilesNotToBeDeleted = " + metadataFilesNotToBeDeleted); - Set generationsToBeDeleted = getGenerationsToBeDeleted( - metadataFilesNotToBeDeleted, - metadataFilesToBeDeleted, - indexDeleted - ); + + Set generationsToBeDeleted = getGenerationsToBeDeleted(metadataFilesNotToBeDeleted, metadataFilesToBeDeleted); logger.debug(() -> "generationsToBeDeleted = " + generationsToBeDeleted); if (generationsToBeDeleted.isEmpty() == false) { + maxDeletedGenerationOnRemote = generationsToBeDeleted.stream().max(Long::compareTo).get(); + // Delete stale generations translogTransferManager.deleteGenerationAsync( primaryTermSupplier.getAsLong(), @@ -238,17 +241,8 @@ public void onFailure(Exception e) { } // Visible for testing - protected Set getGenerationsToBeDeleted( - List metadataFilesNotToBeDeleted, - List metadataFilesToBeDeleted, - boolean indexDeleted - ) throws IOException { - long maxGenerationToBeDeleted = Long.MAX_VALUE; - - if (indexDeleted == false) { - maxGenerationToBeDeleted = minRemoteGenReferenced - 1 - indexSettings().getRemoteTranslogExtraKeep(); - } - + protected Set getGenerationsToBeDeleted(List metadataFilesNotToBeDeleted, List metadataFilesToBeDeleted) + throws IOException { Set generationsFromMetadataFilesToBeDeleted = new HashSet<>(); for (String mdFile : metadataFilesToBeDeleted) { Tuple minMaxGen = getMinMaxTranslogGenerationFromMetadataFile(mdFile, translogTransferManager); @@ -262,7 +256,9 @@ protected Set getGenerationsToBeDeleted( Set generationsToBeDeleted = new HashSet<>(); for (long generation : generationsFromMetadataFilesToBeDeleted) { // Check if the generation is not referred by metadata file matching pinned timestamps - if (generation <= maxGenerationToBeDeleted && isGenerationPinned(generation, pinnedGenerations) == false) { + // The check with minRemoteGenReferenced is redundant but kept as to make sure we don't delete generations + // that are not persisted in remote segment store yet. + if (generation < minRemoteGenReferenced && isGenerationPinned(generation, pinnedGenerations) == false) { generationsToBeDeleted.add(generation); } } @@ -270,16 +266,18 @@ protected Set getGenerationsToBeDeleted( } protected List getMetadataFilesToBeDeleted(List metadataFiles) { - return getMetadataFilesToBeDeleted(metadataFiles, metadataFilePinnedTimestampMap, logger); + return getMetadataFilesToBeDeleted(metadataFiles, metadataFilePinnedTimestampMap, minRemoteGenReferenced, Map.of(), logger); } // Visible for testing protected static List getMetadataFilesToBeDeleted( List metadataFiles, Map metadataFilePinnedTimestampMap, + long minRemoteGenReferenced, + Map pinnedTimestampsToSkip, Logger logger ) { - Tuple> pinnedTimestampsState = RemoteStorePinnedTimestampService.getPinnedTimestamps(); + Tuple> pinnedTimestampsState = RemoteStorePinnedTimestampService.getPinnedTimestamps(pinnedTimestampsToSkip); // Keep files since last successful run of scheduler List metadataFilesToBeDeleted = RemoteStoreUtils.filterOutMetadataFilesBasedOnAge( @@ -312,6 +310,20 @@ protected static List getMetadataFilesToBeDeleted( metadataFilesToBeDeleted.size() ); + // Filter out metadata files based on minRemoteGenReferenced + List metadataFilesContainingMinRemoteGenReferenced = metadataFilesToBeDeleted.stream().filter(md -> { + long maxGeneration = TranslogTransferMetadata.getMaxGenerationFromFileName(md); + return maxGeneration == -1 || maxGeneration > minRemoteGenReferenced; + }).collect(Collectors.toList()); + metadataFilesToBeDeleted.removeAll(metadataFilesContainingMinRemoteGenReferenced); + + logger.trace( + "metadataFilesContainingMinRemoteGenReferenced.size = {}, metadataFilesToBeDeleted based on minRemoteGenReferenced filtering = {}, minRemoteGenReferenced = {}", + metadataFilesContainingMinRemoteGenReferenced.size(), + metadataFilesToBeDeleted.size(), + minRemoteGenReferenced + ); + return metadataFilesToBeDeleted; } @@ -472,50 +484,64 @@ protected static Tuple getMinMaxPrimaryTermFromMetadataFile( } } - public static void cleanup(TranslogTransferManager translogTransferManager) throws IOException { - ActionListener> listMetadataFilesListener = new ActionListener<>() { - @Override - public void onResponse(List blobMetadata) { - List metadataFiles = blobMetadata.stream().map(BlobMetadata::name).collect(Collectors.toList()); + public static void cleanup( + TranslogTransferManager translogTransferManager, + boolean forceClean, + Map pinnedTimestampsToSkip + ) throws IOException { + if (forceClean) { + translogTransferManager.delete(); + } else { + ActionListener> listMetadataFilesListener = new ActionListener<>() { + @Override + public void onResponse(List blobMetadata) { + List metadataFiles = blobMetadata.stream().map(BlobMetadata::name).collect(Collectors.toList()); + + try { + if (metadataFiles.isEmpty()) { + staticLogger.debug("No stale translog metadata files found"); + return; + } + List metadataFilesToBeDeleted = getMetadataFilesToBeDeleted( + metadataFiles, + new HashMap<>(), + Long.MAX_VALUE, + pinnedTimestampsToSkip, + staticLogger + ); + if (metadataFilesToBeDeleted.isEmpty()) { + staticLogger.debug("No metadata files to delete"); + return; + } + staticLogger.debug(() -> "metadataFilesToBeDeleted = " + metadataFilesToBeDeleted); - try { - if (metadataFiles.isEmpty()) { - staticLogger.debug("No stale translog metadata files found"); - return; - } - List metadataFilesToBeDeleted = getMetadataFilesToBeDeleted(metadataFiles, new HashMap<>(), staticLogger); - if (metadataFilesToBeDeleted.isEmpty()) { - staticLogger.debug("No metadata files to delete"); - return; + // For all the files that we are keeping, fetch min and max generations + List metadataFilesNotToBeDeleted = new ArrayList<>(metadataFiles); + metadataFilesNotToBeDeleted.removeAll(metadataFilesToBeDeleted); + staticLogger.debug(() -> "metadataFilesNotToBeDeleted = " + metadataFilesNotToBeDeleted); + + // Delete stale metadata files + translogTransferManager.deleteMetadataFilesAsync(metadataFilesToBeDeleted, () -> {}); + + // Delete stale primary terms + deleteStaleRemotePrimaryTerms( + metadataFilesNotToBeDeleted, + translogTransferManager, + new HashMap<>(), + new AtomicLong(Long.MAX_VALUE), + staticLogger + ); + } catch (Exception e) { + staticLogger.error("Exception while cleaning up metadata and primary terms", e); } - staticLogger.debug(() -> "metadataFilesToBeDeleted = " + metadataFilesToBeDeleted); + } - // For all the files that we are keeping, fetch min and max generations - List metadataFilesNotToBeDeleted = new ArrayList<>(metadataFiles); - metadataFilesNotToBeDeleted.removeAll(metadataFilesToBeDeleted); - staticLogger.debug(() -> "metadataFilesNotToBeDeleted = " + metadataFilesNotToBeDeleted); - - // Delete stale metadata files - translogTransferManager.deleteMetadataFilesAsync(metadataFilesToBeDeleted, () -> {}); - - // Delete stale primary terms - deleteStaleRemotePrimaryTerms( - metadataFilesNotToBeDeleted, - translogTransferManager, - new HashMap<>(), - new AtomicLong(Long.MAX_VALUE), - staticLogger - ); - } catch (Exception e) { + @Override + public void onFailure(Exception e) { staticLogger.error("Exception while cleaning up metadata and primary terms", e); } - } - - @Override - public void onFailure(Exception e) { - staticLogger.error("Exception while cleaning up metadata and primary terms", e); - } - }; - translogTransferManager.listTranslogMetadataFilesAsync(listMetadataFilesListener); + }; + translogTransferManager.listTranslogMetadataFilesAsync(listMetadataFilesListener); + } } } diff --git a/server/src/main/java/org/opensearch/index/translog/transfer/TranslogTransferMetadata.java b/server/src/main/java/org/opensearch/index/translog/transfer/TranslogTransferMetadata.java index 3b8885055e8f7..7fe3305545085 100644 --- a/server/src/main/java/org/opensearch/index/translog/transfer/TranslogTransferMetadata.java +++ b/server/src/main/java/org/opensearch/index/translog/transfer/TranslogTransferMetadata.java @@ -170,6 +170,16 @@ public static Tuple getMinMaxTranslogGenerationFromFilename(String f } } + public static long getMaxGenerationFromFileName(String filename) { + String[] tokens = filename.split(METADATA_SEPARATOR); + try { + return RemoteStoreUtils.invertLong(tokens[2]); + } catch (Exception e) { + logger.error(() -> new ParameterizedMessage("Exception while getting max generation from: {}", filename), e); + return -1; + } + } + public static Tuple getMinMaxPrimaryTermFromFilename(String filename) { String[] tokens = filename.split(METADATA_SEPARATOR); if (tokens.length < 7) { diff --git a/server/src/main/java/org/opensearch/node/remotestore/RemoteStorePinnedTimestampService.java b/server/src/main/java/org/opensearch/node/remotestore/RemoteStorePinnedTimestampService.java index 71133615ed056..6d801f9f5e50e 100644 --- a/server/src/main/java/org/opensearch/node/remotestore/RemoteStorePinnedTimestampService.java +++ b/server/src/main/java/org/opensearch/node/remotestore/RemoteStorePinnedTimestampService.java @@ -37,7 +37,6 @@ import java.util.Map; import java.util.Set; import java.util.function.Supplier; -import java.util.stream.Collectors; /** * Service for managing pinned timestamps in a remote store. @@ -48,7 +47,8 @@ @ExperimentalApi public class RemoteStorePinnedTimestampService implements Closeable { private static final Logger logger = LogManager.getLogger(RemoteStorePinnedTimestampService.class); - private static Tuple> pinnedTimestampsSet = new Tuple<>(-1L, Set.of()); + private static Tuple>> pinningEntityTimestampMap = new Tuple<>(-1L, Map.of()); + public static final String PINNED_TIMESTAMPS_PATH_TOKEN = "pinned_timestamps"; public static final String PINNED_TIMESTAMPS_FILENAME_SEPARATOR = "__"; @@ -199,21 +199,23 @@ public void cloneTimestamp(long timestamp, String existingPinningEntity, String } } - private String getBlobName(long timestamp, String pinningEntity) { + public static String getBlobName(long timestamp, String pinningEntity) { return String.join(PINNED_TIMESTAMPS_FILENAME_SEPARATOR, pinningEntity, String.valueOf(timestamp)); } - private long getTimestampFromBlobName(String blobName) { + public static Tuple getPinningEntityTimestampFromBlobName(String blobName) { String[] blobNameTokens = blobName.split(PINNED_TIMESTAMPS_FILENAME_SEPARATOR); if (blobNameTokens.length < 2) { logger.error("Pinned timestamps blob name contains invalid format: {}", blobName); } try { - return Long.parseLong(blobNameTokens[blobNameTokens.length - 1]); + String pinningEntity = blobName.substring(blobName.lastIndexOf(PINNED_TIMESTAMPS_FILENAME_SEPARATOR)); + Long timestamp = Long.parseLong(blobNameTokens[blobNameTokens.length - 1]); + return new Tuple<>(pinningEntity, timestamp); } catch (NumberFormatException e) { logger.error(() -> new ParameterizedMessage("Pinned timestamps blob name contains invalid format: {}", blobName), e); } - return -1; + return null; } /** @@ -248,14 +250,32 @@ public void close() throws IOException { // Used in integ tests public void rescheduleAsyncUpdatePinnedTimestampTask(TimeValue pinnedTimestampsSchedulerInterval) { if (pinnedTimestampsSchedulerInterval != null) { - pinnedTimestampsSet = new Tuple<>(-1L, Set.of()); + pinningEntityTimestampMap = new Tuple<>(-1L, Map.of()); asyncUpdatePinnedTimestampTask.close(); startAsyncUpdateTask(pinnedTimestampsSchedulerInterval); } } public static Tuple> getPinnedTimestamps() { - return pinnedTimestampsSet; + return getPinnedTimestamps(null); + } + + public static Tuple> getPinnedTimestamps(Map pinnedTimestampsToSkip) { + Set allPinnedTimestamps = new HashSet<>(); + if (pinnedTimestampsToSkip == null || pinnedTimestampsToSkip.isEmpty()) { + pinningEntityTimestampMap.v2().values().forEach(allPinnedTimestamps::addAll); + } else { + for (String pinningEntity : pinningEntityTimestampMap.v2().keySet()) { + if (pinnedTimestampsToSkip.containsKey(pinningEntity)) { + Set timestamps = new HashSet<>(pinningEntityTimestampMap.v2().get(pinningEntity)); + timestamps.remove(pinnedTimestampsToSkip.get(pinningEntity)); + allPinnedTimestamps.addAll(timestamps); + } else { + allPinnedTimestamps.addAll(pinningEntityTimestampMap.v2().get(pinningEntity)); + } + } + } + return new Tuple<>(pinningEntityTimestampMap.v1(), allPinnedTimestamps); } /** @@ -278,16 +298,22 @@ protected void runInternal() { try { Map pinnedTimestampList = blobContainer.listBlobs(); if (pinnedTimestampList.isEmpty()) { - pinnedTimestampsSet = new Tuple<>(triggerTimestamp, Set.of()); + logger.debug("Fetched empty pinned timestamps from remote store: {}", triggerTimestamp); + pinningEntityTimestampMap = new Tuple<>(triggerTimestamp, Map.of()); return; } - Set pinnedTimestamps = pinnedTimestampList.keySet() - .stream() - .map(RemoteStorePinnedTimestampService.this::getTimestampFromBlobName) - .filter(timestamp -> timestamp != -1) - .collect(Collectors.toSet()); + Map> pinnedTimestamps = new HashMap<>(); + for (String blobName : pinnedTimestampList.keySet()) { + Tuple pinningEntityTimestamp = getPinningEntityTimestampFromBlobName(blobName); + if (pinningEntityTimestamp != null) { + if (pinnedTimestamps.containsKey(pinningEntityTimestamp.v1()) == false) { + pinnedTimestamps.put(pinningEntityTimestamp.v1(), new HashSet<>()); + } + pinnedTimestamps.get(pinningEntityTimestamp.v1()).add(pinningEntityTimestamp.v2()); + } + } logger.debug("Fetched pinned timestamps from remote store: {} - {}", triggerTimestamp, pinnedTimestamps); - pinnedTimestampsSet = new Tuple<>(triggerTimestamp, pinnedTimestamps); + pinningEntityTimestampMap = new Tuple<>(triggerTimestamp, pinnedTimestamps); } catch (Throwable t) { logger.error("Exception while fetching pinned timestamp details", t); } diff --git a/server/src/main/java/org/opensearch/repositories/blobstore/BlobStoreRepository.java b/server/src/main/java/org/opensearch/repositories/blobstore/BlobStoreRepository.java index b954560c1bc94..5f35a2b7d80c6 100644 --- a/server/src/main/java/org/opensearch/repositories/blobstore/BlobStoreRepository.java +++ b/server/src/main/java/org/opensearch/repositories/blobstore/BlobStoreRepository.java @@ -196,7 +196,6 @@ import static org.opensearch.index.remote.RemoteStoreEnums.PathHashAlgorithm.FNV_1A_COMPOSITE_1; import static org.opensearch.index.snapshots.blobstore.BlobStoreIndexShardSnapshot.FileInfo.canonicalName; import static org.opensearch.repositories.blobstore.ChecksumBlobStoreFormat.SNAPSHOT_ONLY_FORMAT_PARAMS; -import static org.opensearch.snapshots.SnapshotsService.SNAPSHOT_PINNED_TIMESTAMP_DELIMITER; /** * BlobStore - based implementation of Snapshot Repository @@ -1264,7 +1263,8 @@ private void doDeleteShardSnapshots( snapshotIds, writeShardMetaDataAndComputeDeletesStep.result(), remoteSegmentStoreDirectoryFactory, - afterCleanupsListener + afterCleanupsListener, + snapshotIdPinnedTimestampMap ); } else { asyncCleanupUnlinkedShardLevelBlobs( @@ -1283,7 +1283,8 @@ private void cleanUpRemoteStoreFilesForDeletedIndicesV2( Collection snapshotIds, Collection result, RemoteSegmentStoreDirectoryFactory remoteSegmentStoreDirectoryFactory, - ActionListener afterCleanupsListener + ActionListener afterCleanupsListener, + Map snapshotIdPinnedTimestampMap ) { try { Set uniqueIndexIds = new HashSet<>(); @@ -1292,7 +1293,14 @@ private void cleanUpRemoteStoreFilesForDeletedIndicesV2( } // iterate through all the indices and trigger remote store directory cleanup for deleted index segments for (String indexId : uniqueIndexIds) { - cleanRemoteStoreDirectoryIfNeeded(snapshotIds, indexId, repositoryData, remoteSegmentStoreDirectoryFactory); + cleanRemoteStoreDirectoryIfNeeded( + snapshotIds, + indexId, + repositoryData, + remoteSegmentStoreDirectoryFactory, + snapshotIdPinnedTimestampMap, + false + ); } afterCleanupsListener.onResponse(null); } catch (Exception e) { @@ -1340,7 +1348,7 @@ private void removeSnapshotPinnedTimestamp( ) { remoteStorePinnedTimestampService.unpinTimestamp( timestampToUnpin, - repository + SNAPSHOT_PINNED_TIMESTAMP_DELIMITER + snapshotId.getUUID(), + SnapshotsService.getPinningEntity(repository, snapshotId.getUUID()), new ActionListener() { @Override public void onResponse(Void unused) { @@ -1449,7 +1457,10 @@ public static void remoteDirectoryCleanupAsync( String indexUUID, ShardId shardId, String threadPoolName, - RemoteStorePathStrategy pathStrategy + RemoteStorePathStrategy pathStrategy, + boolean forceClean, + String pinningEntity, + Long pinnedTimestamp ) { threadpool.executor(threadPoolName) .execute( @@ -1459,7 +1470,9 @@ public static void remoteDirectoryCleanupAsync( remoteStoreRepoForIndex, indexUUID, shardId, - pathStrategy + pathStrategy, + forceClean, + Map.of(pinningEntity, pinnedTimestamp) ), indexUUID, shardId @@ -1515,7 +1528,10 @@ protected void releaseRemoteStoreLockAndCleanup( indexUUID, new ShardId(Index.UNKNOWN_INDEX_NAME, indexUUID, Integer.parseInt(shardId)), ThreadPool.Names.REMOTE_PURGE, - remoteStoreShardShallowCopySnapshot.getRemoteStorePathStrategy() + remoteStoreShardShallowCopySnapshot.getRemoteStorePathStrategy(), + false, + null, + null ); } } @@ -2078,7 +2094,14 @@ private void executeOneStaleIndexDelete( deleteResult = deleteResult.add(cleanUpStaleSnapshotShardPathsFile(matchingShardPaths, snapshotShardPaths)); if (remoteSegmentStoreDirectoryFactory != null) { - cleanRemoteStoreDirectoryIfNeeded(deletedSnapshots, indexSnId, oldRepoData, remoteSegmentStoreDirectoryFactory); + cleanRemoteStoreDirectoryIfNeeded( + deletedSnapshots, + indexSnId, + oldRepoData, + remoteSegmentStoreDirectoryFactory, + new HashMap<>(), + true + ); } // Finally, we delete the [base_path]/indexId folder @@ -2135,7 +2158,9 @@ private void cleanRemoteStoreDirectoryIfNeeded( Collection deletedSnapshots, String indexSnId, RepositoryData oldRepoData, - RemoteSegmentStoreDirectoryFactory remoteSegmentStoreDirectoryFactory + RemoteSegmentStoreDirectoryFactory remoteSegmentStoreDirectoryFactory, + Map snapshotIdPinnedTimestampMap, + boolean forceClean ) { assert (indexSnId != null); @@ -2178,6 +2203,8 @@ private void cleanRemoteStoreDirectoryIfNeeded( prevIndexMetadata ); + String pinnginEntity = SnapshotsService.getPinningEntity(getMetadata().name(), snapshotId.getUUID()); + for (int shardId = 0; shardId < prevIndexMetadata.getNumberOfShards(); shardId++) { ShardId shard = new ShardId(Index.UNKNOWN_INDEX_NAME, prevIndexMetadata.getIndexUUID(), shardId); remoteDirectoryCleanupAsync( @@ -2187,9 +2214,20 @@ private void cleanRemoteStoreDirectoryIfNeeded( prevIndexMetadata.getIndexUUID(), shard, ThreadPool.Names.REMOTE_PURGE, - remoteStorePathStrategy + remoteStorePathStrategy, + forceClean, + pinnginEntity, + snapshotIdPinnedTimestampMap.get(snapshotId) + ); + remoteTranslogCleanupAsync( + remoteTranslogRepository, + shard, + remoteStorePathStrategy, + prevIndexMetadata, + forceClean, + pinnginEntity, + snapshotIdPinnedTimestampMap.get(snapshotId) ); - remoteTranslogCleanupAsync(remoteTranslogRepository, shard, remoteStorePathStrategy, prevIndexMetadata); } } } catch (Exception e) { @@ -2213,7 +2251,10 @@ private void remoteTranslogCleanupAsync( Repository remoteTranslogRepository, ShardId shardId, RemoteStorePathStrategy remoteStorePathStrategy, - IndexMetadata prevIndexMetadata + IndexMetadata prevIndexMetadata, + boolean forceClean, + String pinningEntity, + Long pinnedTimestamp ) { assert remoteTranslogRepository instanceof BlobStoreRepository; boolean indexMetadataEnabled = RemoteStoreUtils.determineTranslogMetadataEnabled(prevIndexMetadata); @@ -2230,7 +2271,7 @@ private void remoteTranslogCleanupAsync( indexMetadataEnabled ); try { - RemoteFsTimestampAwareTranslog.cleanup(translogTransferManager); + RemoteFsTimestampAwareTranslog.cleanup(translogTransferManager, forceClean, Map.of(pinningEntity, pinnedTimestamp)); } catch (IOException e) { logger.error("Exception while cleaning up remote translog for shard: " + shardId, e); } diff --git a/server/src/main/java/org/opensearch/snapshots/SnapshotsService.java b/server/src/main/java/org/opensearch/snapshots/SnapshotsService.java index f6e550525a3e5..23f6deff3715d 100644 --- a/server/src/main/java/org/opensearch/snapshots/SnapshotsService.java +++ b/server/src/main/java/org/opensearch/snapshots/SnapshotsService.java @@ -613,7 +613,7 @@ private void updateSnapshotPinnedTimestamp( ) { remoteStorePinnedTimestampService.pinTimestamp( timestampToPin, - snapshot.getRepository() + SNAPSHOT_PINNED_TIMESTAMP_DELIMITER + snapshot.getSnapshotId().getUUID(), + getPinningEntity(snapshot.getRepository(), snapshot.getSnapshotId().getUUID()), new ActionListener() { @Override public void onResponse(Void unused) { @@ -631,6 +631,10 @@ public void onFailure(Exception e) { ); } + public static String getPinningEntity(String repositoryName, String snapshotUUID) { + return repositoryName + SNAPSHOT_PINNED_TIMESTAMP_DELIMITER + snapshotUUID; + } + private void cloneSnapshotPinnedTimestamp( RepositoryData repositoryData, SnapshotId sourceSnapshot, @@ -640,8 +644,8 @@ private void cloneSnapshotPinnedTimestamp( ) { remoteStorePinnedTimestampService.cloneTimestamp( timestampToPin, - snapshot.getRepository() + SNAPSHOT_PINNED_TIMESTAMP_DELIMITER + sourceSnapshot.getUUID(), - snapshot.getRepository() + SNAPSHOT_PINNED_TIMESTAMP_DELIMITER + snapshot.getSnapshotId().getUUID(), + getPinningEntity(snapshot.getRepository(), sourceSnapshot.getUUID()), + getPinningEntity(snapshot.getRepository(), snapshot.getSnapshotId().getUUID()), new ActionListener() { @Override public void onResponse(Void unused) { diff --git a/server/src/test/java/org/opensearch/index/store/RemoteSegmentStoreDirectoryTests.java b/server/src/test/java/org/opensearch/index/store/RemoteSegmentStoreDirectoryTests.java index ecd6620dbea15..fe0b67c9816dc 100644 --- a/server/src/test/java/org/opensearch/index/store/RemoteSegmentStoreDirectoryTests.java +++ b/server/src/test/java/org/opensearch/index/store/RemoteSegmentStoreDirectoryTests.java @@ -565,7 +565,9 @@ public void testCleanupAsync() throws Exception { repositoryName, indexUUID, shardId, - pathStrategy + pathStrategy, + false, + Map.of() ); verify(remoteSegmentStoreDirectoryFactory).newDirectory(repositoryName, indexUUID, shardId, pathStrategy); verify(threadPool, times(0)).executor(ThreadPool.Names.REMOTE_PURGE); diff --git a/server/src/test/java/org/opensearch/index/translog/RemoteFsTimestampAwareTranslogTests.java b/server/src/test/java/org/opensearch/index/translog/RemoteFsTimestampAwareTranslogTests.java index 4ec68f7fb75b4..a33f85cc6f318 100644 --- a/server/src/test/java/org/opensearch/index/translog/RemoteFsTimestampAwareTranslogTests.java +++ b/server/src/test/java/org/opensearch/index/translog/RemoteFsTimestampAwareTranslogTests.java @@ -646,8 +646,7 @@ public void testGetGenerationsToBeDeletedEmptyMetadataFilesNotToBeDeleted() thro ); Set generations = ((RemoteFsTimestampAwareTranslog) translog).getGenerationsToBeDeleted( metadataFilesNotToBeDeleted, - metadataFilesToBeDeleted, - true + metadataFilesToBeDeleted ); Set md1Generations = LongStream.rangeClosed(4, 7).boxed().collect(Collectors.toSet()); Set md2Generations = LongStream.rangeClosed(17, 37).boxed().collect(Collectors.toSet()); @@ -682,8 +681,7 @@ public void testGetGenerationsToBeDeleted() throws IOException { ); Set generations = ((RemoteFsTimestampAwareTranslog) translog).getGenerationsToBeDeleted( metadataFilesNotToBeDeleted, - metadataFilesToBeDeleted, - true + metadataFilesToBeDeleted ); Set md1Generations = LongStream.rangeClosed(5, 7).boxed().collect(Collectors.toSet()); Set md2Generations = LongStream.rangeClosed(17, 25).boxed().collect(Collectors.toSet());