From 3e024116687ae03370f51378bab288d06c7b423b Mon Sep 17 00:00:00 2001 From: Sooraj Sinha Date: Mon, 16 Sep 2024 12:18:32 +0530 Subject: [PATCH] Retry remote state download while bootstrap Signed-off-by: Sooraj Sinha --- .../opensearch/gateway/GatewayMetaState.java | 56 +++++++++++-- .../GatewayMetaStatePersistedStateTests.java | 84 ++++++++++++++++++- 2 files changed, 127 insertions(+), 13 deletions(-) diff --git a/server/src/main/java/org/opensearch/gateway/GatewayMetaState.java b/server/src/main/java/org/opensearch/gateway/GatewayMetaState.java index b3836edcd7d6c..13e478363e7c1 100644 --- a/server/src/main/java/org/opensearch/gateway/GatewayMetaState.java +++ b/server/src/main/java/org/opensearch/gateway/GatewayMetaState.java @@ -66,7 +66,6 @@ import org.opensearch.gateway.remote.RemoteClusterStateService; import org.opensearch.gateway.remote.model.RemoteClusterStateManifestInfo; import org.opensearch.index.recovery.RemoteStoreRestoreService; -import org.opensearch.index.recovery.RemoteStoreRestoreService.RemoteRestoreResult; import org.opensearch.node.Node; import org.opensearch.plugins.MetadataUpgrader; import org.opensearch.threadpool.ThreadPool; @@ -175,15 +174,11 @@ public void start( ); if (ClusterState.UNKNOWN_UUID.equals(lastKnownClusterUUID) == false) { // Load state from remote - final RemoteRestoreResult remoteRestoreResult = remoteStoreRestoreService.restore( - // Remote Metadata should always override local disk Metadata - // if local disk Metadata's cluster uuid is UNKNOWN_UUID - ClusterState.builder(clusterState).metadata(Metadata.EMPTY_METADATA).build(), - lastKnownClusterUUID, - false, - new String[] {} + clusterState = restoreClusterStateWithRetries( + remoteStoreRestoreService, + clusterState, + lastKnownClusterUUID ); - clusterState = remoteRestoreResult.getClusterState(); } } remotePersistedState = new RemotePersistedState(remoteClusterStateService, lastKnownClusterUUID); @@ -258,6 +253,49 @@ public void start( } } + private ClusterState restoreClusterStateWithRetries( + RemoteStoreRestoreService remoteStoreRestoreService, + ClusterState clusterState, + String lastKnownClusterUUID + ) { + int maxAttempts = 5; + int delayInMills = 100; + for (int attempt = 1; attempt <= maxAttempts; attempt++) { + try { + return restoreClusterState(remoteStoreRestoreService, clusterState, lastKnownClusterUUID); + } catch (Exception e) { + if (attempt == maxAttempts) { + // Throw an Error so that the process is halted. + throw new Error("Failed to load metadata", e); + } + try { + TimeUnit.MILLISECONDS.sleep(delayInMills); + } catch (InterruptedException ie) { + Thread.currentThread().interrupt(); // Restore interrupted status + throw new RuntimeException(ie); + } + delayInMills = delayInMills * 2; + } + } + // This statement will never be reached. + return null; + } + + ClusterState restoreClusterState( + RemoteStoreRestoreService remoteStoreRestoreService, + ClusterState clusterState, + String lastKnownClusterUUID + ) { + return remoteStoreRestoreService.restore( + // Remote Metadata should always override local disk Metadata + // if local disk Metadata's cluster uuid is UNKNOWN_UUID + ClusterState.builder(clusterState).metadata(Metadata.EMPTY_METADATA).build(), + lastKnownClusterUUID, + false, + new String[] {} + ).getClusterState(); + } + // exposed so it can be overridden by tests ClusterState prepareInitialClusterState(TransportService transportService, ClusterService clusterService, ClusterState clusterState) { assert clusterState.nodes().getLocalNode() == null : "prepareInitialClusterState must only be called once"; diff --git a/server/src/test/java/org/opensearch/gateway/GatewayMetaStatePersistedStateTests.java b/server/src/test/java/org/opensearch/gateway/GatewayMetaStatePersistedStateTests.java index 5ea5241762753..efdb3076f419c 100644 --- a/server/src/test/java/org/opensearch/gateway/GatewayMetaStatePersistedStateTests.java +++ b/server/src/test/java/org/opensearch/gateway/GatewayMetaStatePersistedStateTests.java @@ -1244,14 +1244,72 @@ public void testGatewayForRemoteStateForInitialBootstrapBlocksApplied() throws I } } - private MockGatewayMetaState newGatewayForRemoteState( + public void testGatewayMetaStateRemoteStateDownloadRetries() throws IOException { + MockGatewayMetaState gateway = null; + MockGatewayMetaState gatewayMetaStateSpy = null; + try { + RemoteClusterStateService remoteClusterStateService = mock(RemoteClusterStateService.class); + when(remoteClusterStateService.getLastKnownUUIDFromRemote("test-cluster")).thenReturn("test-cluster-uuid"); + RemoteStoreRestoreService remoteStoreRestoreService = mock(RemoteStoreRestoreService.class); + when(remoteStoreRestoreService.restore(any(), any(), anyBoolean(), any())).thenThrow( + new IllegalStateException("unable to download cluster state") + ).thenReturn(RemoteRestoreResult.build("test-cluster-uuid", null, ClusterState.EMPTY_STATE)); + final PersistedStateRegistry persistedStateRegistry = persistedStateRegistry(); + gateway = initializeGatewayForRemoteState(true); + gatewayMetaStateSpy = Mockito.spy(gateway); + startGatewayForRemoteState( + gatewayMetaStateSpy, + remoteClusterStateService, + remoteStoreRestoreService, + persistedStateRegistry, + ClusterState.EMPTY_STATE + ); + verify(gatewayMetaStateSpy, times(2)).restoreClusterState(Mockito.any(), Mockito.any(), Mockito.any()); + } finally { + IOUtils.close(gatewayMetaStateSpy); + } + } + + public void testGatewayMetaStateRemoteStateDownloadFailure() throws IOException { + MockGatewayMetaState gateway = null; + final MockGatewayMetaState gatewayMetaStateSpy; + try { + RemoteClusterStateService remoteClusterStateService = mock(RemoteClusterStateService.class); + when(remoteClusterStateService.getLastKnownUUIDFromRemote("test-cluster")).thenReturn("test-cluster-uuid"); + RemoteStoreRestoreService remoteStoreRestoreService = mock(RemoteStoreRestoreService.class); + when(remoteStoreRestoreService.restore(any(), any(), anyBoolean(), any())).thenThrow( + new IllegalStateException("unable to download cluster state") + ); + final PersistedStateRegistry persistedStateRegistry = persistedStateRegistry(); + gateway = initializeGatewayForRemoteState(true); + gatewayMetaStateSpy = Mockito.spy(gateway); + assertThrows( + Error.class, + () -> startGatewayForRemoteState( + gatewayMetaStateSpy, + remoteClusterStateService, + remoteStoreRestoreService, + persistedStateRegistry, + ClusterState.EMPTY_STATE + ) + ); + verify(gatewayMetaStateSpy, times(5)).restoreClusterState(Mockito.any(), Mockito.any(), Mockito.any()); + } finally { + IOUtils.close(gateway); + } + } + + private MockGatewayMetaState initializeGatewayForRemoteState(boolean prepareFullState) { + return new MockGatewayMetaState(localNode, bigArrays, prepareFullState); + } + + private MockGatewayMetaState startGatewayForRemoteState( + MockGatewayMetaState gateway, RemoteClusterStateService remoteClusterStateService, RemoteStoreRestoreService remoteStoreRestoreService, PersistedStateRegistry persistedStateRegistry, - ClusterState currentState, - boolean prepareFullState + ClusterState currentState ) throws IOException { - MockGatewayMetaState gateway = new MockGatewayMetaState(localNode, bigArrays, prepareFullState); String randomRepoName = "randomRepoName"; String stateRepoTypeAttributeKey = String.format( Locale.getDefault(), @@ -1305,6 +1363,24 @@ private MockGatewayMetaState newGatewayForRemoteState( return gateway; } + private MockGatewayMetaState newGatewayForRemoteState( + RemoteClusterStateService remoteClusterStateService, + RemoteStoreRestoreService remoteStoreRestoreService, + PersistedStateRegistry persistedStateRegistry, + ClusterState currentState, + boolean prepareFullState + ) throws IOException { + MockGatewayMetaState gatewayMetaState = initializeGatewayForRemoteState(prepareFullState); + startGatewayForRemoteState( + gatewayMetaState, + remoteClusterStateService, + remoteStoreRestoreService, + persistedStateRegistry, + currentState + ); + return gatewayMetaState; + } + private static BigArrays getBigArrays() { return usually() ? BigArrays.NON_RECYCLING_INSTANCE