From 68d638656595209020cd7525771e8aae1a4ac41e Mon Sep 17 00:00:00 2001 From: Lin Huang Date: Wed, 2 Oct 2024 16:53:18 -0400 Subject: [PATCH 1/2] Fix flaky nodeos_snapshot_diff_test due to a race condition --- tests/nodeos_snapshot_diff_test.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/tests/nodeos_snapshot_diff_test.py b/tests/nodeos_snapshot_diff_test.py index 244dc1c80e..0f8beda179 100755 --- a/tests/nodeos_snapshot_diff_test.py +++ b/tests/nodeos_snapshot_diff_test.py @@ -160,13 +160,28 @@ def waitForBlock(node, blockNum, blockType=BlockType.head, timeout=None, reportI output=cluster.getBlockLog(progNodeId, blockLogAction=BlockLogAction.trim, first=0, last=ret_head_block_num, throwException=True) nodeProg.removeState() nodeProg.rmFromCmd('--p2p-peer-address') - isRelaunchSuccess = nodeProg.relaunch(chainArg="--replay", addSwapFlags={}, timeout=relaunchTimeout) + + # There is a race condition that at the startup of node, net thread and http + # thread can start to work in different order. If http thread processes schedule_snapshot + # request after net thread starts to sync with the irrNode, schedule_snapshot + # request will miss the scheduled block number. If it is before net thread + # starts to sync with the irrNode, schedule_snapshot request will catch the + # scheduled block number and the snapshot is taken. + # + # "--allowed-connection none" is for isolating the node such that net thread + # does not initiate sync at the startup. + # "--plugin eosio::net_api_plugin" is for handling net connect request. 
+ addSwapFlags={"--allowed-connection": "none", "--plugin": "eosio::net_api_plugin"} + + isRelaunchSuccess = nodeProg.relaunch(chainArg="--replay", addSwapFlags=addSwapFlags, timeout=relaunchTimeout) assert isRelaunchSuccess, "Failed to relaunch programmable node" Print("Schedule snapshot (node 2)") ret = nodeProg.scheduleSnapshotAt(ret_head_block_num) assert ret is not None, "Snapshot scheduling failed" + nodeProg.processUrllibRequest("net", "connect", "localhost:9877") + Print("Wait for programmable node lib to advance") waitForBlock(nodeProg, ret_head_block_num, blockType=BlockType.lib) From 8dc0d62c76bbe7e85e2224f40ded7ac0d00a9223 Mon Sep 17 00:00:00 2001 From: Lin Huang Date: Wed, 2 Oct 2024 22:10:08 -0400 Subject: [PATCH 2/2] Do not use allowed-none to isolate as it does not allow future connect --- tests/nodeos_snapshot_diff_test.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/tests/nodeos_snapshot_diff_test.py b/tests/nodeos_snapshot_diff_test.py index 0f8beda179..aa87201690 100755 --- a/tests/nodeos_snapshot_diff_test.py +++ b/tests/nodeos_snapshot_diff_test.py @@ -155,32 +155,33 @@ def waitForBlock(node, blockNum, blockType=BlockType.head, timeout=None, reportI Utils.processSpringUtilCmd("snapshot to-json --input-file {}".format(snapshotFile), "snapshot to-json", silentErrors=False) snapshotFile = snapshotFile + ".json" - Print("Trim programmable blocklog to snapshot head block num and relaunch programmable node") - nodeProg.kill(signal.SIGTERM) - output=cluster.getBlockLog(progNodeId, blockLogAction=BlockLogAction.trim, first=0, last=ret_head_block_num, throwException=True) - nodeProg.removeState() - nodeProg.rmFromCmd('--p2p-peer-address') - # There is a race condition that at the startup of node, net thread and http # thread can start to work in different order. 
If http thread processes schedule_snapshot # request after net thread starts to sync with the irrNode, schedule_snapshot # request will miss the scheduled block number. If it is before net thread # starts to sync with the irrNode, schedule_snapshot request will catch the # scheduled block number and the snapshot is taken. - # - # "--allowed-connection none" is for isolating the node such that net thread - # does not initiate sync at the startup. - # "--plugin eosio::net_api_plugin" is for handling net connect request. - addSwapFlags={"--allowed-connection": "none", "--plugin": "eosio::net_api_plugin"} - isRelaunchSuccess = nodeProg.relaunch(chainArg="--replay", addSwapFlags=addSwapFlags, timeout=relaunchTimeout) + # Shut down irreversible node so that nodeProg won't sync up when starting up + Print("Kill irreversible node") + nodeIrr.kill(signal.SIGTERM) + + Print("Trim programmable blocklog to snapshot head block num and relaunch programmable node") + nodeProg.kill(signal.SIGTERM) + output=cluster.getBlockLog(progNodeId, blockLogAction=BlockLogAction.trim, first=0, last=ret_head_block_num, throwException=True) + nodeProg.removeState() + nodeProg.rmFromCmd('--p2p-peer-address') + + isRelaunchSuccess = nodeProg.relaunch(chainArg="--replay", addSwapFlags={}, timeout=relaunchTimeout) assert isRelaunchSuccess, "Failed to relaunch programmable node" Print("Schedule snapshot (node 2)") ret = nodeProg.scheduleSnapshotAt(ret_head_block_num) assert ret is not None, "Snapshot scheduling failed" - nodeProg.processUrllibRequest("net", "connect", "localhost:9877") + # Start irreversible node so that nodeProg can sync up with it + Print("Restart irreversible node") + nodeIrr.relaunch() Print("Wait for programmable node lib to advance") waitForBlock(nodeProg, ret_head_block_num, blockType=BlockType.lib)