Skip to content

Commit

Permalink
Improve alarm (#1310)
Browse files Browse the repository at this point in the history
* Improve alarm

* Alarm config

* Fix alarm

* Loosen alarm to allow missing data

* Separate westend services

* Breach alarm for production only

* Remove unused

* Change SCAN_INTERVAL to 1 hour for production

* Change Latency Threshold
  • Loading branch information
yrong authored Oct 14, 2024
1 parent eebc7ca commit 759118f
Show file tree
Hide file tree
Showing 6 changed files with 109 additions and 35 deletions.
10 changes: 6 additions & 4 deletions web/packages/api/src/status.ts
Original file line number Diff line number Diff line change
Expand Up @@ -65,15 +65,17 @@ export enum AlarmReason {
AccountBalanceInsufficient = "AccountBalanceInsufficient",
ToEthereumNoTransfer = "ToEthereumNoTransfer",
ToPolkadotNoTransfer = "ToPolkadotNoTransfer",
ToEthereumChannelAttacked = "ToEthereumChannelAttacked",
ToPolkadotChannelAttacked = "ToPolkadotChannelAttacked"
}

export type Sovereign = { name: string; account: string; balance: bigint; type: SourceType }

/**
 * Block-latency thresholds, expressed in blocks, keyed by bridge direction.
 */
export const BlockLatencyThreshold = {
// BEEFY finality updates are synced every 4 hours (2400 blocks), so we allow at most 3000 blocks.
ToEthereum: 3000,
// Beacon finality updates are synced every 6.4 minutes (32 blocks), so we allow at most 128 blocks (4 epochs).
ToPolkadot: 128,
// BEEFY finality updates are synced every 4 hours (1200 Ethereum blocks); leave some buffer here.
ToEthereum: 1350,
// Beacon finality updates are synced every 6.4 minutes (64 Substrate blocks); leave some buffer here.
ToPolkadot: 80,
}

export const InsufficientBalanceThreshold = {
Expand Down
9 changes: 7 additions & 2 deletions web/packages/operations/.env.example
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ AWS_ACCESS_KEY_ID=
AWS_SECRET_ACCESS_KEY=
AWS_REGION=eu-central-1
BRIDGE_STALE_SNS_TOPIC=arn:aws:sns:eu-central-1:232374692033:PD
BRIDGE_ATTACKED_SNS_TOPIC=arn:aws:sns:eu-central-1:232374692033:PD
ACCOUNT_BALANCE_SNS_TOPIC=arn:aws:sns:eu-central-1:232374692033:PD-WALLET

# INFURA Key config
Expand All @@ -15,11 +16,15 @@ REACT_APP_INFURA_KEY=
GRAPHQL_API_URL=https://data.snowbridge.network/graphql

# Scan interval (in minutes)
SCAN_INTERVAL=30
SCAN_INTERVAL=60

# Keys
ETHEREUM_KEY=
SUBSTRATE_KEY=

PENPAL_TRANSFER=false
# Cron expression to run token transfer tests
CRON_EXPRESSION=0 0 * * *

# Dashboard URLs
LATENCY_DASHBOARD_URL=
BALANCE_DASHBOARD_URL=
7 changes: 0 additions & 7 deletions web/packages/operations/.env.production

This file was deleted.

7 changes: 0 additions & 7 deletions web/packages/operations/.env.testnet

This file was deleted.

10 changes: 8 additions & 2 deletions web/packages/operations/ecosystem.config.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,19 @@ module.exports = {
args: "cron"
},
{
name: "transferToPolkadot",
name: "westend-monitor",
node_args: "--require=dotenv/config",
script: "./dist/src/main.js",
args: "cron"
},
{
name: "westend-transferToPolkadot",
node_args: "--require=dotenv/config",
script: "./dist/src/transfer_to_polkadot.js",
args: "cron"
},
{
name: "transferToEthereum",
name: "westend-transferToEthereum",
node_args: "--require=dotenv/config",
script: "./dist/src/transfer_to_ethereum.js",
args: "cron"
Expand Down
101 changes: 88 additions & 13 deletions web/packages/operations/src/alarm.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,13 @@ import {

const CLOUD_WATCH_NAME_SPACE = "SnowbridgeMetrics"
const BRIDGE_STALE_SNS_TOPIC = process.env["BRIDGE_STALE_SNS_TOPIC"] || ""
const BRIDGE_ATTACKED_SNS_TOPIC = process.env["BRIDGE_ATTACKED_SNS_TOPIC"] || ""
const ACCOUNT_BALANCE_SNS_TOPIC = process.env["ACCOUNT_BALANCE_SNS_TOPIC"] || ""

const LatencyDashboard =
"https://eu-central-1.console.aws.amazon.com/cloudwatch/home?region=eu-central-1#dashboards/dashboard/Latency"
process.env["LATENCY_DASHBOARD_URL"] || "https://eu-central-1.console.aws.amazon.com/cloudwatch/home?region=eu-central-1#dashboards/dashboard/Latency"
const BalanceDashboard =
"https://eu-central-1.console.aws.amazon.com/cloudwatch/home?region=eu-central-1#dashboards/dashboard/Balance"
process.env["BALANCE_DASHBOARD_URL"] || "https://eu-central-1.console.aws.amazon.com/cloudwatch/home?region=eu-central-1#dashboards/dashboard/Balance"

export const sendMetrics = async (metrics: status.AllMetrics) => {
const { AlarmReason, InsufficientBalanceThreshold } = status
Expand Down Expand Up @@ -112,9 +113,14 @@ export const sendMetrics = async (metrics: status.AllMetrics) => {
metricData.push({
MetricName: AlarmReason.ToEthereumChannelStale.toString(),
Value: Number(
channel.toEthereum.outbound < channel.toEthereum.inbound ||
(channel.toEthereum.outbound > channel.toEthereum.inbound &&
channel.toEthereum.inbound <= channel.toEthereum.previousInbound)
channel.toEthereum.inbound == channel.toEthereum.previousInbound)
),
})
metricData.push({
MetricName: AlarmReason.ToEthereumChannelAttacked.toString(),
Value: Number(
channel.toEthereum.outbound < channel.toEthereum.inbound
),
})
metricData.push({
Expand Down Expand Up @@ -167,9 +173,20 @@ export const sendMetrics = async (metrics: status.AllMetrics) => {
metricData.push({
MetricName: AlarmReason.ToPolkadotChannelStale.toString(),
Value: Number(
channel.toPolkadot.outbound < channel.toPolkadot.inbound ||
(channel.toPolkadot.outbound > channel.toPolkadot.inbound &&
channel.toPolkadot.inbound <= channel.toPolkadot.previousInbound)
channel.toPolkadot.inbound == channel.toPolkadot.previousInbound)
),
})
metricData.push({
MetricName: AlarmReason.ToPolkadotChannelAttacked.toString(),
Value: Number(
channel.toPolkadot.outbound < channel.toPolkadot.inbound
),
})
metricData.push({
MetricName: AlarmReason.ToPolkadotNoTransfer.toString(),
Value: Number(
channel.toPolkadot.inbound == channel.toPolkadot.previousInbound
),
})
}
Expand Down Expand Up @@ -234,10 +251,12 @@ export const initializeAlarms = async () => {

let client = new CloudWatchClient({})
let cloudWatchAlarms = []
let alarmCommandSharedInput = {
let alarmCommandSharedInput: any = {
Namespace: CLOUD_WATCH_NAME_SPACE + "-" + name,
Threshold: 0,
TreatMissingData: "breaching",
Threshold: 0
}
if(name == "polkadot_mainnet") {
alarmCommandSharedInput.TreatMissingData = "breaching";
}

// Alarm for stale bridge
Expand All @@ -263,7 +282,7 @@ export const initializeAlarms = async () => {
ComparisonOperator: "GreaterThanThreshold",
AlarmActions: [BRIDGE_STALE_SNS_TOPIC],
EvaluationPeriods: 3,
Period: 1800,
Period: 3600,
...alarmCommandSharedInput,
})
)
Expand All @@ -289,10 +308,66 @@ export const initializeAlarms = async () => {
ComparisonOperator: "GreaterThanThreshold",
AlarmActions: [BRIDGE_STALE_SNS_TOPIC],
EvaluationPeriods: 3,
Period: 1800,
Period: 3600,
...alarmCommandSharedInput,
})
)
cloudWatchAlarms.push(
new PutMetricAlarmCommand({
AlarmName: AlarmReason.ToEthereumChannelAttacked.toString() + "-" + name,
MetricName: AlarmReason.ToEthereumChannelAttacked.toString(),
AlarmDescription: LatencyDashboard,
Statistic: "Average",
ComparisonOperator: "GreaterThanThreshold",
AlarmActions: [BRIDGE_ATTACKED_SNS_TOPIC],
EvaluationPeriods: 3,
Period: 3600,
...alarmCommandSharedInput,
})
)
cloudWatchAlarms.push(
new PutMetricAlarmCommand({
AlarmName: AlarmReason.ToPolkadotChannelAttacked.toString() + "-" + name,
MetricName: AlarmReason.ToPolkadotChannelAttacked.toString(),
AlarmDescription: LatencyDashboard,
Statistic: "Average",
ComparisonOperator: "GreaterThanThreshold",
AlarmActions: [BRIDGE_ATTACKED_SNS_TOPIC],
EvaluationPeriods: 3,
Period: 3600,
...alarmCommandSharedInput,
})
)
// For westend, alarm when there is no transfer (i.e. the nonce has not increased) for more than 1 day
if(name == "westend_sepolia") {
cloudWatchAlarms.push(
new PutMetricAlarmCommand({
AlarmName: AlarmReason.ToEthereumNoTransfer.toString() + "-" + name,
MetricName: AlarmReason.ToEthereumNoTransfer.toString(),
AlarmDescription: LatencyDashboard,
Statistic: "Average",
ComparisonOperator: "GreaterThanThreshold",
AlarmActions: [BRIDGE_STALE_SNS_TOPIC],
EvaluationPeriods: 3,
Period: 21600,
...alarmCommandSharedInput,
})
)
cloudWatchAlarms.push(
new PutMetricAlarmCommand({
AlarmName: AlarmReason.ToPolkadotNoTransfer.toString() + "-" + name,
MetricName: AlarmReason.ToPolkadotNoTransfer.toString(),
AlarmDescription: LatencyDashboard,
Statistic: "Average",
ComparisonOperator: "GreaterThanThreshold",
AlarmActions: [BRIDGE_STALE_SNS_TOPIC],
EvaluationPeriods: 3,
Period: 21600,
...alarmCommandSharedInput,
})
)
}

for (let alarm of cloudWatchAlarms) {
await client.send(alarm)
}
Expand All @@ -305,8 +380,8 @@ export const initializeAlarms = async () => {
Statistic: "Average",
ComparisonOperator: "GreaterThanThreshold",
AlarmActions: [ACCOUNT_BALANCE_SNS_TOPIC],
EvaluationPeriods: 1,
Period: 1800,
EvaluationPeriods: 2,
Period: 3600,
...alarmCommandSharedInput,
})
await client.send(accountBalanceAlarm)
Expand Down

0 comments on commit 759118f

Please sign in to comment.