From 1e92dabc12e93af3fc31c8a2d0540fa5e566089a Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Tue, 16 Jul 2024 17:49:35 +0900 Subject: [PATCH] Adding logging and metrics related to shard throughputs. --- quickwit/quickwit-common/src/metrics.rs | 6 +++--- .../quickwit-config/src/node_config/mod.rs | 6 +++++- .../quickwit-ingest/src/ingest_v2/broadcast.rs | 16 ++++++++++++---- .../quickwit-ingest/src/ingest_v2/metrics.rs | 18 ++++++++++++++++-- 4 files changed, 36 insertions(+), 10 deletions(-) diff --git a/quickwit/quickwit-common/src/metrics.rs b/quickwit/quickwit-common/src/metrics.rs index da34163828c..97341d30c67 100644 --- a/quickwit/quickwit-common/src/metrics.rs +++ b/quickwit/quickwit-common/src/metrics.rs @@ -22,9 +22,9 @@ use std::sync::OnceLock; use once_cell::sync::Lazy; pub use prometheus::{ - exponential_buckets, Histogram, HistogramTimer, HistogramVec as PrometheusHistogramVec, - IntCounter, IntCounterVec as PrometheusIntCounterVec, IntGauge, - IntGaugeVec as PrometheusIntGaugeVec, + exponential_buckets, linear_buckets, Histogram, HistogramTimer, + HistogramVec as PrometheusHistogramVec, IntCounter, IntCounterVec as PrometheusIntCounterVec, + IntGauge, IntGaugeVec as PrometheusIntGaugeVec, }; use prometheus::{Gauge, HistogramOpts, Opts, TextEncoder}; diff --git a/quickwit/quickwit-config/src/node_config/mod.rs b/quickwit/quickwit-config/src/node_config/mod.rs index 86e724e0cc8..725b69d8556 100644 --- a/quickwit/quickwit-config/src/node_config/mod.rs +++ b/quickwit/quickwit-config/src/node_config/mod.rs @@ -35,7 +35,7 @@ use quickwit_common::uri::Uri; use quickwit_proto::indexing::CpuCapacity; use quickwit_proto::types::NodeId; use serde::{Deserialize, Serialize}; -use tracing::warn; +use tracing::{info, warn}; use crate::node_config::serialize::load_node_config_with_env; use crate::service::QuickwitService; @@ -325,6 +325,10 @@ impl IngestApiConfig { self.max_queue_disk_usage, self.max_queue_memory_usage ); + info!( + "ingestion shard throughput limit: {:?}", + self.shard_throughput_limit + ); ensure!( self.shard_throughput_limit >= ByteSize::mib(1) && self.shard_throughput_limit <= ByteSize::mib(20), diff --git a/quickwit/quickwit-ingest/src/ingest_v2/broadcast.rs b/quickwit/quickwit-ingest/src/ingest_v2/broadcast.rs index 526a8dae81a..921fa538231 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/broadcast.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/broadcast.rs @@ -198,12 +198,19 @@ impl ShardThroughputTimeSeriesMap { let mut per_source_shard_infos: BTreeMap = BTreeMap::new(); for ((source_uid, shard_id), shard_time_series) in self.shard_time_series.iter() { let shard_state = shard_time_series.shard_state; - let short_term_ingestion_rate_mib_per_sec_u64 = - shard_time_series.last().as_u64() / ONE_MIB.as_u64(); + let short_term_ingestion_rate_mib_per_sec_u64: u64 = + shard_time_series.last().as_u64().div_ceil(ONE_MIB.as_u64()); + let long_term_ingestion_rate_mib_per_sec_u64: u64 = + shard_time_series.average().as_u64().div_ceil(ONE_MIB.as_u64()); + INGEST_V2_METRICS + .shard_st_throughput_mib + .observe(short_term_ingestion_rate_mib_per_sec_u64 as f64); + INGEST_V2_METRICS + .shard_lt_throughput_mib + .observe(long_term_ingestion_rate_mib_per_sec_u64 as f64); + let short_term_ingestion_rate = RateMibPerSec(short_term_ingestion_rate_mib_per_sec_u64 as u16); - let long_term_ingestion_rate_mib_per_sec_u64 = - shard_time_series.average().as_u64() / ONE_MIB.as_u64(); let long_term_ingestion_rate = RateMibPerSec(long_term_ingestion_rate_mib_per_sec_u64 as u16); let shard_info = ShardInfo { @@ -212,6 +219,7 @@ impl ShardThroughputTimeSeriesMap { short_term_ingestion_rate, long_term_ingestion_rate, }; + per_source_shard_infos .entry(source_uid.clone()) .or_default() diff --git a/quickwit/quickwit-ingest/src/ingest_v2/metrics.rs b/quickwit/quickwit-ingest/src/ingest_v2/metrics.rs index c140f2e9d4b..f0c0ce649dc 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/metrics.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/metrics.rs @@ -20,14 +20,16 @@ use mrecordlog::ResourceUsage; use once_cell::sync::Lazy; use quickwit_common::metrics::{ - exponential_buckets, new_counter_vec, new_gauge, new_gauge_vec, new_histogram_vec, - HistogramVec, IntCounterVec, IntGauge, IntGaugeVec, + exponential_buckets, linear_buckets, new_counter_vec, new_gauge, new_gauge_vec, new_histogram, + new_histogram_vec, Histogram, HistogramVec, IntCounterVec, IntGauge, IntGaugeVec, }; pub(super) struct IngestV2Metrics { pub reset_shards_operations_total: IntCounterVec<1>, pub open_shards: IntGauge, pub closed_shards: IntGauge, + pub shard_lt_throughput_mib: Histogram, + pub shard_st_throughput_mib: Histogram, pub wal_acquire_lock_requests_in_flight: IntGaugeVec<2>, pub wal_acquire_lock_request_duration_secs: HistogramVec<2>, pub wal_disk_used_bytes: IntGauge, @@ -56,6 +58,18 @@ impl Default for IngestV2Metrics { "ingest", &[("state", "closed")], ), + shard_lt_throughput_mib: new_histogram( + "shard_lt_throughput_mib", + "Shard long term throughput as reported through chitchat", + "ingest", + linear_buckets(0.0f64, 1.0f64, 15).unwrap(), + ), + shard_st_throughput_mib: new_histogram( + "shard_st_throughput_mib", + "Shard short term throughput as reported through chitchat", + "ingest", + linear_buckets(0.0f64, 1.0f64, 15).unwrap(), + ), wal_acquire_lock_requests_in_flight: new_gauge_vec( "wal_acquire_lock_requests_in_flight", "Number of acquire lock requests in-flight.",