Update observer to collect CPU utilization data #616

Merged: 8 commits, Jul 6, 2023
Changes from 6 commits
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -5,8 +5,12 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## Unreleased

+## [0.17.0-rc1]
+
### Changed
- Throttle metrics are now labeled with the respective generator's labels.
+- Observer now calculates CPU utilization with respect to target cgroup hard and soft limits.
+
## [0.16.1]
### Changed
18 changes: 5 additions & 13 deletions Cargo.lock

Some generated files are not rendered by default.

3 changes: 2 additions & 1 deletion Cargo.toml
@@ -3,7 +3,7 @@ members = ["./", "integration/sheepdog", "integration/ducks"]

[package]
name = "lading"
version = "0.16.1"
version = "0.17.0-rc1"
authors = ["Brian L. Troutwine <[email protected]>"]
edition = "2021"
license = "MIT"
@@ -36,6 +36,7 @@ metrics-exporter-prometheus = { version = "0.12.1", default-features = false, fe
] }
metrics-util = { version = "0.15" }
nix = { version = "0.26" }
+num_cpus = { version = "1.16" }
once_cell = "1.18"
opentelemetry-proto = { git = "https://github.com/open-telemetry/opentelemetry-rust/", rev = "6078e32", features = [
"traces",
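The observer diff below does not itself call the new `num_cpus` dependency (it uses `procfs::CpuInfo::num_cores`), so how later commits use the crate is an assumption here. As a minimal sketch of what it offers: on Linux, `num_cpus::get()` accounts for sched_setaffinity masks and cgroup CPU quotas, which would line up with the CHANGELOG note about cgroup limits.

```rust
// Standalone sketch, not part of this PR's diff: what num_cpus reports.
fn main() {
    // Logical CPUs available to this process; on Linux this honors
    // CPU affinity masks and cgroup CPU quotas.
    println!("available CPUs: {}", num_cpus::get());
    // Physical cores, ignoring affinity and quota limits.
    println!("physical cores: {}", num_cpus::get_physical());
}
```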
84 changes: 63 additions & 21 deletions src/observer.rs
@@ -122,12 +122,17 @@ impl Server {
/// # Panics
///
/// None are known.
-#[allow(clippy::similar_names)]
+#[allow(
+    clippy::similar_names,
+    clippy::too_many_lines,
+    clippy::cast_possible_truncation,
+    clippy::cast_sign_loss
+)]
#[cfg(target_os = "linux")]
pub async fn run(mut self, mut pid_snd: TargetPidReceiver) -> Result<(), Error> {
use std::{sync::atomic::Ordering, time::Duration};

-use metrics::gauge;
+use metrics::{gauge, register_counter, register_gauge};
use procfs::Uptime;

let target_pid = pid_snd
@@ -141,13 +146,29 @@ impl Server {
let process = Process::new(target_pid.try_into().expect("PID coercion failed"))
.map_err(Error::ProcError)?;

-let ticks_per_second: f64 = procfs::ticks_per_second() as f64;
+let num_cores = procfs::CpuInfo::new()
+    .map_err(Error::ProcError)?
+    .num_cores() as u64; // Cores
+
+let ticks_per_second: u64 = procfs::ticks_per_second(); // CPU-ticks / second
let page_size = procfs::page_size();

-gauge!("ticks_per_second", ticks_per_second);
+gauge!("core_total", num_cores as f64);
+gauge!("ticks_per_second", ticks_per_second as f64);

let mut procfs_delay = tokio::time::interval(Duration::from_secs(1));

+let mut prev_kernel_time_ticks = 0;
+let mut prev_user_time_ticks = 0;
+let mut prev_process_uptime_ticks = 0;
+
+let kernel_ticks_counter = register_counter!("kernel_ticks");
+let user_ticks_counter = register_counter!("user_ticks");
+let target_uptime_ticks_counter = register_counter!("target_uptime_ticks");
+let cpu_utilization_gauge = register_gauge!("cpu_utilization");
+let kernel_cpu_utilization_gauge = register_gauge!("kernel_cpu_utilization");
+let user_cpu_utilization_gauge = register_gauge!("user_cpu_utilization");

loop {
tokio::select! {
_ = procfs_delay.tick() => {
@@ -156,25 +177,46 @@
// information from the kernel: computer uptime and
// process starttime relative to power-on of the
// computer.
-let process_starttime_ticks: u64 = parent_stat.starttime;
-let process_starttime_seconds: f64 = process_starttime_ticks as f64 / ticks_per_second;
-let uptime_seconds: f64 = Uptime::new().expect("could not query uptime").uptime;
-let process_uptime_seconds = uptime_seconds - process_starttime_seconds;
-
-let cutime: u64 = all_stats.iter().map(|stat| stat.0.cutime).sum::<i64>().unsigned_abs();
-let cstime: u64 = all_stats.iter().map(|stat| stat.0.cstime).sum::<i64>().unsigned_abs();
+let process_starttime_ticks: u64 = parent_stat.starttime; // ticks after system boot
+let uptime_seconds: f64 = Uptime::new().expect("could not query uptime").uptime; // seconds since boot
+let uptime_ticks: u64 = uptime_seconds.round() as u64 * ticks_per_second; // CPU-ticks since boot
+let process_uptime_ticks: u64 = uptime_ticks - process_starttime_ticks;
+
+// Child process wait time
+let cutime: i64 = all_stats.iter().map(|stat| stat.0.cutime).sum();
+let cstime: i64 = all_stats.iter().map(|stat| stat.0.cstime).sum();
+// Parent process wait time
let utime: u64 = all_stats.iter().map(|stat| stat.0.utime).sum();
let stime: u64 = all_stats.iter().map(|stat| stat.0.stime).sum();

-let kernel_time_seconds = (cstime + stime) as f64 / ticks_per_second;
-let user_time_seconds = (cutime + utime) as f64 / ticks_per_second;
-
-// The time spent in kernel-space in seconds.
-gauge!("kernel_time_seconds", kernel_time_seconds);
-// The time spent in user-space in seconds.
-gauge!("user_time_seconds", user_time_seconds);
-// The uptime of the process in fractional seconds.
-gauge!("uptime_seconds", process_uptime_seconds);
+let kernel_time_ticks: u64 = cstime.unsigned_abs() + stime; // CPU-ticks
+let user_time_ticks: u64 = cutime.unsigned_abs() + utime; // CPU-ticks
+
+let process_uptime_ticks_diff = process_uptime_ticks - prev_process_uptime_ticks; // CPU-ticks
+let kernel_time_ticks_diff = kernel_time_ticks - prev_kernel_time_ticks; // CPU-ticks
+let user_time_ticks_diff = user_time_ticks - prev_user_time_ticks; // CPU-ticks
+let time_ticks_diff = (kernel_time_ticks + user_time_ticks) - (prev_kernel_time_ticks + prev_user_time_ticks); // CPU-ticks
+
+let user_utilization = (user_time_ticks_diff * num_cores) as f64 / process_uptime_ticks_diff as f64; // Cores
+let kernel_utilization = (kernel_time_ticks_diff * num_cores) as f64 / process_uptime_ticks_diff as f64; // Cores
+let cpu_utilization = (time_ticks_diff * num_cores) as f64 / process_uptime_ticks_diff as f64; // Cores
+
+// The time spent in kernel-space in ticks.
+kernel_ticks_counter.absolute(kernel_time_ticks);
+// The time spent in user-space in ticks.
+user_ticks_counter.absolute(user_time_ticks);
+// The uptime of the process in CPU ticks.
+target_uptime_ticks_counter.absolute(process_uptime_ticks);
+// The utilization of available CPU cores in user and kernel space.
+cpu_utilization_gauge.set(cpu_utilization);
+// The utilization of available CPU cores in user space.
+user_cpu_utilization_gauge.set(user_utilization);
+// The utilization of available CPU cores in kernel space.
+kernel_cpu_utilization_gauge.set(kernel_utilization);
+
+prev_kernel_time_ticks = kernel_time_ticks;
+prev_user_time_ticks = user_time_ticks;
+prev_process_uptime_ticks = process_uptime_ticks;

let rss: u64 = all_stats.iter().fold(0, |val, stat| val.saturating_add(stat.0.rss));
let pss: u64 = all_stats.iter().fold(0, |val, stat| {
@@ -189,7 +231,7 @@ impl Server {
let num_threads: u64 = all_stats.iter().map(|stat| stat.0.num_threads).sum::<i64>().unsigned_abs();

let rss_bytes: u64 = rss*page_size;
-RSS_BYTES.store(rss_bytes, Ordering::Relaxed);
+RSS_BYTES.store(rss_bytes, Ordering::Relaxed); // stored for the purposes of throttling

// Number of pages that the process has in real memory.
gauge!("rss_bytes", rss_bytes as f64);
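For reference, a standalone sketch of the delta-based utilization arithmetic the new sampling loop performs each tick. `Sample` and its fields are hypothetical stand-ins for the procfs reads above; the formulas mirror the diff's computation.

```rust
/// Cumulative tick counters read once per sampling interval. A hypothetical
/// stand-in for the procfs reads in the diff above.
struct Sample {
    kernel_ticks: u64, // cstime + stime, summed over target and children
    user_ticks: u64,   // cutime + utime, summed over target and children
    uptime_ticks: u64, // CPU-ticks elapsed since the target process started
}

/// Mirrors the diff's math: utilization = Δbusy-ticks * num_cores / Δuptime-ticks.
/// Returns (kernel, user, total) utilization in units of cores.
fn utilization(prev: &Sample, curr: &Sample, num_cores: u64) -> (f64, f64, f64) {
    let uptime_diff = curr.uptime_ticks - prev.uptime_ticks;
    let kernel_diff = curr.kernel_ticks - prev.kernel_ticks;
    let user_diff = curr.user_ticks - prev.user_ticks;
    let total_diff = kernel_diff + user_diff;

    (
        (kernel_diff * num_cores) as f64 / uptime_diff as f64,
        (user_diff * num_cores) as f64 / uptime_diff as f64,
        (total_diff * num_cores) as f64 / uptime_diff as f64,
    )
}

fn main() {
    // One second at 100 ticks/second on a 4-core box: the target burned
    // 10 kernel ticks and 30 user ticks while 100 wall-clock ticks elapsed.
    let prev = Sample { kernel_ticks: 0, user_ticks: 0, uptime_ticks: 0 };
    let curr = Sample { kernel_ticks: 10, user_ticks: 30, uptime_ticks: 100 };
    let (kernel, user, total) = utilization(&prev, &curr, 4);
    println!("kernel={kernel} user={user} total={total}"); // 0.4 1.2 1.6
}
```

Because the tick counters are cumulative, only the deltas accumulated since the previous one-second sample feed the utilization gauges, while the raw totals are still exported through the `absolute` counters.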