Update observer to collect CPU utilization data #616

Merged: 8 commits, Jul 6, 2023
Changes from 6 commits
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -5,8 +5,12 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## Unreleased

+## [0.17.0-rc1]
+
### Changed
- Throttle metrics are now labeled with the respective generator's labels.
+- Observer now calculates CPU utilization with respect to target cgroup hard and soft limits.
+
## [0.16.1]
### Changed
18 changes: 5 additions & 13 deletions Cargo.lock

Some generated files are not rendered by default.

3 changes: 2 additions & 1 deletion Cargo.toml
@@ -3,7 +3,7 @@ members = ["./", "integration/sheepdog", "integration/ducks"]

[package]
name = "lading"
version = "0.16.1"
version = "0.17.0-rc1"
authors = ["Brian L. Troutwine <[email protected]>"]
edition = "2021"
license = "MIT"
@@ -36,6 +36,7 @@ metrics-exporter-prometheus = { version = "0.12.1", default-features = false, fe
] }
metrics-util = { version = "0.15" }
nix = { version = "0.26" }
+num_cpus = { version = "1.16" }
once_cell = "1.18"
opentelemetry-proto = { git = "https://github.com/open-telemetry/opentelemetry-rust/", rev = "6078e32", features = [
"traces",
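The observer diff below does not itself call the new `num_cpus` dependency (it uses `procfs::CpuInfo::num_cores`), so how later commits use the crate is an assumption here. As a minimal sketch of what it offers: on Linux, `num_cpus::get()` accounts for sched_setaffinity masks and cgroup CPU quotas, which would line up with the CHANGELOG note about cgroup limits.

```rust
// Standalone sketch, not part of this PR's diff: what num_cpus reports.
fn main() {
    // Logical CPUs available to this process; on Linux this honors
    // CPU affinity masks and cgroup CPU quotas.
    println!("available CPUs: {}", num_cpus::get());
    // Physical cores, ignoring affinity and quota limits.
    println!("physical cores: {}", num_cpus::get_physical());
}
```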
84 changes: 63 additions & 21 deletions src/observer.rs
@@ -122,12 +122,17 @@ impl Server {
/// # Panics
///
/// None are known.
-#[allow(clippy::similar_names)]
+#[allow(
+    clippy::similar_names,
+    clippy::too_many_lines,
+    clippy::cast_possible_truncation,
+    clippy::cast_sign_loss
+)]
#[cfg(target_os = "linux")]
pub async fn run(mut self, mut pid_snd: TargetPidReceiver) -> Result<(), Error> {
use std::{sync::atomic::Ordering, time::Duration};

-use metrics::gauge;
+use metrics::{gauge, register_counter, register_gauge};
use procfs::Uptime;

let target_pid = pid_snd
@@ -141,13 +146,29 @@ impl Server {
let process = Process::new(target_pid.try_into().expect("PID coercion failed"))
.map_err(Error::ProcError)?;

-let ticks_per_second: f64 = procfs::ticks_per_second() as f64;
+let num_cores = procfs::CpuInfo::new()
+    .map_err(Error::ProcError)?
+    .num_cores() as u64; // Cores
+
+let ticks_per_second: u64 = procfs::ticks_per_second(); // CPU-ticks / second
let page_size = procfs::page_size();

-gauge!("ticks_per_second", ticks_per_second);
+gauge!("core_total", num_cores as f64);
+gauge!("ticks_per_second", ticks_per_second as f64);

let mut procfs_delay = tokio::time::interval(Duration::from_secs(1));

+let mut prev_kernel_time_ticks = 0;
+let mut prev_user_time_ticks = 0;
+let mut prev_process_uptime_ticks = 0;
+
+let kernel_ticks_counter = register_counter!("kernel_ticks");
+let user_ticks_counter = register_counter!("user_ticks");
+let target_uptime_ticks_counter = register_counter!("target_uptime_ticks");
+let cpu_utilization_gauge = register_gauge!("cpu_utilization");
+let kernel_cpu_utilization_gauge = register_gauge!("kernel_cpu_utilization");
+let user_cpu_utilization_gauge = register_gauge!("user_cpu_utilization");

loop {
tokio::select! {
_ = procfs_delay.tick() => {
@@ -156,25 +177,46 @@
// information from the kernel: computer uptime and
// process starttime relative to power-on of the
// computer.
-let process_starttime_ticks: u64 = parent_stat.starttime;
-let process_starttime_seconds: f64 = process_starttime_ticks as f64 / ticks_per_second;
-let uptime_seconds: f64 = Uptime::new().expect("could not query uptime").uptime;
-let process_uptime_seconds = uptime_seconds - process_starttime_seconds;
-
-let cutime: u64 = all_stats.iter().map(|stat| stat.0.cutime).sum::<i64>().unsigned_abs();
-let cstime: u64 = all_stats.iter().map(|stat| stat.0.cstime).sum::<i64>().unsigned_abs();
+let process_starttime_ticks: u64 = parent_stat.starttime; // ticks after system boot
+let uptime_seconds: f64 = Uptime::new().expect("could not query uptime").uptime; // seconds since boot
+let uptime_ticks: u64 = uptime_seconds.round() as u64 * ticks_per_second; // CPU-ticks since boot
+let process_uptime_ticks: u64 = uptime_ticks - process_starttime_ticks;
+
+// Child process wait time
+let cutime: i64 = all_stats.iter().map(|stat| stat.0.cutime).sum();
+let cstime: i64 = all_stats.iter().map(|stat| stat.0.cstime).sum();
+// Parent process wait time
let utime: u64 = all_stats.iter().map(|stat| stat.0.utime).sum();
let stime: u64 = all_stats.iter().map(|stat| stat.0.stime).sum();

-let kernel_time_seconds = (cstime + stime) as f64 / ticks_per_second;
-let user_time_seconds = (cutime + utime) as f64 / ticks_per_second;
-
-// The time spent in kernel-space in seconds.
-gauge!("kernel_time_seconds", kernel_time_seconds);
-// The time spent in user-space in seconds.
-gauge!("user_time_seconds", user_time_seconds);
-// The uptime of the process in fractional seconds.
-gauge!("uptime_seconds", process_uptime_seconds);
+let kernel_time_ticks: u64 = cstime.unsigned_abs() + stime; // CPU-ticks
+let user_time_ticks: u64 = cutime.unsigned_abs() + utime; // CPU-ticks
+
+let process_uptime_ticks_diff = process_uptime_ticks - prev_process_uptime_ticks; // CPU-ticks
+let kernel_time_ticks_diff = kernel_time_ticks - prev_kernel_time_ticks; // CPU-ticks
+let user_time_ticks_diff = user_time_ticks - prev_user_time_ticks; // CPU-ticks
+let time_ticks_diff = (kernel_time_ticks + user_time_ticks) - (prev_kernel_time_ticks + prev_user_time_ticks); // CPU-ticks
+
+let user_utilization = (user_time_ticks_diff * num_cores) as f64 / process_uptime_ticks_diff as f64; // Cores
+let kernel_utilization = (kernel_time_ticks_diff * num_cores) as f64 / process_uptime_ticks_diff as f64; // Cores
+let cpu_utilization = (time_ticks_diff * num_cores) as f64 / process_uptime_ticks_diff as f64; // Cores
+
+// The time spent in kernel-space in ticks.
+kernel_ticks_counter.absolute(kernel_time_ticks);
+// The time spent in user-space in ticks.
+user_ticks_counter.absolute(user_time_ticks);
+// The uptime of the process in CPU ticks.
+target_uptime_ticks_counter.absolute(process_uptime_ticks);
+// The utilization of available CPU cores in user and kernel space.
+cpu_utilization_gauge.set(cpu_utilization);
+// The utilization of available CPU cores in user space.
+user_cpu_utilization_gauge.set(user_utilization);
+// The utilization of available CPU cores in kernel space.
+kernel_cpu_utilization_gauge.set(kernel_utilization);
+
+prev_kernel_time_ticks = kernel_time_ticks;
+prev_user_time_ticks = user_time_ticks;
+prev_process_uptime_ticks = process_uptime_ticks;

let rss: u64 = all_stats.iter().fold(0, |val, stat| val.saturating_add(stat.0.rss));
let pss: u64 = all_stats.iter().fold(0, |val, stat| {
@@ -189,7 +231,7 @@ impl Server {
let num_threads: u64 = all_stats.iter().map(|stat| stat.0.num_threads).sum::<i64>().unsigned_abs();

let rss_bytes: u64 = rss*page_size;
-RSS_BYTES.store(rss_bytes, Ordering::Relaxed);
+RSS_BYTES.store(rss_bytes, Ordering::Relaxed); // stored for the purposes of throttling

// Number of pages that the process has in real memory.
gauge!("rss_bytes", rss_bytes as f64);
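For reference, a standalone sketch of the delta-based utilization arithmetic the new sampling loop performs each tick. `Sample` and its fields are hypothetical stand-ins for the procfs reads above; the formulas mirror the diff's computation.

```rust
/// Cumulative tick counters read once per sampling interval. A hypothetical
/// stand-in for the procfs reads in the diff above.
struct Sample {
    kernel_ticks: u64, // cstime + stime, summed over target and children
    user_ticks: u64,   // cutime + utime, summed over target and children
    uptime_ticks: u64, // CPU-ticks elapsed since the target process started
}

/// Mirrors the diff's math: utilization = Δbusy-ticks * num_cores / Δuptime-ticks.
/// Returns (kernel, user, total) utilization in units of cores.
fn utilization(prev: &Sample, curr: &Sample, num_cores: u64) -> (f64, f64, f64) {
    let uptime_diff = curr.uptime_ticks - prev.uptime_ticks;
    let kernel_diff = curr.kernel_ticks - prev.kernel_ticks;
    let user_diff = curr.user_ticks - prev.user_ticks;
    let total_diff = kernel_diff + user_diff;

    (
        (kernel_diff * num_cores) as f64 / uptime_diff as f64,
        (user_diff * num_cores) as f64 / uptime_diff as f64,
        (total_diff * num_cores) as f64 / uptime_diff as f64,
    )
}

fn main() {
    // One second at 100 ticks/second on a 4-core box: the target burned
    // 10 kernel ticks and 30 user ticks while 100 wall-clock ticks elapsed.
    let prev = Sample { kernel_ticks: 0, user_ticks: 0, uptime_ticks: 0 };
    let curr = Sample { kernel_ticks: 10, user_ticks: 30, uptime_ticks: 100 };
    let (kernel, user, total) = utilization(&prev, &curr, 4);
    println!("kernel={kernel} user={user} total={total}"); // 0.4 1.2 1.6
}
```

Because the tick counters are cumulative, only the deltas accumulated since the previous one-second sample feed the utilization gauges, while the raw totals are still exported through the `absolute` counters.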