DataDog · blt · Jul 6, 2023 · Jul 5, 2023 · Jul 5, 2023 · Jul 5, 2023
@@ -128,7 +128,10 @@ impl Server {
         use std::{sync::atomic::Ordering, time::Duration};
 
         use metrics::gauge;
-        use procfs::Uptime;
+        use procfs::{
+            process::{Limit, LimitValue},
+            Uptime,
+        };
 
         let target_pid = pid_snd
             .recv()
@@ -141,13 +144,29 @@ impl Server {
         let process = Process::new(target_pid.try_into().expect("PID coercion failed"))
             .map_err(Error::ProcError)?;
 
-        let ticks_per_second: f64 = procfs::ticks_per_second() as f64;
+        let limits = process.limits().map_err(Error::ProcError)?;
+        // NOTE units on the CPU limits are 'CPU / second'
+        let max_cpu_time: Limit = limits.max_cpu_time;
+        let soft_cpu_limit: f64 = match max_cpu_time.soft_limit {
+            LimitValue::Unlimited => f64::MAX,
+            LimitValue::Value(val) => val as f64,
+        };
+        let hard_cpu_limit: f64 = match max_cpu_time.hard_limit {
+            LimitValue::Unlimited => f64::MAX,
+            LimitValue::Value(val) => val as f64,
+        };
+
+        let ticks_per_second: u64 = procfs::ticks_per_second();
         let page_size = procfs::page_size();
 
-        gauge!("ticks_per_second", ticks_per_second);
+        gauge!("ticks_per_second", ticks_per_second as f64);
 
         let mut procfs_delay = tokio::time::interval(Duration::from_secs(1));
 
+        let mut prev_kernel_time_seconds = 0;
+        let mut prev_user_time_seconds = 0;
+        let mut prev_process_uptime_seconds: f64 = 0.0;
+
         loop {
             tokio::select! {
                 _ = procfs_delay.tick() => {
@@ -156,25 +175,55 @@ impl Server {
                         // information from the kernel: computer uptime and
                         // process starttime relative to power-on of the
                         // computer.
-                        let process_starttime_ticks: u64 = parent_stat.starttime;
-                        let process_starttime_seconds: f64 = process_starttime_ticks as f64 / ticks_per_second;
-                        let uptime_seconds: f64 = Uptime::new().expect("could not query uptime").uptime;
-                        let process_uptime_seconds = uptime_seconds - process_starttime_seconds;
-
-                        let cutime: u64 = all_stats.iter().map(|stat| stat.0.cutime).sum::<i64>().unsigned_abs();
-                        let cstime: u64 = all_stats.iter().map(|stat| stat.0.cstime).sum::<i64>().unsigned_abs();
+                        let process_starttime_ticks: u64 = parent_stat.starttime; // ticks after system boot
+                        let process_starttime_seconds: f64 = (process_starttime_ticks as f64) / (ticks_per_second as f64);
+                        let uptime_seconds: f64 = Uptime::new().expect("could not query uptime").uptime; // seconds since boot
+                        let process_uptime_seconds: f64 = uptime_seconds - process_starttime_seconds;
+
+                        // Child process wait time
+                        let cutime: i64 = all_stats.iter().map(|stat| stat.0.cutime).sum();
+                        let cstime: i64 = all_stats.iter().map(|stat| stat.0.cstime).sum();
+                        // Parent process wait time
                         let utime: u64 = all_stats.iter().map(|stat| stat.0.utime).sum();
                         let stime: u64 = all_stats.iter().map(|stat| stat.0.stime).sum();
 
-                        let kernel_time_seconds = (cstime + stime) as f64 / ticks_per_second;
-                        let user_time_seconds = (cutime + utime) as f64 / ticks_per_second;
+                        let kernel_time_seconds: u64 = (cstime.unsigned_abs() + stime)  / ticks_per_second; // CPU
+                        let user_time_seconds: u64 = (cutime.unsigned_abs() + utime)  / ticks_per_second; // CPU
+
+                        let process_uptime_seconds_diff: f64 = process_uptime_seconds - prev_process_uptime_seconds; // second
+                        let kernel_time_seconds_diff = (kernel_time_seconds - prev_kernel_time_seconds) as f64; // CPU
+                        let user_time_seconds_diff = (user_time_seconds - prev_user_time_seconds) as f64; // CPU
+                        let time_seconds_diff = ((kernel_time_seconds + user_time_seconds) - (prev_kernel_time_seconds + prev_user_time_seconds)) as f64; // CPU
+
+                        let kernel_utilization_soft = (kernel_time_seconds_diff / process_uptime_seconds_diff) / soft_cpu_limit;
+                        let kernel_utilization_hard = (kernel_time_seconds_diff / process_uptime_seconds_diff) / hard_cpu_limit;
+                        let user_utilization_soft = (user_time_seconds_diff / process_uptime_seconds_diff) / soft_cpu_limit;
+                        let user_utilization_hard = (user_time_seconds_diff / process_uptime_seconds_diff) / hard_cpu_limit;
+                        let utilization_soft = (time_seconds_diff / process_uptime_seconds_diff) / soft_cpu_limit;
+                        let utilization_hard = (time_seconds_diff / process_uptime_seconds_diff) / hard_cpu_limit;
 
                         // The time spent in kernel-space in seconds.
-                        gauge!("kernel_time_seconds", kernel_time_seconds);
+                        gauge!("kernel_time_seconds", kernel_time_seconds as f64);
                         // The time spent in user-space in seconds.
-                        gauge!("user_time_seconds", user_time_seconds);
+                        gauge!("user_time_seconds", user_time_seconds as f64);
                         // The uptime of the process in fractional seconds.
                         gauge!("uptime_seconds", process_uptime_seconds);
+                        // The utilization of CPU time in kernel-space with regard to soft cgroup CPU/second limit
+                        gauge!("kernel_time_utilization_soft", kernel_utilization_soft);
+                        // The utilization of CPU time in kernel-space with regard to hard cgroup CPU/second limit
+                        gauge!("kernel_time_utilization_hard", kernel_utilization_hard);
+                        // The utilization of CPU time in user-space with regard to soft cgroup CPU/second limit
+                        gauge!("user_time_utilization_soft", user_utilization_soft);
+                        // The utilization of CPU time in user-space with regard to hard cgroup CPU/second limit
+                        gauge!("user_time_utilization_hard", user_utilization_hard);
+                        // The utilization of CPU time in user-space and kernel-space with regard to soft cgroup CPU/second limit
+                        gauge!("cpu_time_utilization_soft", utilization_soft);
+                        // The utilization of CPU time in user-space and kernel-space with regard to hard cgroup CPU/second limit
+                        gauge!("cpu_time_utilization_hard", utilization_hard);
+
+                        prev_kernel_time_seconds = kernel_time_seconds;
+                        prev_user_time_seconds = user_time_seconds;
+                        prev_process_uptime_seconds = process_uptime_seconds;
 
                         let rss: u64 = all_stats.iter().fold(0, |val, stat| val.saturating_add(stat.0.rss));
                         let pss: u64 = all_stats.iter().fold(0, |val, stat| {
@@ -189,7 +238,7 @@ impl Server {
                         let num_threads: u64 = all_stats.iter().map(|stat| stat.0.num_threads).sum::<i64>().unsigned_abs();
 
                         let rss_bytes: u64 = rss*page_size;
-                        RSS_BYTES.store(rss_bytes, Ordering::Relaxed);
+                        RSS_BYTES.store(rss_bytes, Ordering::Relaxed); // stored for the purposes of throttling
 
                         // Number of pages that the process has in real memory.
                         gauge!("rss_bytes", rss_bytes as f64);