-
Notifications
You must be signed in to change notification settings - Fork 11
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Update observer to collect CPU utilization data #616
Merged
Merged
Changes from 1 commit
Commits
Show all changes
8 commits
Select commit
Hold shift + click to select a range
a6e3e6a
Update observer to collect CPU utilization data
blt b4723bc
PR feedback
blt ac005df
Update unit comments
blt 67af413
0.16.2-rc1
blt 87dff92
Perform utilization calculation in terms of ticks
blt 9a28083
clippy dings
blt ee286f3
adjust utilization to Agent percentage method
blt 914a20d
logical not physical cores
blt File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -128,7 +128,10 @@ impl Server { | |
use std::{sync::atomic::Ordering, time::Duration}; | ||
|
||
use metrics::gauge; | ||
use procfs::Uptime; | ||
use procfs::{ | ||
process::{Limit, LimitValue}, | ||
Uptime, | ||
}; | ||
|
||
let target_pid = pid_snd | ||
.recv() | ||
|
@@ -141,13 +144,29 @@ impl Server { | |
let process = Process::new(target_pid.try_into().expect("PID coercion failed")) | ||
.map_err(Error::ProcError)?; | ||
|
||
let ticks_per_second: f64 = procfs::ticks_per_second() as f64; | ||
let limits = process.limits().map_err(Error::ProcError)?; | ||
// NOTE units on the CPU limits are 'CPU / second' | ||
let max_cpu_time: Limit = limits.max_cpu_time; | ||
let soft_cpu_limit: f64 = match max_cpu_time.soft_limit { | ||
LimitValue::Unlimited => f64::MAX, | ||
LimitValue::Value(val) => val as f64, | ||
}; | ||
let hard_cpu_limit: f64 = match max_cpu_time.hard_limit { | ||
LimitValue::Unlimited => f64::MAX, | ||
LimitValue::Value(val) => val as f64, | ||
}; | ||
|
||
let ticks_per_second: u64 = procfs::ticks_per_second(); | ||
let page_size = procfs::page_size(); | ||
|
||
gauge!("ticks_per_second", ticks_per_second); | ||
gauge!("ticks_per_second", ticks_per_second as f64); | ||
|
||
let mut procfs_delay = tokio::time::interval(Duration::from_secs(1)); | ||
|
||
let mut prev_kernel_time_seconds = 0; | ||
let mut prev_user_time_seconds = 0; | ||
let mut prev_process_uptime_seconds: f64 = 0.0; | ||
|
||
loop { | ||
tokio::select! { | ||
_ = procfs_delay.tick() => { | ||
|
@@ -156,25 +175,55 @@ impl Server { | |
// information from the kernel: computer uptime and | ||
// process starttime relative to power-on of the | ||
// computer. | ||
let process_starttime_ticks: u64 = parent_stat.starttime; | ||
let process_starttime_seconds: f64 = process_starttime_ticks as f64 / ticks_per_second; | ||
let uptime_seconds: f64 = Uptime::new().expect("could not query uptime").uptime; | ||
let process_uptime_seconds = uptime_seconds - process_starttime_seconds; | ||
|
||
let cutime: u64 = all_stats.iter().map(|stat| stat.0.cutime).sum::<i64>().unsigned_abs(); | ||
let cstime: u64 = all_stats.iter().map(|stat| stat.0.cstime).sum::<i64>().unsigned_abs(); | ||
let process_starttime_ticks: u64 = parent_stat.starttime; // ticks after system boot | ||
let process_starttime_seconds: f64 = (process_starttime_ticks as f64) / (ticks_per_second as f64); | ||
let uptime_seconds: f64 = Uptime::new().expect("could not query uptime").uptime; // seconds since boot | ||
let process_uptime_seconds: f64 = uptime_seconds - process_starttime_seconds; | ||
|
||
// Child process wait time | ||
let cutime: i64 = all_stats.iter().map(|stat| stat.0.cutime).sum(); | ||
let cstime: i64 = all_stats.iter().map(|stat| stat.0.cstime).sum(); | ||
// Parent process wait time | ||
let utime: u64 = all_stats.iter().map(|stat| stat.0.utime).sum(); | ||
let stime: u64 = all_stats.iter().map(|stat| stat.0.stime).sum(); | ||
|
||
let kernel_time_seconds = (cstime + stime) as f64 / ticks_per_second; | ||
let user_time_seconds = (cutime + utime) as f64 / ticks_per_second; | ||
let kernel_time_seconds: u64 = (cstime.unsigned_abs() + stime) / ticks_per_second; // CPU | ||
let user_time_seconds: u64 = (cutime.unsigned_abs() + utime) / ticks_per_second; // CPU | ||
|
||
let process_uptime_seconds_diff: f64 = process_uptime_seconds - prev_process_uptime_seconds; // second | ||
let kernel_time_seconds_diff = (kernel_time_seconds - prev_kernel_time_seconds) as f64; // CPU | ||
let user_time_seconds_diff = (user_time_seconds - prev_user_time_seconds) as f64; // CPU | ||
let time_seconds_diff = ((kernel_time_seconds + user_time_seconds) - (prev_kernel_time_seconds + prev_user_time_seconds)) as f64; // CPU | ||
blt marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
let kernel_utilization_soft = (kernel_time_seconds_diff / process_uptime_seconds_diff) / soft_cpu_limit; | ||
let kernel_utilization_hard = (kernel_time_seconds_diff / process_uptime_seconds_diff) / hard_cpu_limit; | ||
let user_utilization_soft = (user_time_seconds_diff / process_uptime_seconds_diff) / soft_cpu_limit; | ||
let user_utilization_hard = (user_time_seconds_diff / process_uptime_seconds_diff) / hard_cpu_limit; | ||
let utilization_soft = (time_seconds_diff / process_uptime_seconds_diff) / soft_cpu_limit; | ||
let utilization_hard = (time_seconds_diff / process_uptime_seconds_diff) / hard_cpu_limit; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Related to my comment above asking about the units of |
||
|
||
// The time spent in kernel-space in seconds. | ||
gauge!("kernel_time_seconds", kernel_time_seconds); | ||
gauge!("kernel_time_seconds", kernel_time_seconds as f64); | ||
// The time spent in user-space in seconds. | ||
gauge!("user_time_seconds", user_time_seconds); | ||
gauge!("user_time_seconds", user_time_seconds as f64); | ||
// The uptime of the process in fractional seconds. | ||
gauge!("uptime_seconds", process_uptime_seconds); | ||
// The utilization of CPU time in kernel-space with regard to soft cgroup CPU/second limit | ||
gauge!("kernel_time_utilization_soft", kernel_utilization_soft); | ||
// The utilization of CPU time in kernel-space with regard to hard cgroup CPU/second limit | ||
gauge!("kernel_time_utilization_hard", kernel_utilization_hard); | ||
// The utilization of CPU time in user-space with regard to soft cgroup CPU/second limit | ||
gauge!("user_time_utilization_soft", user_utilization_soft); | ||
// The utilization of CPU time in user-space with regard to hard cgroup CPU/second limit | ||
gauge!("user_time_utilization_hard", user_utilization_hard); | ||
// The utilization of CPU time in user-space and kernel-space with regard to soft cgroup CPU/second limit | ||
gauge!("cpu_time_utilization_soft", utilization_soft); | ||
// The utilization of CPU time in user-space and kernel-space with regard to hard cgroup CPU/second limit | ||
gauge!("cpu_time_utilization_hard", utilization_hard); | ||
blt marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
prev_kernel_time_seconds = kernel_time_seconds; | ||
prev_user_time_seconds = user_time_seconds; | ||
prev_process_uptime_seconds = process_uptime_seconds; | ||
|
||
let rss: u64 = all_stats.iter().fold(0, |val, stat| val.saturating_add(stat.0.rss)); | ||
let pss: u64 = all_stats.iter().fold(0, |val, stat| { | ||
|
@@ -189,7 +238,7 @@ impl Server { | |
let num_threads: u64 = all_stats.iter().map(|stat| stat.0.num_threads).sum::<i64>().unsigned_abs(); | ||
|
||
let rss_bytes: u64 = rss*page_size; | ||
RSS_BYTES.store(rss_bytes, Ordering::Relaxed); | ||
RSS_BYTES.store(rss_bytes, Ordering::Relaxed); // stored for the purposes of throttling | ||
|
||
// Number of pages that the process has in real memory. | ||
gauge!("rss_bytes", rss_bytes as f64); | ||
|
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does this value come from the `RLIMIT_CPU` field of the `struct rlimit` queried by the `getrlimit` syscall (man page)? I'm wondering if the units of this value are seconds or CPU*seconds.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's read out of `/proc/pid/limits`, see https://docs.rs/procfs/0.15.1/src/procfs/process/limit.rs.html. So yeah, `RLIMIT_CPU`. "This is a limit, in seconds, on the amount of CPU time that the process can consume." Which I think would be, in the notation I'm using here, CPU units per second.