From a06ec4dac991fe59bff9a3ca0c3bd3d769965b37 Mon Sep 17 00:00:00 2001
From: hkctkuy
Date: Thu, 7 Dec 2023 17:38:41 +0300
Subject: [PATCH] Add cluster estimation

---
 casr/src/bin/casr-cluster.rs | 77 ++++++++++++++++++++++++++++++++++++
 casr/tests/tests.rs          | 16 +++++++-
 docs/usage.md                |  2 +
 libcasr/src/stacktrace.rs    | 55 ++++++++++++++++++++++++++
 4 files changed, 149 insertions(+), 1 deletion(-)

diff --git a/casr/src/bin/casr-cluster.rs b/casr/src/bin/casr-cluster.rs
index 49113dc3..9cb57303 100644
--- a/casr/src/bin/casr-cluster.rs
+++ b/casr/src/bin/casr-cluster.rs
@@ -481,6 +481,69 @@ fn update_clusters(
     Ok((added, duplicates, deduplicated, result, before, after))
 }
 
+/// Calculate silhouette coefficient
+///
+/// # Arguments
+///
+/// * `dir` - path to directory with CASR report clusters
+///
+/// * `jobs` - number of jobs for calculating process
+///
+/// # Return value
+///
+/// Mean silhouette coefficient of all casreps, error if `dir` has less than two clusters
+fn get_sil(dir: &Path, jobs: usize) -> Result<f64> {
+    // Get cluster dirs
+    let mut dirs: Vec<PathBuf> = Vec::new();
+    for entry in fs::read_dir(dir)? {
+        let path = entry?.path();
+        let is_cluster = path
+            .file_name()
+            .and_then(|name| name.to_str())
+            .map_or(false, |name| name.starts_with("cl"));
+        if is_cluster {
+            dirs.push(path);
+        }
+    }
+
+    if dirs.len() < 2 {
+        bail!("{} valid cluster, nothing to calculate...", dirs.len());
+    }
+
+    // Get stacktraces from each cluster
+    let mut clusters: Vec<Vec<Stacktrace>> = Vec::with_capacity(dirs.len());
+    for dir in &dirs {
+        // Get casreps from cluster
+        let casreps = util::get_reports(dir)?;
+        // Get stacktraces from cluster
+        let (_, stacktraces, _, _) = util::reports_from_dirs(casreps, jobs);
+        clusters.push(stacktraces);
+    }
+    // Get total casreps number
+    let size: usize = clusters.iter().map(Vec::len).sum();
+    if size == 0 {
+        bail!("No valid reports in clusters, nothing to calculate...");
+    }
+    // Calculate silhouette coefficient for each casrep
+    let mut sum = 0f64;
+    for (i, cluster) in clusters.iter().enumerate() {
+        // Silhouette coefficient of a single-element cluster is 0
+        if cluster.len() < 2 {
+            continue;
+        }
+        for num in 0..cluster.len() {
+            let a = get_subcoef_a(num, cluster);
+            let b = get_subcoef_b(num, i, &clusters);
+            let denom = a.max(b);
+            // Equal stacktraces give a == b == 0, skip them to avoid NaN from 0 / 0
+            if denom > 0f64 {
+                sum += (b - a) / denom;
+            }
+        }
+    }
+    Ok(sum / size as f64)
+}
+
 fn main() -> Result<()> {
     let matches = clap::Command::new("casr-cluster")
         .version(clap::crate_version!())
@@ -576,6 +639,14 @@ fn main() -> Result<()> {
                 .default_value("Dist")
                 .help("Strategy for outer cluster choosing when updating"),
         )
+        .arg(
+            Arg::new("estimation")
+                .long("estimation")
+                .value_name("DIR")
+                .action(ArgAction::Set)
+                .value_parser(clap::value_parser!(PathBuf))
+                .help("Make cluster estimation for DIR using silhouette index"),
+        )
         .arg(
             Arg::new("ignore")
                 .long("ignore")
@@ -686,6 +757,12 @@ fn main() -> Result<()> {
             println!("Number of reports before crashline deduplication in new clusters: {before}");
             println!("Number of reports after crashline deduplication in new clusters: {after}");
         }
+        let sil = get_sil(paths[1], jobs)?;
+        println!("Cluster silhouette index: {sil}");
+    } else if matches.contains_id("estimation") {
+        let path: &PathBuf = matches.get_one::<PathBuf>("estimation").unwrap();
+        let sil = get_sil(path, jobs)?;
+        println!("Cluster silhouette index: {sil}");
     }
 
     Ok(())
diff --git a/casr/tests/tests.rs b/casr/tests/tests.rs
index ab8b3c7b..3eebadcb 100644
--- a/casr/tests/tests.rs
+++ b/casr/tests/tests.rs
@@ -2654,6 +2654,7 @@ fn test_casr_cluster_u() {
     let paths = [
         abs_path("tests/casr_tests/casrep/test_clustering_small"),
         abs_path("tests/tmp_tests_casr/clustering_out"),
+        abs_path("tests/tmp_tests_casr/clustering_out/cl8/20.casrep"),
         abs_path("tests/tmp_tests_casr/clustering_out/cl9"),
     ];
 
@@ -2688,7 +2689,8 @@ fn test_casr_cluster_u() {
 
     assert_eq!(clusters_cnt, 9, "Clusters count mismatch.");
 
-    let _ = std::fs::remove_dir_all(&paths[2]);
+    let _ = std::fs::remove_file(&paths[2]);
+    let _ = std::fs::remove_dir_all(&paths[3]);
 
     let output = Command::new(*EXE_CASR_CLUSTER.read().unwrap())
         .args(["-u", &paths[0], &paths[1]])
@@ -2773,6 +2775,18 @@ fn test_casr_cluster_u() {
 
     assert_eq!(after_cnt, 1, "After count mismatch.");
 
+    let re = Regex::new(r"Cluster silhouette index: (?P<sil>\d+)").unwrap();
+    let sil = re
+        .captures(&res)
+        .unwrap()
+        .name("sil")
+        .map(|x| x.as_str())
+        .unwrap()
+        .parse::<u32>()
+        .unwrap();
+
+    assert_eq!(sil, 0, "Silhouette index mismatch.");
+
     let _ = std::fs::remove_dir_all(&paths[1]);
 }
 
diff --git a/docs/usage.md b/docs/usage.md
index 5451d7d2..00764f9f 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -241,6 +241,8 @@ Tool for clustering CASR reports
           --outer-strategy <OUTER_STRATEGY>
               Strategy for outer cluster choosing when updating [default: Dist] [possible
               values: Delta, Diam, Dist]
+          --estimation <DIR>
+              Make cluster estimation for DIR using silhouette index
           --ignore <FILE>
               File with regular expressions for functions and file paths that should be
               ignored
diff --git a/libcasr/src/stacktrace.rs b/libcasr/src/stacktrace.rs
index f05e4fee..45f329fe 100644
--- a/libcasr/src/stacktrace.rs
+++ b/libcasr/src/stacktrace.rs
@@ -410,6 +410,61 @@ pub fn relation(
     }
 }
 
+/// Get "a" subcoefficient for silhouette coefficient calculation of given stacktrace
+/// Read more: https://en.wikipedia.org/wiki/Silhouette_(clustering)#Definition
+///
+/// # Arguments
+///
+/// * `num` - given stacktrace number, must be less than cluster size
+///
+/// * `stacktraces` - cluster represented as slice of `Stacktrace` structures
+///
+/// # Return value
+///
+/// "a" subcoefficient silhouette coefficient, 0 for cluster with less than two stacktraces
+pub fn get_subcoef_a(num: usize, stacktraces: &[Stacktrace]) -> f64 {
+    // Mean distance to other members is undefined when there are no other members
+    if stacktraces.len() < 2 {
+        return 0f64;
+    }
+    let sum: f64 = (0..stacktraces.len())
+        .filter(|&i| i != num)
+        .map(|i| 1.0 - similarity(&stacktraces[num], &stacktraces[i]))
+        .sum();
+    sum / (stacktraces.len() - 1) as f64
+}
+
+/// Get "b" subcoefficient for silhouette coefficient calculation of given stacktrace
+/// Read more: https://en.wikipedia.org/wiki/Silhouette_(clustering)#Definition
+///
+/// # Arguments
+///
+/// * `num` - given stacktrace number, must be less than size of cluster `cl`
+///
+/// * `cl` - cluster number of given stacktrace
+///
+/// * `clusters` - a slice of clusters, each represented as vector of `Stacktrace` structures
+///
+/// # Return value
+///
+/// "b" subcoefficient silhouette coefficient, `f64::MAX` if no other non-empty cluster exists
+pub fn get_subcoef_b(num: usize, cl: usize, clusters: &[Vec<Stacktrace>]) -> f64 {
+    let stacktrace = &clusters[cl][num];
+    clusters
+        .iter()
+        .enumerate()
+        // Skip own cluster and empty ones: mean distance to them is undefined
+        .filter(|&(j, cluster)| j != cl && !cluster.is_empty())
+        .map(|(_, cluster)| {
+            let sum: f64 = cluster
+                .iter()
+                .map(|other| 1.0 - similarity(stacktrace, other))
+                .sum();
+            sum / cluster.len() as f64
+        })
+        .fold(f64::MAX, f64::min)
+}
+
 /// Stack trace filtering trait.
 pub trait Filter {
     /// Filter frames from the stack trace that are not related to analyzed code containing crash.