Skip to content

Commit

Permalink
Add cluster estimation
Browse files Browse the repository at this point in the history
  • Loading branch information
hkctkuy authored and hkctkuy committed Dec 7, 2023
1 parent c40f319 commit a06ec4d
Show file tree
Hide file tree
Showing 4 changed files with 149 additions and 1 deletion.
77 changes: 77 additions & 0 deletions casr/src/bin/casr-cluster.rs
Original file line number Diff line number Diff line change
Expand Up @@ -481,6 +481,69 @@ fn update_clusters(
Ok((added, duplicates, deduplicated, result, before, after))
}

/// Calculate the mean silhouette coefficient over all reports in all clusters
///
/// Read more: https://en.wikipedia.org/wiki/Silhouette_(clustering)#Definition
///
/// # Arguments
///
/// * `dir` - path to directory with CASR report clusters (subdirectories
///   whose names start with `cl`)
///
/// * `jobs` - number of jobs for calculating process
///
/// # Return value
///
/// Silhouette coefficient averaged over every report found in the clusters
///
/// # Errors
///
/// Returns an error if `dir` cannot be read, if reports cannot be collected
/// from a cluster directory, or if fewer than two non-empty clusters exist.
fn get_sil(dir: &Path, jobs: usize) -> Result<f64> {
    // Get cluster dirs. I/O failures are propagated instead of panicking, and
    // entries whose names are not valid UTF-8 are simply not treated as clusters.
    let mut dirs: Vec<PathBuf> = Vec::new();
    for entry in fs::read_dir(dir)? {
        let path = entry?.path();
        let is_cluster = path
            .file_name()
            .and_then(|name| name.to_str())
            .map_or(false, |name| name.starts_with("cl"));
        if is_cluster {
            dirs.push(path);
        }
    }

    if dirs.len() < 2 {
        bail!("{} valid cluster, nothing to calculate...", dirs.len());
    }

    // Init clusters vector
    let mut clusters: Vec<Vec<Stacktrace>> = Vec::with_capacity(dirs.len());
    // Init casreps number counter
    let mut size = 0usize;
    // Get casreps from each cluster
    for dir in &dirs {
        // Get casreps from cluster
        let casreps = util::get_reports(dir)?;
        // Get stacktraces from cluster
        let (_, stacktraces, _, _) = util::reports_from_dirs(casreps, jobs);
        // A cluster without stack traces contributes nothing to the index and
        // would only produce undefined mean distances, so leave it out.
        if stacktraces.is_empty() {
            continue;
        }
        // Update size
        size += stacktraces.len();
        // Add stacktraces
        clusters.push(stacktraces);
    }

    // The "b" subcoefficient needs at least one other non-empty cluster.
    if clusters.len() < 2 {
        bail!("{} valid cluster, nothing to calculate...", clusters.len());
    }

    // Init sil sum
    let mut sum = 0f64;
    // Calculate silhouette coefficient for each casrep in every cluster
    // (including the last cluster and the last casrep of each cluster).
    for (i, cluster) in clusters.iter().enumerate() {
        // By definition the silhouette of a single-element cluster is 0.
        if cluster.len() < 2 {
            continue;
        }
        for num in 0..cluster.len() {
            let a = get_subcoef_a(num, cluster);
            let b = get_subcoef_b(num, i, &clusters);
            let denom = a.max(b);
            // When both mean distances are 0 the ratio is 0/0; count it as 0
            // rather than letting NaN poison the whole sum.
            if denom > 0.0 {
                sum += (b - a) / denom;
            }
        }
    }
    Ok(sum / size as f64)
}

fn main() -> Result<()> {
let matches = clap::Command::new("casr-cluster")
.version(clap::crate_version!())
Expand Down Expand Up @@ -576,6 +639,14 @@ fn main() -> Result<()> {
.default_value("Dist")
.help("Strategy for outer cluster choosing when updating"),
)
.arg(
Arg::new("estimation")
.long("estimation")
.value_name("DIR")
.action(ArgAction::Set)
.value_parser(clap::value_parser!(PathBuf))
.help("Make cluster estimation for DIR using silhouette index"),
)
.arg(
Arg::new("ignore")
.long("ignore")
Expand Down Expand Up @@ -686,6 +757,12 @@ fn main() -> Result<()> {
println!("Number of reports before crashline deduplication in new clusters: {before}");
println!("Number of reports after crashline deduplication in new clusters: {after}");
}
let sil = get_sil(paths[1], jobs)?;
println!("Cluster silhouette index: {sil}");
} else if matches.contains_id("estimation") {
let path: &PathBuf = matches.get_one::<PathBuf>("estimation").unwrap();
let sil = get_sil(path, jobs)?;
println!("Cluster silhouette index: {sil}");
}

Ok(())
Expand Down
16 changes: 15 additions & 1 deletion casr/tests/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2654,6 +2654,7 @@ fn test_casr_cluster_u() {
let paths = [
abs_path("tests/casr_tests/casrep/test_clustering_small"),
abs_path("tests/tmp_tests_casr/clustering_out"),
abs_path("tests/tmp_tests_casr/clustering_out/cl8/20.casrep"),
abs_path("tests/tmp_tests_casr/clustering_out/cl9"),
];

Expand Down Expand Up @@ -2688,7 +2689,8 @@ fn test_casr_cluster_u() {

assert_eq!(clusters_cnt, 9, "Clusters count mismatch.");

let _ = std::fs::remove_dir_all(&paths[2]);
let _ = std::fs::remove_file(&paths[2]);
let _ = std::fs::remove_dir_all(&paths[3]);

let output = Command::new(*EXE_CASR_CLUSTER.read().unwrap())
.args(["-u", &paths[0], &paths[1]])
Expand Down Expand Up @@ -2773,6 +2775,18 @@ fn test_casr_cluster_u() {

assert_eq!(after_cnt, 1, "After count mismatch.");

let re = Regex::new(r"Cluster silhouette index: (?P<sil>\d+)").unwrap();
let sil = re
.captures(&res)
.unwrap()
.name("sil")
.map(|x| x.as_str())
.unwrap()
.parse::<u32>()
.unwrap();

assert_eq!(sil, 0, "Silhouette index mismatch.");

let _ = std::fs::remove_dir_all(&paths[1]);
}

Expand Down
2 changes: 2 additions & 0 deletions docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,8 @@ Tool for clustering CASR reports
--outer-strategy <STRATEGY>
Strategy for outer cluster choosing when updating [default: Dist] [possible
values: Delta, Diam, Dist]
--estimation <DIR>
Make cluster estimation for DIR using silhouette index
--ignore <FILE>
File with regular expressions for functions and file paths that should be
ignored
Expand Down
55 changes: 55 additions & 0 deletions libcasr/src/stacktrace.rs
Original file line number Diff line number Diff line change
Expand Up @@ -410,6 +410,61 @@ pub fn relation(
}
}

/// Get the "a" subcoefficient of the silhouette coefficient for a given stacktrace:
/// the mean distance (`1 - similarity`) from it to every other stacktrace in its
/// own cluster.
/// Read more: https://en.wikipedia.org/wiki/Silhouette_(clustering)#Definition
///
/// # Arguments
///
/// * `num` - index of the given stacktrace inside `stacktraces`
///
/// * `stacktraces` - cluster represented as slice of `Stacktrace` structures
///
/// # Return value
///
/// "a" subcoefficient of the silhouette coefficient. Returns `0.0` when the
/// cluster holds fewer than two stacktraces, because there is nothing to
/// compare against.
///
/// # Panics
///
/// Panics if the cluster has at least two stacktraces and `num` is out of range.
pub fn get_subcoef_a(num: usize, stacktraces: &[Stacktrace]) -> f64 {
    // Guard against `len() - 1` underflow (empty slice) and 0/0 (single element).
    if stacktraces.len() < 2 {
        return 0f64;
    }
    // Sum distances to every *other* member, including the last one.
    let sum: f64 = stacktraces
        .iter()
        .enumerate()
        .filter(|&(i, _)| i != num)
        .map(|(_, other)| 1.0 - similarity(&stacktraces[num], other))
        .sum();
    sum / (stacktraces.len() - 1) as f64
}

/// Get the "b" subcoefficient of the silhouette coefficient for a given stacktrace:
/// the smallest mean distance (`1 - similarity`) from it to the stacktraces of any
/// other cluster.
/// Read more: https://en.wikipedia.org/wiki/Silhouette_(clustering)#Definition
///
/// # Arguments
///
/// * `num` - index of the given stacktrace inside its cluster
///
/// * `cl` - cluster number of given stacktrace
///
/// * `clusters` - a vector of clusters represented as slice of `Stacktrace` structures
///
/// # Return value
///
/// "b" subcoefficient of the silhouette coefficient. If there is no other
/// non-empty cluster, the initial `MAX` value is returned unchanged.
///
/// # Panics
///
/// Panics if `cl` or `num` is out of range.
pub fn get_subcoef_b(num: usize, cl: usize, clusters: &[Vec<Stacktrace>]) -> f64 {
    let target = &clusters[cl][num];
    let mut min = MAX;
    // Visit every cluster, including the last one.
    for (j, cluster) in clusters.iter().enumerate() {
        // Skip the stacktrace's own cluster; skip empty clusters because their
        // mean distance is undefined (division by zero).
        if j == cl || cluster.is_empty() {
            continue;
        }
        // Mean distance to all members of this cluster, including the last one.
        let sum: f64 = cluster
            .iter()
            .map(|other| 1.0 - similarity(target, other))
            .sum();
        let res = sum / cluster.len() as f64;
        if res < min {
            min = res;
        }
    }
    min
}

/// Stack trace filtering trait.
pub trait Filter {
/// Filter frames from the stack trace that are not related to analyzed code containing crash.
Expand Down

0 comments on commit a06ec4d

Please sign in to comment.