From b80750d76d646e7a2a9778fd41e4e0dabd371672 Mon Sep 17 00:00:00 2001 From: hkctkuy Date: Fri, 8 Dec 2023 20:46:50 +0300 Subject: [PATCH] Fix sil --- casr/src/bin/casr-cluster.rs | 21 +++-- .../casrep/test_clustering_small/40.casrep | 87 +++++++++++++++++++ casr/tests/tests.rs | 52 +++++++++-- libcasr/src/stacktrace.rs | 7 +- 4 files changed, 145 insertions(+), 22 deletions(-) create mode 100644 casr/tests/casr_tests/casrep/test_clustering_small/40.casrep diff --git a/casr/src/bin/casr-cluster.rs b/casr/src/bin/casr-cluster.rs index 37c687b4..cc879f87 100644 --- a/casr/src/bin/casr-cluster.rs +++ b/casr/src/bin/casr-cluster.rs @@ -338,7 +338,7 @@ fn update_clusters( .zip(stacktraces.iter().zip(crashlines.iter())); // Get casreps from existing clusters - let cluster_dirs: Vec = fs::read_dir(oldpath) + let mut cluster_dirs: Vec = fs::read_dir(oldpath) .unwrap() .map(|path| path.unwrap().path()) .filter(|path| { @@ -350,6 +350,7 @@ fn update_clusters( .starts_with("cl") }) .collect(); + cluster_dirs.sort(); let len = cluster_dirs.len(); // Init clusters vector let mut clusters: Vec = Vec::new(); @@ -490,7 +491,7 @@ fn update_clusters( /// Silhouette coefficient fn get_sil(dir: &Path, jobs: usize) -> Result { // Get cluster dirs - let dirs: Vec = fs::read_dir(dir) + let mut dirs: Vec = fs::read_dir(dir) .unwrap() .map(|path| path.unwrap().path()) .filter(|path| { @@ -502,6 +503,7 @@ fn get_sil(dir: &Path, jobs: usize) -> Result { .starts_with("cl") }) .collect(); + dirs.sort(); if dirs.len() < 2 { bail!("{} valid cluster, nothing to calculate...", dirs.len()); @@ -525,9 +527,10 @@ fn get_sil(dir: &Path, jobs: usize) -> Result { // Init sil sum let mut sum = 0f64; // Calculate silhouette coefficient for each casrep - for i in 0..clusters.len() - 1 { - for num in 0..clusters[i].len() - 1 { - sum += sil_coef(num, i, &clusters); + for i in 0..clusters.len() { + for num in 0..clusters[i].len() { + let sil = sil_coef(num, i, &clusters); + sum += sil; } } Ok(sum / size as f64) @@ -629,8 +632,8 @@ fn main() -> Result<()> { .help("Strategy for outer cluster choosing when updating"), ) .arg( - Arg::new("estimation") - .long("estimation") + Arg::new("estimate") + .long("estimate") .value_name("DIR") .action(ArgAction::Set) .value_parser(clap::value_parser!(PathBuf)) @@ -748,8 +751,8 @@ fn main() -> Result<()> { } let sil = get_sil(paths[1], jobs)?; println!("Cluster silhouette index: {sil}"); - } else if matches.contains_id("estimation") { - let path: &PathBuf = matches.get_one::("estimation").unwrap(); + } else if matches.contains_id("estimate") { + let path: &PathBuf = matches.get_one::("estimate").unwrap(); let sil = get_sil(path, jobs)?; println!("Cluster silhouette index: {sil}"); } diff --git a/casr/tests/casr_tests/casrep/test_clustering_small/40.casrep b/casr/tests/casr_tests/casrep/test_clustering_small/40.casrep new file mode 100644 index 00000000..ea43e532 --- /dev/null +++ b/casr/tests/casr_tests/casrep/test_clustering_small/40.casrep @@ -0,0 +1,87 @@ +{ + "Date": "2021-07-14T19:56:09.276635+03:00", + "Uname": "Linux titanfall 5.8.0-59-generic #66~20.04.1-Ubuntu SMP Thu Jun 17 11:14:10 UTC 2021 x86_64 x86_64 x86_64 GNU/Linux", + "OS": "Ubuntu", + "OSRelease": "20.04", + "Architecture": "amd64", + "ExecutablePath": "/usr/local/bin/tiff2pdf", + "ProcCmdline": "tiff2pdf ./fuz3tiff2pdf/main/crashes/id:000009,sig:06,src:000040+000049,time:43718,op:splice,rep:4", + "ProcMaps": [ + " 0x555555554000 0x555555556000 0x2000 0x0 /usr/local/bin/tiff2pdf", + " 0x555555556000 0x555555561000 0xb000 0x2000 /usr/local/bin/tiff2pdf", + " 0x555555561000 0x555555565000 0x4000 0xd000 /usr/local/bin/tiff2pdf", + " 0x555555565000 0x555555566000 0x1000 0x10000 /usr/local/bin/tiff2pdf", + " 0x555555566000 0x555555567000 0x1000 0x11000 /usr/local/bin/tiff2pdf", + " 0x555555567000 0x555555588000 0x21000 0x0 [heap]", + " 0x7ffff7945000 0x7ffff7949000 0x4000 0x0 ", + " 0x7ffff7949000 0x7ffff7958000 0xf000 0x0 /usr/lib/x86_64-linux-gnu/libm-2.31.so", + " 0x7ffff7958000 0x7ffff79ff000 0xa7000 0xf000 /usr/lib/x86_64-linux-gnu/libm-2.31.so", + " 0x7ffff79ff000 0x7ffff7a96000 0x97000 0xb6000 /usr/lib/x86_64-linux-gnu/libm-2.31.so", + " 0x7ffff7a96000 0x7ffff7a97000 0x1000 0x14c000 /usr/lib/x86_64-linux-gnu/libm-2.31.so", + " 0x7ffff7a97000 0x7ffff7a98000 0x1000 0x14d000 /usr/lib/x86_64-linux-gnu/libm-2.31.so", + " 0x7ffff7a98000 0x7ffff7a9a000 0x2000 0x0 /usr/lib/x86_64-linux-gnu/libz.so.1.2.11", + " 0x7ffff7a9a000 0x7ffff7aab000 0x11000 0x2000 /usr/lib/x86_64-linux-gnu/libz.so.1.2.11", + " 0x7ffff7aab000 0x7ffff7ab1000 0x6000 0x13000 /usr/lib/x86_64-linux-gnu/libz.so.1.2.11", + " 0x7ffff7ab1000 0x7ffff7ab2000 0x1000 0x19000 /usr/lib/x86_64-linux-gnu/libz.so.1.2.11", + " 0x7ffff7ab2000 0x7ffff7ab3000 0x1000 0x19000 /usr/lib/x86_64-linux-gnu/libz.so.1.2.11", + " 0x7ffff7ab3000 0x7ffff7ab4000 0x1000 0x1a000 /usr/lib/x86_64-linux-gnu/libz.so.1.2.11", + " 0x7ffff7ab4000 0x7ffff7ab8000 0x4000 0x0 /usr/lib/x86_64-linux-gnu/libjpeg.so.8.2.2", + " 0x7ffff7ab8000 0x7ffff7afc000 0x44000 0x4000 /usr/lib/x86_64-linux-gnu/libjpeg.so.8.2.2", + " 0x7ffff7afc000 0x7ffff7b36000 0x3a000 0x48000 /usr/lib/x86_64-linux-gnu/libjpeg.so.8.2.2", + " 0x7ffff7b36000 0x7ffff7b37000 0x1000 0x82000 /usr/lib/x86_64-linux-gnu/libjpeg.so.8.2.2", + " 0x7ffff7b37000 0x7ffff7b38000 0x1000 0x82000 /usr/lib/x86_64-linux-gnu/libjpeg.so.8.2.2", + " 0x7ffff7b38000 0x7ffff7b39000 0x1000 0x83000 /usr/lib/x86_64-linux-gnu/libjpeg.so.8.2.2", + " 0x7ffff7b39000 0x7ffff7b44000 0xb000 0x0 /usr/lib/x86_64-linux-gnu/libjbig.so.0", + " 0x7ffff7b44000 0x7ffff7d43000 0x1ff000 0xb000 /usr/lib/x86_64-linux-gnu/libjbig.so.0", + " 0x7ffff7d43000 0x7ffff7d44000 0x1000 0xa000 /usr/lib/x86_64-linux-gnu/libjbig.so.0", + " 0x7ffff7d44000 0x7ffff7d47000 0x3000 0xb000 /usr/lib/x86_64-linux-gnu/libjbig.so.0", + " 0x7ffff7d47000 0x7ffff7d6c000 0x25000 0x0 /usr/lib/x86_64-linux-gnu/libc-2.31.so", + " 0x7ffff7d6c000 0x7ffff7ee4000 0x178000 0x25000 /usr/lib/x86_64-linux-gnu/libc-2.31.so", + " 0x7ffff7ee4000 0x7ffff7f2e000 0x4a000 0x19d000 /usr/lib/x86_64-linux-gnu/libc-2.31.so", + " 0x7ffff7f2e000 0x7ffff7f2f000 0x1000 0x1e7000 /usr/lib/x86_64-linux-gnu/libc-2.31.so", + " 0x7ffff7f2f000 0x7ffff7f32000 0x3000 0x1e7000 /usr/lib/x86_64-linux-gnu/libc-2.31.so", + " 0x7ffff7f32000 0x7ffff7f35000 0x3000 0x1ea000 /usr/lib/x86_64-linux-gnu/libc-2.31.so", + " 0x7ffff7f35000 0x7ffff7f39000 0x4000 0x0 ", + " 0x7ffff7f39000 0x7ffff7f41000 0x8000 0x0 /usr/local/lib/libtiff.so.3.9.6", + " 0x7ffff7f41000 0x7ffff7f76000 0x35000 0x8000 /usr/local/lib/libtiff.so.3.9.6", + " 0x7ffff7f76000 0x7ffff7f9f000 0x29000 0x3d000 /usr/local/lib/libtiff.so.3.9.6", + " 0x7ffff7f9f000 0x7ffff7fa0000 0x1000 0x66000 /usr/local/lib/libtiff.so.3.9.6", + " 0x7ffff7fa0000 0x7ffff7fa2000 0x2000 0x66000 /usr/local/lib/libtiff.so.3.9.6", + " 0x7ffff7fa2000 0x7ffff7fa3000 0x1000 0x68000 /usr/local/lib/libtiff.so.3.9.6", + " 0x7ffff7fa3000 0x7ffff7fa5000 0x2000 0x0 ", + " 0x7ffff7fc8000 0x7ffff7fc9000 0x1000 0x0 ", + " 0x7ffff7fc9000 0x7ffff7fcd000 0x4000 0x0 [vvar]", + " 0x7ffff7fcd000 0x7ffff7fcf000 0x2000 0x0 [vdso]", + " 0x7ffff7fcf000 0x7ffff7fd0000 0x1000 0x0 /usr/lib/x86_64-linux-gnu/ld-2.31.so", + " 0x7ffff7fd0000 0x7ffff7ff3000 0x23000 0x1000 /usr/lib/x86_64-linux-gnu/ld-2.31.so", + " 0x7ffff7ff3000 0x7ffff7ffb000 0x8000 0x24000 /usr/lib/x86_64-linux-gnu/ld-2.31.so", + " 0x7ffff7ffb000 0x7ffff7ffc000 0x1000 0x0 /home/avgor46/testdoc/fuz3tiff2pdf/main/crashes/id:000009,sig:06,src:000040+000049,time:43718,op:splice,rep:4", + " 0x7ffff7ffc000 0x7ffff7ffd000 0x1000 0x2c000 /usr/lib/x86_64-linux-gnu/ld-2.31.so", + " 0x7ffff7ffd000 0x7ffff7ffe000 0x1000 0x2d000 /usr/lib/x86_64-linux-gnu/ld-2.31.so", + " 0x7ffff7ffe000 0x7ffff7fff000 0x1000 0x0 ", + " 0x7ffffffde000 0x7ffffffff000 0x21000 0x0 [stack]", + " 0xffffffffff600000 0xffffffffff601000 0x1000 0x0 [vsyscall]" + ], + "CrashSeverity": { + "Type": "NOT_CRITICAL", + "ShortDescription": "SafeFunctionCheck", + "Description": "Buffer overflow in safe function", + "Explanation": "The target stopped while handling a signal that was generated by libc due to detection of buffer overflow in safe copy function." + }, + "Stacktrace": [ + "#0 __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:50", + "#1 0x00007ffff7d6c859 in __GI_abort () at abort.c:79", + "#2 0x00007ffff7dd73ee in __libc_message (action=action@entry=do_abort, fmt=fmt@entry=0x7ffff7f0107c \"*** %s ***: terminated\\n\") at ../sysdeps/posix/libc_fatal.c:155", + "#3 0x00007ffff7e79b4a in __GI___fortify_fail (msg=msg@entry=0x7ffff7f01012 \"buffer overflow detected\") at fortify_fail.c:26", + "#4 0x00007ffff7e783e6 in __GI___chk_fail () at chk_fail.c:28", + "#5 0x00007ffff7dcf1cf in _IO_str_chk_overflow (fp=, c=) at iovsprintf.c:35", + "#6 0x00007ffff7da7db0 in __GI___printf_fp_l (fp=, loc=, info=, args=) at printf_fp.c:1246", + "#7 0x00007ffff7dc163a in __vfprintf_internal (s=s@entry=0x7fffffffe070, format=format@entry=0x5555555613df \"%.4f\", ap=ap@entry=0x7fffffffe1b0, mode_flags=mode_flags@entry=6) at vfprintf-internal.c:1687", + "#8 0x00007ffff7dcf279 in __vsprintf_internal (string=0x7fffffffe2a0 \"79725330432.000\", maxlen=, format=0x5555555613df \"%.4f\", args=args@entry=0x7fffffffe1b0, mode_flags=6) at iovsprintf.c:95", + "#9 0x00007ffff7e77edb in ___sprintf_chk (s=, flag=, slen=, format=) at sprintf_chk.c:40", + "#10 0x000055555555c7a1 in sprintf (__fmt=0x5555555613df \"%.4f\", __s=0x7fffffffe2a0 \"79725330432.000\") at /usr/include/x86_64-linux-gnu/bits/stdio2.h:36", + "#12 0x00005555555601b8 in t2p_write_pdf (output=0x555555568f80, input=0x555555567ea0, t2p=0x5555555672a0) at tiff2pdf.c:5175", + "#13 t2p_write_pdf (t2p=0x5555555672a0, input=0x555555567ea0, output=0x555555568f80) at tiff2pdf.c:5133", + "#14 0x00005555555568d4 in main (argc=, argv=) at tiff2pdf.c:763" + ] +} diff --git a/casr/tests/tests.rs b/casr/tests/tests.rs index 0b0924ec..2ec42ab1 100644 --- a/casr/tests/tests.rs +++ b/casr/tests/tests.rs @@ -2425,7 +2425,7 @@ fn test_casr_cluster_c() { .parse::() .unwrap(); - assert_eq!(before_cnt, 11, "Before count mismatch."); + assert_eq!(before_cnt, 12, "Before count mismatch."); let re = Regex::new(r"Number of reports after crashline deduplication: (?P\d+)").unwrap(); @@ -2438,15 +2438,16 @@ fn test_casr_cluster_c() { .parse::() .unwrap(); - assert_eq!(after_cnt, 10, "After count mismatch."); + assert_eq!(after_cnt, 11, "After count mismatch."); // 2.casrep and 20.caserp without crashlines => no dedup // 3.casrep and 30.caserp with crashlines => dedup - // Thus, cluster (cl8) with 2.casrep has 2 casreps and others have 1 casrep + // Thus, cluster (cl7) with 2.casrep has 2 casreps and cl9 too + // But others have 1 casrep for i in 1..clusters_cnt + 1 { let cluster_path = paths[1].to_owned() + "/cl" + &i.to_string(); let size = std::fs::read_dir(cluster_path.clone()).unwrap().count(); - let num = if i == 8 { 2 } else { 1 }; + let num = if i == 7 || i == 9 { 2 } else { 1 }; assert_eq!(size, num); } @@ -2654,8 +2655,10 @@ fn test_casr_cluster_u() { let paths = [ abs_path("tests/casr_tests/casrep/test_clustering_small"), abs_path("tests/tmp_tests_casr/clustering_out"), - abs_path("tests/tmp_tests_casr/clustering_out/cl8/20.casrep"), + abs_path("tests/tmp_tests_casr/clustering_out/cl7/20.casrep"), + abs_path("tests/tmp_tests_casr/clustering_out/cl8"), abs_path("tests/tmp_tests_casr/clustering_out/cl9"), + abs_path("tests/tmp_tests_casr/clustering_out/cl9/40.casrep"), ]; let _ = fs::remove_dir_all(&paths[1]); @@ -2690,7 +2693,9 @@ fn test_casr_cluster_u() { assert_eq!(clusters_cnt, 9, "Clusters count mismatch."); let _ = std::fs::remove_file(&paths[2]); + let _ = std::fs::remove_file(&paths[5]); let _ = std::fs::remove_dir_all(&paths[3]); + let _ = std::fs::rename(&paths[4], &paths[3]); let output = Command::new(*EXE_CASR_CLUSTER.read().unwrap()) .args(["-u", &paths[0], &paths[1]]) @@ -2719,7 +2724,7 @@ fn test_casr_cluster_u() { .parse::() .unwrap(); - assert_eq!(added_cnt, 0, "Added count mismatch."); + assert_eq!(added_cnt, 1, "Added count mismatch."); let re = Regex::new(r"Number of duplicates: (?P\d+)").unwrap(); let duplicates_cnt = re @@ -2775,17 +2780,46 @@ fn test_casr_cluster_u() { assert_eq!(after_cnt, 1, "After count mismatch."); - let re = Regex::new(r"Cluster silhouette index: (?P\d+)").unwrap(); + let re = Regex::new(r"Cluster silhouette index: (?P\d+.\d+)").unwrap(); let sil = re .captures(&res) .unwrap() .name("sil") .map(|x| x.as_str()) .unwrap() - .parse::() + .parse::() + .unwrap(); + + assert_eq!(sil, 0.15436556855344655, "Silhouette index mismatch."); + + // Test estimation + let output = Command::new(*EXE_CASR_CLUSTER.read().unwrap()) + .args(["--estimate", &paths[1]]) + .output() + .expect("failed to start casr-cluster"); + + assert!( + output.status.success(), + "Stdout {}.\n Stderr: {}", + String::from_utf8_lossy(&output.stdout), + String::from_utf8_lossy(&output.stderr) + ); + + let res = String::from_utf8_lossy(&output.stdout); + + assert!(!res.is_empty()); + + let re = Regex::new(r"Cluster silhouette index: (?P\d+.\d+)").unwrap(); + let sil = re + .captures(&res) + .unwrap() + .name("sil") + .map(|x| x.as_str()) + .unwrap() + .parse::() .unwrap(); - assert_eq!(sil, 0, "Silhouette index mismatch."); + assert_eq!(sil, 0.15436556855344655, "Silhouette index mismatch."); let _ = std::fs::remove_dir_all(&paths[1]); } diff --git a/libcasr/src/stacktrace.rs b/libcasr/src/stacktrace.rs index 2333801a..994365f3 100644 --- a/libcasr/src/stacktrace.rs +++ b/libcasr/src/stacktrace.rs @@ -62,7 +62,6 @@ pub enum AccumStrategy { } /// Structure provides an interface for leverages with CASR report clusters -#[derive(Clone, Debug)] pub struct Cluster { /// Cluster number pub number: usize, @@ -449,7 +448,7 @@ pub fn relation( /// "a" subcoefficient silhouette coefficient fn sil_subcoef_a(num: usize, stacktraces: &[Stacktrace]) -> f64 { let mut sum = 0f64; - for i in 0..stacktraces.len() - 1 { + for i in 0..stacktraces.len() { if i == num { continue; } @@ -474,12 +473,12 @@ fn sil_subcoef_a(num: usize, stacktraces: &[Stacktrace]) -> f64 { /// "b" subcoefficient silhouette coefficient fn sil_subcoef_b(num: usize, cl: usize, clusters: &[Vec]) -> f64 { let mut min = MAX; - for j in 0..clusters.len() - 1 { + for j in 0..clusters.len() { if j == cl { continue; } let mut sum = 0f64; - for i in 0..clusters[j].len() - 1 { + for i in 0..clusters[j].len() { sum += 1.0 - similarity(&clusters[cl][num], &clusters[j][i]); } let res = sum / clusters[j].len() as f64;