Skip to content

Commit

Permalink
Improve qalpha benchmarking and documentation (#160)
Browse files Browse the repository at this point in the history
This improves qalpha benchmarking by implementing

- a more informative evaluation table, which can be saved to a TSV file;
- aggregation of statistics over multiple trials of the same benchmark;
- a categorization of benchmarks based on runtime; and
- better documentation of the `qalpha` benchmarking command.

This also expands and revises the documentation of the `infer qalpha`
command and its arguments, and fixes the headers of some example files
under `temporal-verifier/examples/fol`.

This commit is also in sync with the CAV'24 artifact at
https://doi.org/10.5281/zenodo.10938368 .

---------

Signed-off-by: edenfrenkel <[email protected]>
Signed-off-by: Tej Chajed <[email protected]>
Co-authored-by: Tej Chajed <[email protected]>
  • Loading branch information
edenfrenkel and tchajed authored Apr 11, 2024
1 parent 5a16ff0 commit a46670d
Show file tree
Hide file tree
Showing 44 changed files with 759 additions and 298 deletions.
30 changes: 28 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions benchmarking/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ version.workspace = true
edition.workspace = true

[dependencies]
inference = { path = "../inference" }

clap = { version = "4.3.4", features = ["derive"] }
humantime = "2.1.0"
nix = { version = "0.26.2", default-features = false, features = [
Expand All @@ -21,3 +23,4 @@ walkdir = "2.3.3"
tabled = "0.13.0"
glob = "0.3.1"
itertools = "0.10.5"
lazy-regex = "3.1.0"
16 changes: 14 additions & 2 deletions benchmarking/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,26 @@ parameters.

## qalpha invariant inference

Run `temporal-verifier infer qalpha` with hand-written configuration per
Run `temporal-verifier infer qalpha` with a hand-written configuration per
example. Records the full output of each run, as well as timing info. To
configure this benchmark edit [qalpha.rs](src/qalpha.rs).

Benchmark identifiers have the form `[fragment]/[category]/[filename]/[experiment-type]`,
and can be filtered using the `-F` option. For example, this runs the `fixpoint` experiments
in the `small` category and `epr` fragment, and should only take a few minutes.

```sh
cargo run --bin benchmark -- qalpha --dir qalpha-results -F "**/epr/small/**/fixpoint"
```

Loading and printing the saved results can then be done using `--load`:

```sh
cargo run --bin benchmark -- qalpha --dir qalpha-results
cargo run --bin benchmark -- qalpha --dir qalpha-results --load
```

Add `--tsv <filename>` to save the printed table to a TSV file.

## Verification

Run `temporal-verifier verify` over all the examples.
Expand Down
104 changes: 68 additions & 36 deletions benchmarking/src/bin/benchmark.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,11 @@

//! Run all of the examples as benchmarks and report the results in a table.

use std::{path::PathBuf, time::Duration};
use std::{fs, path::PathBuf, time::Duration};

use benchmarking::{
qalpha::qalpha_benchmarks,
report,
qalpha::{qalpha_benchmarks, QalphaMeasurement},
report::{load_results, ReportableMeasurement},
run::{get_examples, BenchmarkConfig, BenchmarkMeasurement},
time_bench::{compile_flyvy_bin, compile_time_bin, REPO_ROOT_PATH},
};
Expand All @@ -19,15 +19,21 @@ struct QalphaParams {
/// Glob pattern over benchmark names to run
#[arg(short = 'F', long = "filter", default_value = "*")]
name_glob: String,
/// Output directory to store results
/// Directory to store results in, or load results from
#[arg(long)]
output_dir: PathBuf,
dir: PathBuf,
/// Time limit for running each experiment
#[arg(long, default_value = "600s")]
time_limit: humantime::Duration,
/// Repeat each benchmark this number of times
#[arg(short = 'R', long, default_value = "1")]
repeat: usize,
/// Whether to run the benchmarks or load saved results
#[arg(long)]
load: bool,
/// File to output the results in TSV format to
#[arg(long)]
tsv: Option<String>,
}

#[derive(clap::Subcommand, Clone, Debug, PartialEq, Eq)]
Expand All @@ -44,13 +50,44 @@ enum Command {
/// Output results in JSON format
json: bool,
},
/// Run invariant inference benchmarks with qalpha.
/// Run or load invariant inference benchmarks with qalpha and report the results in a table.
///
/// The table produced by this command has one row per benchmark, and contains the following columns:
///
/// - example: the name of the file defining the first-order transition system;
///
/// - #: the number of trials performed for this benchmark;
///
/// - LSet: "✓" if an LSet data-structure was used, or "-" if a baseline data-structure was used instead;
///
/// - outcome: "timeout" if all trials in the benchmark timed-out, empty otherwise;
///
/// - time (s): the total runtime of the least-fixpoint computation;
///
/// - quantifiers: the quantifier structure of formulas in the first-order language considered;
///
/// - qf body: the quantifier-free body of formulas in the first-order language considered.
/// In our experiments we use k-pDNF as defined by P-FOL-IC3, with a restricted number of literals in the pDNF clause;
///
/// - language: the approximate number of formulas in the first-order language considered, induced by the transition system and the language parameters;
///
/// - % weaken: the percentage of time spent weakening formulas during the run (as opposed to searching for counter-examples to induction);
///
/// - lfp: the number of formulas in the computed least-fixpoint;
///
/// - reduced: the number of formulas in a subset of the least-fixpoint sufficient to semantically imply the rest;
///
/// - max: the maximal number of formulas in the representation of an abstract element encountered throughout the run;
///
/// - × cpu: the CPU utilization factor;
///
/// - memory: the maximal amount of RAM used throughout the run.
///
/// Since we often run multiple trials of the same least-fixpoint computation, we report each of the variable statistics
/// (e.g., "time", "% weaken") as _median_ ± _deviation_. This is done by first computing the statistic for each individual trial,
/// finding the median, and then computing the maximal distance from the median value to the value in any trial —
/// which gives us the deviation. This guarantees that the value of the statistic in all trials indeed falls within _median_ ± _deviation_.
Qalpha(#[command(flatten)] QalphaParams),
/// Load and report a previously gathered set of results.
Report {
/// Directory to load from (recursively)
dir: PathBuf,
},
}

#[derive(clap::Parser, Debug)]
Expand Down Expand Up @@ -82,30 +119,19 @@ fn benchmark_verify(time_limit: Duration, solver: &str) -> Vec<BenchmarkMeasurem
.collect()
}

fn run_qalpha(params: &QalphaParams) -> Vec<BenchmarkMeasurement> {
fn run_qalpha(params: &QalphaParams) -> Vec<QalphaMeasurement> {
let name_glob = Pattern::new(&params.name_glob).expect("could not parse pattern");
qalpha_benchmarks(params.time_limit.into())
.into_iter()
.filter(|(name, _)| name_glob.matches_path(name))
.flat_map(|(file, config)| {
.map(|(file, config)| {
eprintln!("qalpha {}", file.display());
(0..params.repeat).map(move |rep| {
let result = BenchmarkMeasurement::run(
config.clone(),
Some("info".to_string()),
Some(params.output_dir.join(&file).join(format!("{rep}"))),
);
eprintln!(
" took {:0.0}s{}",
result.measurement.real_time.as_secs_f64(),
if result.measurement.success {
""
} else {
" (failed)"
}
);
result
})
QalphaMeasurement::run(
config.clone(),
Some("info".to_string()),
params.dir.join(&file),
params.repeat,
)
})
.collect()
}
Expand All @@ -126,15 +152,21 @@ impl App {
if *json {
println!("{}", BenchmarkMeasurement::to_json(&results));
} else {
BenchmarkMeasurement::print_table(results);
BenchmarkMeasurement::print_table(&results);
}
}
Command::Qalpha(params) => {
let _results = run_qalpha(params);
}
Command::Report { dir } => {
let results = report::load_results(dir);
BenchmarkMeasurement::print_table(results);
let results: Vec<QalphaMeasurement> = if params.load {
load_results(&params.name_glob, &params.dir)
} else {
run_qalpha(params)
};
if let Some(fname) = &params.tsv {
let out = QalphaMeasurement::as_tsv(&results);
fs::write(fname, out).expect("could not write tsv to file");
} else {
QalphaMeasurement::print_table(&results);
}
}
}
}
Expand Down
Loading

0 comments on commit a46670d

Please sign in to comment.