Skip to content

Commit

Permalink
Improve qalpha benchmarking and documentation (#160)
Browse files Browse the repository at this point in the history
This improves qalpha benchmarking by implementing

- a more informative evaluation table, which can be saved to a TSV file;
- aggregation of statistics over multiple trials of the same benchmark;
- a categorization of benchmarks based on runtime; and
- better documentation of the `qalpha` benchmarking command.

This also expands and revises the documentation of the `infer qalpha`
command and its arguments, and fixes the headers of some example files
under `temporal-verifier/examples/fol`.

This commit is also in sync with the CAV'24 artifact at
https://doi.org/10.5281/zenodo.10938368 .

---------

Signed-off-by: edenfrenkel <[email protected]>
Signed-off-by: Tej Chajed <[email protected]>
Co-authored-by: Tej Chajed <[email protected]>
  • Loading branch information
edenfrenkel and tchajed authored Apr 11, 2024
1 parent 5a16ff0 commit a46670d
Show file tree
Hide file tree
Showing 44 changed files with 759 additions and 298 deletions.
30 changes: 28 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions benchmarking/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ version.workspace = true
edition.workspace = true

[dependencies]
inference = { path = "../inference" }

clap = { version = "4.3.4", features = ["derive"] }
humantime = "2.1.0"
nix = { version = "0.26.2", default-features = false, features = [
Expand All @@ -21,3 +23,4 @@ walkdir = "2.3.3"
tabled = "0.13.0"
glob = "0.3.1"
itertools = "0.10.5"
lazy-regex = "3.1.0"
16 changes: 14 additions & 2 deletions benchmarking/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,26 @@ parameters.

## qalpha invariant inference

Run `temporal-verifier infer qalpha` with hand-written configuration per
Run `temporal-verifier infer qalpha` with a hand-written configuration per
example. Records the full output of each run, as well as timing info. To
configure this benchmark edit [qalpha.rs](src/qalpha.rs).

Benchmark identifiers have the form `[fragment]/[category]/[filename]/[experiment-type]`,
and can be filtered using the `-F` option. For example, this runs the `fixpoint` experiments
in the `small` category and `epr` fragment, and should only take a few minutes.

```sh
cargo run --bin benchmark -- qalpha --dir qalpha-results -F "**/epr/small/**/fixpoint"
```

Loading and printing the saved results can then be done using `--load`:

```sh
cargo run --bin benchmark -- qalpha --dir qalpha-results
cargo run --bin benchmark -- qalpha --dir qalpha-results --load
```

Add `--tsv <filename>` to save the printed table to a TSV file.

## Verification

Run `temporal-verifier verify` over all the examples.
Expand Down
104 changes: 68 additions & 36 deletions benchmarking/src/bin/benchmark.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,11 @@

//! Run all of the examples as benchmarks and report the results in a table.

use std::{path::PathBuf, time::Duration};
use std::{fs, path::PathBuf, time::Duration};

use benchmarking::{
qalpha::qalpha_benchmarks,
report,
qalpha::{qalpha_benchmarks, QalphaMeasurement},
report::{load_results, ReportableMeasurement},
run::{get_examples, BenchmarkConfig, BenchmarkMeasurement},
time_bench::{compile_flyvy_bin, compile_time_bin, REPO_ROOT_PATH},
};
Expand All @@ -19,15 +19,21 @@ struct QalphaParams {
/// Glob pattern over benchmark names to run
#[arg(short = 'F', long = "filter", default_value = "*")]
name_glob: String,
/// Output directory to store results
/// Directory to store results in, or load results from
#[arg(long)]
output_dir: PathBuf,
dir: PathBuf,
/// Time limit for running each experiment
#[arg(long, default_value = "600s")]
time_limit: humantime::Duration,
/// Repeat each benchmark this number of times
#[arg(short = 'R', long, default_value = "1")]
repeat: usize,
/// Whether to run the benchmarks or load saved results
#[arg(long)]
load: bool,
/// File to output the results in TSV format to
#[arg(long)]
tsv: Option<String>,
}

#[derive(clap::Subcommand, Clone, Debug, PartialEq, Eq)]
Expand All @@ -44,13 +50,44 @@ enum Command {
/// Output results in JSON format
json: bool,
},
/// Run invariant inference benchmarks with qalpha.
/// Run or load invariant inference benchmarks with qalpha and report the results in a table.
///
/// The table produced by this command has one row per benchmark, and contains the following columns:
///
/// - example: the name of the file defining the first-order transition system;
///
/// - #: the number of trials performed for this benchmark;
///
/// - LSet: "✓" if an LSet data-structure was used, or "-" if a baseline data-structure was used instead;
///
/// - outcome: "timeout" if all trials in the benchmark timed-out, empty otherwise;
///
/// - time (s): the total runtime of the least-fixpoint computation;
///
/// - quantifiers: the quantifier structure of formulas in the first-order language considered;
///
/// - qf body: the quantifier-free body of formulas in the first-order language considered.
/// In our experiments we use k-pDNF as defined by P-FOL-IC3, with a restricted number of literals in the pDNF clause;
///
/// - language: the approximate number of formulas in the first-order language considered, induced by the transition system and the language parameters;
///
/// - % weaken: the percentage of time spent weakening formulas during the run (as opposed to searching for counter-examples to induction);
///
/// - lfp: the number of formulas in the computed least-fixpoint;
///
/// - reduced: the number of formulas in a subset of the least-fixpoint sufficient to semantically imply the rest;
///
/// - max: the maximal number of formulas in the representation of an abstract element encountered throughout the run;
///
/// - × cpu: the CPU utilization factor;
///
/// - memory: the maximal amount of RAM used throughout the run.
///
/// Since we often run multiple trials of the same least-fixpoint computation, we report each of the variable statistics
/// (e.g., "time", "% weaken") as _median_ ± _deviation_. This is done by first computing the statistic for each individual trial,
/// finding the median, and then computing the maximal distance from the median value to the value in any trial —
/// which gives us the deviation. This guarantees that the value of the statistic in all trials indeed falls within _median_ ± _deviation_.
Qalpha(#[command(flatten)] QalphaParams),
/// Load and report a previously gathered set of results.
Report {
/// Directory to load from (recursively)
dir: PathBuf,
},
}

#[derive(clap::Parser, Debug)]
Expand Down Expand Up @@ -82,30 +119,19 @@ fn benchmark_verify(time_limit: Duration, solver: &str) -> Vec<BenchmarkMeasurem
.collect()
}

fn run_qalpha(params: &QalphaParams) -> Vec<BenchmarkMeasurement> {
fn run_qalpha(params: &QalphaParams) -> Vec<QalphaMeasurement> {
let name_glob = Pattern::new(&params.name_glob).expect("could not parse pattern");
qalpha_benchmarks(params.time_limit.into())
.into_iter()
.filter(|(name, _)| name_glob.matches_path(name))
.flat_map(|(file, config)| {
.map(|(file, config)| {
eprintln!("qalpha {}", file.display());
(0..params.repeat).map(move |rep| {
let result = BenchmarkMeasurement::run(
config.clone(),
Some("info".to_string()),
Some(params.output_dir.join(&file).join(format!("{rep}"))),
);
eprintln!(
" took {:0.0}s{}",
result.measurement.real_time.as_secs_f64(),
if result.measurement.success {
""
} else {
" (failed)"
}
);
result
})
QalphaMeasurement::run(
config.clone(),
Some("info".to_string()),
params.dir.join(&file),
params.repeat,
)
})
.collect()
}
Expand All @@ -126,15 +152,21 @@ impl App {
if *json {
println!("{}", BenchmarkMeasurement::to_json(&results));
} else {
BenchmarkMeasurement::print_table(results);
BenchmarkMeasurement::print_table(&results);
}
}
Command::Qalpha(params) => {
let _results = run_qalpha(params);
}
Command::Report { dir } => {
let results = report::load_results(dir);
BenchmarkMeasurement::print_table(results);
let results: Vec<QalphaMeasurement> = if params.load {
load_results(&params.name_glob, &params.dir)
} else {
run_qalpha(params)
};
if let Some(fname) = &params.tsv {
let out = QalphaMeasurement::as_tsv(&results);
fs::write(fname, out).expect("could not write tsv to file");
} else {
QalphaMeasurement::print_table(&results);
}
}
}
}
Expand Down
Loading

0 comments on commit a46670d

Please sign in to comment.