diff --git a/ci/run.sh b/ci/run.sh
index 847b5243..2eafd1b4 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -24,6 +24,7 @@ else
     run="cargo test --manifest-path testcrate/Cargo.toml --no-fail-fast --target $target"
     $run
     $run --release
+    $run --benches
     $run --features c
     $run --features c --release
     $run --features no-asm
diff --git a/testcrate/Cargo.toml b/testcrate/Cargo.toml
index 6f771181..86bcb4b4 100644
--- a/testcrate/Cargo.toml
+++ b/testcrate/Cargo.toml
@@ -21,6 +21,10 @@ path = ".."
 default-features = false
 features = ["public-test-deps"]
 
+[dev-dependencies]
+criterion = { version = "0.5.1", default-features = false }  # benchmark framework used by benches/*.rs
+paste = "1.0.15"  # `float_bench!` expands to `paste::paste!` to build the `<name>_asm` identifiers
+
 [target.'cfg(all(target_arch = "arm", not(any(target_env = "gnu", target_env = "musl")), target_os = "linux"))'.dev-dependencies]
 test = { git = "https://github.com/japaric/utest" }
 utest-cortex-m-qemu = { default-features = false, git = "https://github.com/japaric/utest" }
@@ -35,3 +39,35 @@ mem = ["compiler_builtins/mem"]
 mangled-names = ["compiler_builtins/mangled-names"]
 # Skip tests that rely on f128 symbols being available on the system
 no-sys-f128 = []
+
+[[bench]]
+name = "float_add"  # benches/float_add.rs
+harness = false  # `criterion_main!` in the bench file provides `main`
+
+[[bench]]
+name = "float_sub"  # benches/float_sub.rs
+harness = false
+
+[[bench]]
+name = "float_mul"  # benches/float_mul.rs
+harness = false
+
+[[bench]]
+name = "float_div"  # benches/float_div.rs
+harness = false
+
+[[bench]]
+name = "float_cmp"  # benches/float_cmp.rs
+harness = false
+
+[[bench]]
+name = "float_conv"  # benches/float_conv.rs
+harness = false
+
+[[bench]]
+name = "float_extend"  # benches/float_extend.rs
+harness = false
+
+[[bench]]
+name = "float_trunc"  # benches/float_trunc.rs
+harness = false
diff --git a/testcrate/benches/float_add.rs b/testcrate/benches/float_add.rs
new file mode 100644
index 00000000..3eec169c
--- /dev/null
+++ b/testcrate/benches/float_add.rs
@@ -0,0 +1,59 @@
+#![feature(f128)]
+
+use compiler_builtins::float::add;
+use criterion::{criterion_group, criterion_main, Criterion};
+use testcrate::float_bench;
+
+float_bench! {
+    name: add_f32, // f32 addition: builtins `__addsf3` vs. the system symbol vs. one native instruction
+    sig: (a: f32, b: f32) -> f32,
+    crate_fn: add::__addsf3,
+    sys_fn: __addsf3,
+    sys_available: all(), // empty `all()` is always true: the system symbol is always used
+    asm: [
+        #[cfg(target_arch = "x86_64")]
+        asm!(
+            "addss xmm0, xmm1", // scalar single-precision add, result in xmm0
+            "ret",
+        );
+
+        #[cfg(target_arch = "aarch64")]
+        asm!(
+            "fadd    s0, s0, s1",
+            "ret",
+        );
+    ],
+}
+
+float_bench! {
+    name: add_f64, // f64 addition: builtins `__adddf3` vs. the system symbol vs. one native instruction
+    sig: (a: f64, b: f64) -> f64,
+    crate_fn: add::__adddf3,
+    sys_fn: __adddf3,
+    sys_available: all(), // empty `all()` is always true: the system symbol is always used
+    asm: [
+        #[cfg(target_arch = "x86_64")]
+        asm!(
+            "addsd xmm0, xmm1", // scalar double-precision add, result in xmm0
+            "ret",
+        );
+
+        #[cfg(target_arch = "aarch64")]
+        asm!(
+            "fadd    d0, d0, d1",
+            "ret",
+        );
+    ],
+}
+
+float_bench! {
+    name: add_f128, // f128 addition: builtins `__addtf3` vs. the system symbol
+    sig: (a: f128, b: f128) -> f128,
+    crate_fn: add::__addtf3,
+    sys_fn: __addtf3,
+    sys_available: not(feature = "no-sys-f128"), // system symbol is left out when this feature is set
+    asm: [] // no handwritten assembly baseline
+}
+
+criterion_group!(float_add, add_f32, add_f64, add_f128); // group `float_add` = the benchmark fns defined above
+criterion_main!(float_add); // generates `main`; the bench target is declared with `harness = false`
diff --git a/testcrate/benches/float_cmp.rs b/testcrate/benches/float_cmp.rs
new file mode 100644
index 00000000..5117b621
--- /dev/null
+++ b/testcrate/benches/float_cmp.rs
@@ -0,0 +1,131 @@
+#![feature(f128)]
+
+use criterion::{criterion_group, criterion_main, Criterion};
+use testcrate::float_bench;
+
+use compiler_builtins::float::cmp;
+
+float_bench! {
+    name: cmp_f32_gt, // f32 greater-than: builtins `__gtsf2` vs. the system symbol vs. asm
+    sig: (a: f32, b: f32) -> i32,
+    crate_fn: cmp::__gtsf2,
+    sys_fn: __gtsf2,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "x86_64")]
+        asm!(
+            "xor     eax, eax", // clear the return register before the flag-based set
+            "ucomiss xmm0, xmm1",
+            "seta    al", // 1 when a > b, otherwise 0
+            "ret",
+        );
+
+        #[cfg(target_arch = "aarch64")]
+        asm!(
+            "fcmp    s0, s1",
+            "cset    w0, gt", // 1 when a > b, otherwise 0
+            "ret",
+        );
+    ],
+}
+
+float_bench! {
+    name: cmp_f32_unord, // f32 "unordered" test (either operand NaN): builtins vs. system vs. asm
+    sig: (a: f32, b: f32) -> i32,
+    crate_fn: cmp::__unordsf2,
+    sys_fn: __unordsf2,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "x86_64")]
+        asm!(
+            "cmpunordss xmm0, xmm1", // unordered predicate: all-ones mask iff either operand is NaN
+            "movd       eax, xmm0",
+            "and        eax, 1", // reduce the mask to 0/1
+            "ret",
+        );
+
+        #[cfg(target_arch = "aarch64")]
+        asm!(
+            "fcmp    s0, s1",
+            "cset    w0, vs", // `fcmp` sets V only for an unordered result; `eq` would test equality
+            "ret",
+        );
+    ],
+}
+
+float_bench! {
+    name: cmp_f64_gt, // f64 greater-than: builtins `__gtdf2` vs. the system symbol vs. asm
+    sig: (a: f64, b: f64) -> i32,
+    crate_fn: cmp::__gtdf2,
+    sys_fn: __gtdf2,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "x86_64")]
+        asm!(
+            "xor     eax, eax", // clear the return register before the flag-based set
+            "ucomisd xmm0, xmm1",
+            "seta    al", // 1 when a > b, otherwise 0
+            "ret",
+        );
+
+        #[cfg(target_arch = "aarch64")]
+        asm!(
+            "fcmp    d0, d1",
+            "cset    w0, gt", // 1 when a > b, otherwise 0
+            "ret",
+        );
+    ],
+}
+
+float_bench! {
+    name: cmp_f64_unord, // f64 "unordered" test (either operand NaN): builtins vs. system vs. asm
+    sig: (a: f64, b: f64) -> i32,
+    crate_fn: cmp::__unorddf2,
+    sys_fn: __unorddf2,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "x86_64")]
+        asm!(
+            "cmpunordsd xmm0, xmm1", // unordered predicate: all-ones mask iff either operand is NaN
+            "movq       rax, xmm0",
+            "and        eax, 1", // reduce the mask to 0/1
+            "ret",
+        );
+
+        #[cfg(target_arch = "aarch64")]
+        asm!(
+            "fcmp    d0, d1",
+            "cset    w0, vs", // `fcmp` sets V only for an unordered result; `eq` would test equality
+            "ret",
+        );
+    ],
+}
+
+float_bench! {
+    name: cmp_f128_gt, // f128 greater-than; this name is listed in `SKIP_SYS_CHECKS`
+    sig: (a: f128, b: f128) -> i32,
+    crate_fn: cmp::__gttf2,
+    sys_fn: __gttf2,
+    sys_available: not(feature = "no-sys-f128"), // system symbol is left out when this feature is set
+    asm: [] // no handwritten assembly baseline
+}
+
+float_bench! {
+    name: cmp_f128_unord, // f128 "unordered" test: builtins `__unordtf2` vs. the system symbol
+    sig: (a: f128, b: f128) -> i32,
+    crate_fn: cmp::__unordtf2,
+    sys_fn: __unordtf2,
+    sys_available: not(feature = "no-sys-f128"), // system symbol is left out when this feature is set
+    asm: [] // no handwritten assembly baseline
+}
+
+criterion_group!(
+    float_cmp, // group name; the remaining entries are the benchmark fns defined above
+    cmp_f32_gt,
+    cmp_f32_unord,
+    cmp_f64_gt,
+    cmp_f64_unord,
+    cmp_f128_gt,
+    cmp_f128_unord
+);
+criterion_main!(float_cmp); // generates `main`; the bench target is declared with `harness = false`
diff --git a/testcrate/benches/float_conv.rs b/testcrate/benches/float_conv.rs
new file mode 100644
index 00000000..599816b0
--- /dev/null
+++ b/testcrate/benches/float_conv.rs
@@ -0,0 +1,406 @@
+#![feature(f128)]
+#![allow(improper_ctypes)]
+
+use compiler_builtins::float::conv;
+use criterion::{criterion_group, criterion_main, Criterion};
+use testcrate::float_bench;
+
+/* unsigned int -> float */
+
+float_bench! {
+    name: conv_u32_f32, // u32 -> f32: builtins `__floatunsisf` vs. the system symbol vs. asm
+    sig: (i: u32) -> f32,
+    crate_fn: conv::__floatunsisf,
+    sys_fn: __floatunsisf,
+    sys_available: all(),
+    asm: [
+        #[cfg(all(target_arch = "x86_64", not(target_family = "windows")))]
+        asm!(
+            "mov         eax, edi", // 32-bit move zero-extends the argument into rax
+            "cvtsi2ss    xmm0, rax", // signed 64-bit convert of a zero-extended u32 gives the unsigned value
+            "ret",
+        );
+
+        #[cfg(all(target_arch = "x86_64", target_family = "windows"))]
+        asm!(
+            "mov         eax, ecx", // same sequence, reading the argument from ecx instead of edi
+            "cvtsi2ss    xmm0, rax",
+            "ret",
+        );
+
+        #[cfg(target_arch = "aarch64")]
+        asm!(
+            "ucvtf   s0, w0",
+            "ret",
+        );
+    ],
+}
+
+float_bench! {
+    name: conv_u32_f64, // u32 -> f64: builtins `__floatunsidf` vs. the system symbol vs. asm
+    sig: (i: u32) -> f64,
+    crate_fn: conv::__floatunsidf,
+    sys_fn: __floatunsidf,
+    sys_available: all(),
+    asm: [
+        #[cfg(all(target_arch = "x86_64", not(target_family = "windows")))]
+        asm!(
+            "mov         eax, edi", // 32-bit move zero-extends the argument into rax
+            "cvtsi2sd    xmm0, rax", // signed 64-bit convert of a zero-extended u32 gives the unsigned value
+            "ret",
+        );
+
+        #[cfg(all(target_arch = "x86_64", target_family = "windows"))]
+        asm!(
+            "mov         eax, ecx", // same sequence, reading the argument from ecx instead of edi
+            "cvtsi2sd    xmm0, rax",
+            "ret",
+        );
+
+        #[cfg(target_arch = "aarch64")]
+        asm!(
+            "ucvtf   d0, w0",
+            "ret",
+        );
+    ],
+}
+
+float_bench! {
+    name: conv_u64_f32, // u64 -> f32: builtins `__floatundisf` vs. the system symbol vs. asm
+    sig: (i: u64) -> f32,
+    crate_fn: conv::__floatundisf,
+    sys_fn: __floatundisf,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "aarch64")] // only an aarch64 baseline is provided
+        asm!(
+            "ucvtf   s0, x0",
+            "ret",
+        );
+    ],
+}
+
+float_bench! {
+    name: conv_u64_f64, // u64 -> f64: builtins `__floatundidf` vs. the system symbol vs. asm
+    sig: (i: u64) -> f64,
+    crate_fn: conv::__floatundidf,
+    sys_fn: __floatundidf,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "aarch64")] // only an aarch64 baseline is provided
+        asm!(
+            "ucvtf   d0, x0",
+            "ret",
+        );
+    ],
+}
+
+float_bench! {
+    name: conv_u128_f32, // u128 -> f32: builtins `__floatuntisf` vs. the system symbol
+    sig: (i: u128) -> f32,
+    crate_fn: conv::__floatuntisf,
+    sys_fn: __floatuntisf,
+    sys_available: all(),
+    asm: [] // no handwritten assembly baseline
+}
+
+float_bench! {
+    name: conv_u128_f64, // u128 -> f64: builtins `__floatuntidf` vs. the system symbol
+    sig: (i: u128) -> f64,
+    crate_fn: conv::__floatuntidf,
+    sys_fn: __floatuntidf,
+    sys_available: all(),
+    asm: [] // no handwritten assembly baseline
+}
+
+/* signed int -> float */
+
+float_bench! {
+    name: conv_i32_f32, // i32 -> f32: builtins `__floatsisf` vs. the system symbol vs. asm
+    sig: (i: i32) -> f32,
+    crate_fn: conv::__floatsisf,
+    sys_fn: __floatsisf,
+    sys_available: all(),
+    asm: [
+        #[cfg(all(target_arch = "x86_64", not(target_family = "windows")))]
+        asm!(
+            "cvtsi2ss        xmm0, edi", // reads edi, so excluded on Windows where the argument is in ecx
+            "ret",
+        );
+
+        #[cfg(target_arch = "aarch64")]
+        asm!(
+            "scvtf   s0, w0",
+            "ret",
+
+        );
+    ],
+}
+
+float_bench! {
+    name: conv_i32_f64, // i32 -> f64: builtins `__floatsidf` vs. the system symbol vs. asm
+    sig: (i: i32) -> f64,
+    crate_fn: conv::__floatsidf,
+    sys_fn: __floatsidf,
+    sys_available: all(),
+    asm: [
+        #[cfg(all(target_arch = "x86_64", not(target_family = "windows")))]
+        asm!(
+            "cvtsi2sd        xmm0, edi", // reads edi, so excluded on Windows where the argument is in ecx
+            "ret",
+        );
+
+        #[cfg(target_arch = "aarch64")]
+        asm!(
+            "scvtf   d0, w0",
+            "ret",
+        );
+    ],
+}
+
+float_bench! {
+    name: conv_i64_f32, // i64 -> f32: builtins `__floatdisf` vs. the system symbol vs. asm
+    sig: (i: i64) -> f32,
+    crate_fn: conv::__floatdisf,
+    sys_fn: __floatdisf,
+    sys_available: all(),
+    asm: [
+        #[cfg(all(target_arch = "x86_64", not(target_family = "windows")))]
+        asm!(
+            "cvtsi2ss        xmm0, rdi", // reads rdi, so excluded on Windows where the argument is in rcx
+            "ret",
+        );
+
+        #[cfg(target_arch = "aarch64")]
+        asm!(
+            "scvtf   s0, x0",
+            "ret",
+        );
+    ],
+}
+
+float_bench! {
+    name: conv_i64_f64, // i64 -> f64: builtins `__floatdidf` vs. the system symbol vs. asm
+    sig: (i: i64) -> f64,
+    crate_fn: conv::__floatdidf,
+    sys_fn: __floatdidf,
+    sys_available: all(),
+    asm: [
+        #[cfg(all(target_arch = "x86_64", not(target_family = "windows")))]
+        asm!(
+            "cvtsi2sd        xmm0, rdi", // reads rdi, so excluded on Windows where the argument is in rcx
+            "ret",
+        );
+
+        #[cfg(target_arch = "aarch64")]
+        asm!(
+            "scvtf   d0, x0",
+            "ret",
+        );
+    ],
+}
+
+float_bench! {
+    name: conv_i128_f32, // i128 -> f32: builtins `__floattisf` vs. the system symbol
+    sig: (i: i128) -> f32,
+    crate_fn: conv::__floattisf,
+    sys_fn: __floattisf,
+    sys_available: all(),
+    asm: [] // no handwritten assembly baseline
+}
+
+float_bench! {
+    name: conv_i128_f64, // i128 -> f64: builtins `__floattidf` vs. the system symbol
+    sig: (i: i128) -> f64,
+    crate_fn: conv::__floattidf,
+    sys_fn: __floattidf,
+    sys_available: all(),
+    asm: [] // no handwritten assembly baseline
+}
+
+/* float -> unsigned int */
+
+float_bench! {
+    name: conv_f32_u32, // f32 -> u32: builtins `__fixunssfsi` vs. the system symbol vs. asm
+    sig: (f: f32) -> u32,
+    crate_fn: conv::__fixunssfsi,
+    sys_fn: __fixunssfsi,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "aarch64")] // only an aarch64 baseline is provided
+        asm!(
+           "fcvtzu  w0, s0",
+           "ret",
+        );
+    ],
+}
+float_bench! {
+    name: conv_f32_u64, // f32 -> u64: builtins `__fixunssfdi` vs. the system symbol vs. asm
+    sig: (f: f32) -> u64,
+    crate_fn: conv::__fixunssfdi,
+    sys_fn: __fixunssfdi,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "aarch64")] // only an aarch64 baseline is provided
+        asm!(
+           "fcvtzu  x0, s0",
+           "ret",
+        );
+    ],
+}
+
+float_bench! {
+    name: conv_f32_u128, // f32 -> u128: builtins `__fixunssfti` vs. the system symbol
+    sig: (f: f32) -> u128,
+    crate_fn: conv::__fixunssfti,
+    sys_fn: __fixunssfti,
+    sys_available: all(),
+    asm: [] // no handwritten assembly baseline
+}
+
+float_bench! {
+    name: conv_f64_u32, // f64 -> u32: builtins `__fixunsdfsi` vs. the system symbol vs. asm
+    sig: (f: f64) -> u32,
+    crate_fn: conv::__fixunsdfsi,
+    sys_fn: __fixunsdfsi,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "aarch64")] // only an aarch64 baseline is provided
+        asm!(
+           "fcvtzu  w0, d0",
+           "ret",
+        );
+    ],
+}
+
+float_bench! {
+    name: conv_f64_u64, // f64 -> u64: builtins `__fixunsdfdi` vs. the system symbol vs. asm
+    sig: (f: f64) -> u64,
+    crate_fn: conv::__fixunsdfdi,
+    sys_fn: __fixunsdfdi,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "aarch64")] // only an aarch64 baseline is provided
+        asm!(
+           "fcvtzu  x0, d0",
+           "ret",
+        );
+    ],
+}
+
+float_bench! {
+    name: conv_f64_u128, // f64 -> u128: builtins `__fixunsdfti` vs. the system symbol
+    sig: (f: f64) -> u128,
+    crate_fn: conv::__fixunsdfti,
+    sys_fn: __fixunsdfti,
+    sys_available: all(),
+    asm: [] // no handwritten assembly baseline
+}
+
+/* float -> signed int */
+
+float_bench! {
+    name: conv_f32_i32, // f32 -> i32: builtins `__fixsfsi` vs. the system symbol vs. asm
+    sig: (f: f32) -> i32,
+    crate_fn: conv::__fixsfsi,
+    sys_fn: __fixsfsi,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "aarch64")] // only an aarch64 baseline is provided
+        asm!(
+           "fcvtzs  w0, s0",
+           "ret",
+        );
+    ],
+}
+float_bench! {
+    name: conv_f32_i64, // f32 -> i64: builtins `__fixsfdi` vs. the system symbol vs. asm
+    sig: (f: f32) -> i64,
+    crate_fn: conv::__fixsfdi,
+    sys_fn: __fixsfdi,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "aarch64")] // only an aarch64 baseline is provided
+        asm!(
+           "fcvtzs  x0, s0",
+           "ret",
+        );
+    ],
+}
+
+float_bench! {
+    name: conv_f32_i128, // f32 -> i128: builtins `__fixsfti` vs. the system symbol
+    sig: (f: f32) -> i128,
+    crate_fn: conv::__fixsfti,
+    sys_fn: __fixsfti,
+    sys_available: all(),
+    asm: [] // no handwritten assembly baseline
+}
+
+float_bench! {
+    name: conv_f64_i32, // f64 -> i32: builtins `__fixdfsi` vs. the system symbol vs. asm
+    sig: (f: f64) -> i32,
+    crate_fn: conv::__fixdfsi,
+    sys_fn: __fixdfsi,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "aarch64")] // only an aarch64 baseline is provided
+        asm!(
+           "fcvtzs  w0, d0",
+           "ret",
+        );
+    ],
+}
+
+float_bench! {
+    name: conv_f64_i64, // f64 -> i64: builtins `__fixdfdi` vs. the system symbol vs. asm
+    sig: (f: f64) -> i64,
+    crate_fn: conv::__fixdfdi,
+    sys_fn: __fixdfdi,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "aarch64")] // only an aarch64 baseline is provided
+        asm!(
+           "fcvtzs  x0, d0",
+           "ret",
+        );
+    ],
+}
+
+float_bench! {
+    name: conv_f64_i128, // f64 -> i128: builtins `__fixdfti` vs. the system symbol
+    sig: (f: f64) -> i128,
+    crate_fn: conv::__fixdfti,
+    sys_fn: __fixdfti,
+    sys_available: all(),
+    asm: [] // no handwritten assembly baseline
+}
+
+criterion_group!(
+    float_conv, // group name; the remaining entries are the benchmark fns defined above
+    conv_u32_f32,
+    conv_u32_f64,
+    conv_u64_f32,
+    conv_u64_f64,
+    conv_u128_f32,
+    conv_u128_f64,
+    conv_i32_f32,
+    conv_i32_f64,
+    conv_i64_f32,
+    conv_i64_f64,
+    conv_i128_f32,
+    conv_i128_f64,
+    conv_f32_u32,
+    conv_f32_u64,
+    conv_f32_u128,
+    conv_f32_i32,
+    conv_f32_i64,
+    conv_f32_i128,
+    conv_f64_u32,
+    conv_f64_u64,
+    conv_f64_u128,
+    conv_f64_i32,
+    conv_f64_i64,
+    conv_f64_i128,
+);
+criterion_main!(float_conv); // generates `main`; the bench target is declared with `harness = false`
diff --git a/testcrate/benches/float_div.rs b/testcrate/benches/float_div.rs
new file mode 100644
index 00000000..79a4514f
--- /dev/null
+++ b/testcrate/benches/float_div.rs
@@ -0,0 +1,50 @@
+#![feature(f128)]
+
+use compiler_builtins::float::div;
+use criterion::{criterion_group, criterion_main, Criterion};
+use testcrate::float_bench;
+
+float_bench! {
+    name: div_f32, // f32 division: builtins `__divsf3` vs. the system symbol vs. one native instruction
+    sig: (a: f32, b: f32) -> f32,
+    crate_fn: div::__divsf3,
+    sys_fn: __divsf3,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "x86_64")]
+        asm!(
+            "divss xmm0, xmm1", // scalar single-precision divide, result in xmm0
+            "ret",
+        );
+
+        #[cfg(target_arch = "aarch64")]
+        asm!(
+           "fdiv    s0, s0, s1",
+           "ret",
+        );
+    ],
+}
+
+float_bench! {
+    name: div_f64, // f64 division: builtins `__divdf3` vs. the system symbol vs. one native instruction
+    sig: (a: f64, b: f64) -> f64,
+    crate_fn: div::__divdf3,
+    sys_fn: __divdf3,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "x86_64")]
+        asm!(
+            "divsd xmm0, xmm1", // scalar double-precision divide, result in xmm0
+            "ret",
+        );
+
+        #[cfg(target_arch = "aarch64")]
+        asm!(
+           "fdiv    d0, d0, d1",
+           "ret",
+        );
+    ],
+}
+
+criterion_group!(float_div, div_f32, div_f64); // group `float_div` = the benchmark fns defined above
+criterion_main!(float_div); // generates `main`; the bench target is declared with `harness = false`
diff --git a/testcrate/benches/float_extend.rs b/testcrate/benches/float_extend.rs
new file mode 100644
index 00000000..6985d242
--- /dev/null
+++ b/testcrate/benches/float_extend.rs
@@ -0,0 +1,73 @@
+#![feature(f128)]
+#![feature(f16)]
+
+use compiler_builtins::float::extend;
+use criterion::{criterion_group, criterion_main, Criterion};
+use testcrate::float_bench;
+
+float_bench! {
+    name: extend_f16_f32, // f16 -> f32 widening; this name is listed in `SKIP_SYS_CHECKS`
+    sig: (f: f16) -> f32,
+    crate_fn: extend::__extendhfsf2,
+    sys_fn: __extendhfsf2,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "aarch64")] // only an aarch64 baseline is provided
+        asm!(
+            "fcvt    s0, h0",
+            "ret",
+        );
+    ],
+}
+
+float_bench! {
+    name: extend_f16_f128, // f16 -> f128 widening: builtins `__extendhftf2` vs. the system symbol
+    sig: (f: f16) -> f128,
+    crate_fn: extend::__extendhftf2,
+    sys_fn: __extendhftf2,
+    sys_available: not(feature = "no-sys-f128"), // system symbol is left out when this feature is set
+    asm: [], // no handwritten assembly baseline
+}
+
+float_bench! {
+    name: extend_f32_f64, // f32 -> f64 widening: builtins `__extendsfdf2` vs. the system symbol vs. asm
+    sig: (f: f32) -> f64,
+    crate_fn: extend::__extendsfdf2,
+    sys_fn: __extendsfdf2,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "aarch64")] // only an aarch64 baseline is provided
+        asm!(
+            "fcvt    d0, s0",
+            "ret",
+        );
+    ],
+}
+
+float_bench! {
+    name: extend_f32_f128, // f32 -> f128 widening: builtins `__extendsftf2` vs. the system symbol
+    sig: (f: f32) -> f128,
+    crate_fn: extend::__extendsftf2,
+    sys_fn: __extendsftf2,
+    sys_available: not(feature = "no-sys-f128"), // system symbol is left out when this feature is set
+    asm: [], // no handwritten assembly baseline
+}
+
+float_bench! {
+    name: extend_f64_f128, // f64 -> f128 widening: builtins `__extenddftf2` vs. the system symbol
+    sig: (f: f64) -> f128,
+    crate_fn: extend::__extenddftf2,
+    sys_fn: __extenddftf2,
+    sys_available: not(feature = "no-sys-f128"), // system symbol is left out when this feature is set
+    asm: [], // no handwritten assembly baseline
+}
+
+criterion_group!(
+    float_extend, // group name; the remaining entries are the benchmark fns defined above
+    extend_f16_f32,
+    extend_f16_f128,
+    extend_f32_f64,
+    extend_f32_f128,
+    extend_f64_f128,
+);
+criterion_main!(float_extend); // generates `main`; the bench target is declared with `harness = false`
diff --git a/testcrate/benches/float_mul.rs b/testcrate/benches/float_mul.rs
new file mode 100644
index 00000000..daaeb20c
--- /dev/null
+++ b/testcrate/benches/float_mul.rs
@@ -0,0 +1,59 @@
+#![feature(f128)]
+
+use compiler_builtins::float::mul;
+use criterion::{criterion_group, criterion_main, Criterion};
+use testcrate::float_bench;
+
+float_bench! {
+    name: mul_f32, // f32 multiply; names containing "mul" are skipped by the macro's asm correctness check
+    sig: (a: f32, b: f32) -> f32,
+    crate_fn: mul::__mulsf3,
+    sys_fn: __mulsf3,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "x86_64")]
+        asm!(
+            "mulss xmm0, xmm1", // scalar single-precision multiply, result in xmm0
+            "ret",
+        );
+
+        #[cfg(target_arch = "aarch64")]
+        asm!(
+           "fmul    s0, s0, s1",
+           "ret",
+        );
+    ],
+}
+
+float_bench! {
+    name: mul_f64, // f64 multiply; names containing "mul" are skipped by the macro's asm correctness check
+    sig: (a: f64, b: f64) -> f64,
+    crate_fn: mul::__muldf3,
+    sys_fn: __muldf3,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "x86_64")]
+        asm!(
+            "mulsd xmm0, xmm1", // scalar double-precision multiply, result in xmm0
+            "ret",
+        );
+
+        #[cfg(target_arch = "aarch64")]
+        asm!(
+           "fmul    d0, d0, d1",
+           "ret",
+        );
+    ],
+}
+
+float_bench! {
+    name: mul_f128, // f128 multiply; this name is listed in `SKIP_SYS_CHECKS`
+    sig: (a: f128, b: f128) -> f128,
+    crate_fn: mul::__multf3,
+    sys_fn: __multf3,
+    sys_available: not(feature = "no-sys-f128"), // system symbol is left out when this feature is set
+    asm: [] // no handwritten assembly baseline
+}
+
+criterion_group!(float_mul, mul_f32, mul_f64, mul_f128); // group `float_mul` = the benchmark fns defined above
+criterion_main!(float_mul); // generates `main`; the bench target is declared with `harness = false`
diff --git a/testcrate/benches/float_sub.rs b/testcrate/benches/float_sub.rs
new file mode 100644
index 00000000..19b20a26
--- /dev/null
+++ b/testcrate/benches/float_sub.rs
@@ -0,0 +1,59 @@
+#![feature(f128)]
+
+use compiler_builtins::float::sub;
+use criterion::{criterion_group, criterion_main, Criterion};
+use testcrate::float_bench;
+
+float_bench! {
+    name: sub_f32, // f32 subtraction: builtins `__subsf3` vs. the system symbol vs. one native instruction
+    sig: (a: f32, b: f32) -> f32,
+    crate_fn: sub::__subsf3,
+    sys_fn: __subsf3,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "x86_64")]
+        asm!(
+            "subss xmm0, xmm1", // scalar single-precision subtract, result in xmm0
+            "ret",
+        );
+
+        #[cfg(target_arch = "aarch64")]
+        asm!(
+           "fsub    s0, s0, s1",
+           "ret",
+        );
+    ],
+}
+
+float_bench! {
+    name: sub_f64, // f64 subtraction: builtins `__subdf3` vs. the system symbol vs. one native instruction
+    sig: (a: f64, b: f64) -> f64,
+    crate_fn: sub::__subdf3,
+    sys_fn: __subdf3,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "x86_64")]
+        asm!(
+            "subsd xmm0, xmm1", // scalar double-precision subtract, result in xmm0
+            "ret",
+        );
+
+        #[cfg(target_arch = "aarch64")]
+        asm!(
+           "fsub    d0, d0, d1",
+           "ret",
+        );
+    ],
+}
+
+float_bench! {
+    name: sub_f128, // f128 subtraction: builtins `__subtf3` vs. the system symbol
+    sig: (a: f128, b: f128) -> f128,
+    crate_fn: sub::__subtf3,
+    sys_fn: __subtf3,
+    sys_available: not(feature = "no-sys-f128"), // system symbol is left out when this feature is set
+    asm: [] // no handwritten assembly baseline
+}
+
+criterion_group!(float_sub, sub_f32, sub_f64, sub_f128); // group `float_sub` = the benchmark fns defined above
+criterion_main!(float_sub); // generates `main`; the bench target is declared with `harness = false`
diff --git a/testcrate/benches/float_trunc.rs b/testcrate/benches/float_trunc.rs
new file mode 100644
index 00000000..860e6d84
--- /dev/null
+++ b/testcrate/benches/float_trunc.rs
@@ -0,0 +1,95 @@
+#![feature(f128)]
+#![feature(f16)]
+
+use compiler_builtins::float::trunc;
+use criterion::{criterion_group, criterion_main, Criterion};
+use testcrate::float_bench;
+
+float_bench! {
+    name: trunc_f32_f16, // f32 -> f16 narrowing; this name is listed in `SKIP_SYS_CHECKS`
+    sig: (f: f32) -> f16,
+    crate_fn: trunc::__truncsfhf2,
+    sys_fn: __truncsfhf2,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "aarch64")] // only an aarch64 baseline is provided
+        asm!(
+            "fcvt    h0, s0",
+            "ret",
+        );
+    ],
+}
+
+float_bench! {
+    name: trunc_f64_f16, // f64 -> f16 narrowing; this name is listed in `SKIP_SYS_CHECKS`
+    sig: (f: f64) -> f16,
+    crate_fn: trunc::__truncdfhf2,
+    sys_fn: __truncdfhf2,
+    sys_available: not(feature = "no-sys-f128"), // NOTE(review): gated on the f128 feature although no f128 is involved; confirm this is intended
+    asm: [
+        #[cfg(target_arch = "aarch64")] // only an aarch64 baseline is provided
+        asm!(
+            "fcvt    h0, d0",
+            "ret",
+        );
+    ],
+}
+
+float_bench! {
+    name: trunc_f64_f32, // f64 -> f32 narrowing: builtins `__truncdfsf2` vs. the system symbol vs. asm
+    sig: (f: f64) -> f32,
+    crate_fn: trunc::__truncdfsf2,
+    sys_fn: __truncdfsf2,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "x86_64")]
+        asm!(
+            "cvtsd2ss        xmm0, xmm0", // converts in place: argument and result both live in xmm0
+            "ret",
+        );
+
+        #[cfg(target_arch = "aarch64")]
+        asm!(
+            "fcvt    s0, d0",
+            "ret",
+        );
+    ],
+}
+
+float_bench! {
+    name: trunc_f128_f16, // f128 -> f16 narrowing: builtins `__trunctfhf2` vs. the system symbol
+    sig: (f: f128) -> f16,
+    crate_fn: trunc::__trunctfhf2,
+    sys_fn: __trunctfhf2,
+    sys_available: not(feature = "no-sys-f128"), // system symbol is left out when this feature is set
+    asm: [], // no handwritten assembly baseline
+}
+
+float_bench! {
+    name: trunc_f128_f32, // f128 -> f32 narrowing: builtins `__trunctfsf2` vs. the system symbol
+    sig: (f: f128) -> f32,
+    crate_fn: trunc::__trunctfsf2,
+    sys_fn: __trunctfsf2,
+    sys_available: not(feature = "no-sys-f128"), // system symbol is left out when this feature is set
+    asm: [], // no handwritten assembly baseline
+}
+
+float_bench! {
+    name: trunc_f128_f64, // f128 -> f64 narrowing: builtins `__trunctfdf2` vs. the system symbol
+    sig: (f: f128) -> f64,
+    crate_fn: trunc::__trunctfdf2,
+    sys_fn: __trunctfdf2,
+    sys_available: not(feature = "no-sys-f128"), // system symbol is left out when this feature is set
+    asm: [], // no handwritten assembly baseline
+}
+
+criterion_group!(
+    float_trunc, // group name; the remaining entries are the benchmark fns defined above
+    trunc_f32_f16,
+    trunc_f64_f16,
+    trunc_f64_f32,
+    trunc_f128_f16,
+    trunc_f128_f32,
+    trunc_f128_f64,
+);
+criterion_main!(float_trunc); // generates `main`; the bench target is declared with `harness = false`
diff --git a/testcrate/src/bench.rs b/testcrate/src/bench.rs
new file mode 100644
index 00000000..fb7e5266
--- /dev/null
+++ b/testcrate/src/bench.rs
@@ -0,0 +1,268 @@
+use core::cell::RefCell;
+
+use alloc::vec::Vec;
+use compiler_builtins::float::Float;
+
+/// Fuzz with this many items to check that the implementations agree
+pub const CHECK_ITER_ITEMS: u32 = 10_000;
+/// Benchmark with this many items to get a variety
+pub const BENCH_ITER_ITEMS: u32 = 500;
+
+/// Benchmark names (the `name:` given to `float_bench!`) that are still benchmarked but whose
+/// compiler-builtins results are not checked against the system functions
+pub const SKIP_SYS_CHECKS: &[&str] = &[
+    // FIXME: some sort of precision error (tested on aarch64)
+    "extend_f16_f32",
+    "trunc_f32_f16",
+    // We return -1, system functions on x86 return -2
+    "cmp_f128_gt",
+    // FIXME: rounding error
+    // <https://github.com/rust-lang/compiler-builtins/issues/616#issuecomment-2121060728>
+    "mul_f128",
+    // System symbols do the wrong thing
+    // <https://github.com/rust-lang/compiler-builtins/issues/617>
+    "trunc_f64_f16",
+];
+
+/// Create a comparison of the system symbol, compiler_builtins, and optionally handwritten
+/// assembly. Expands to `fn $name(c: &mut Criterion)`, which first cross-checks the
+/// implementations on fuzzed inputs and then benchmarks each one that is available.
+/// `asm!` gets turned into global assembly, more or less a naked function.
+#[macro_export]
+macro_rules! float_bench {
+    (
+        // Name of this benchmark
+        name: $name:ident,
+        // The function signature to be tested
+        sig: ($($arg:ident: $arg_ty:ty),*) -> $ret_ty:ty,
+        // Path to the function in compiler_builtins
+        crate_fn: $crate_fn:path,
+        // Name of the system symbol
+        sys_fn: $sys_fn:ident,
+        // Meta saying whether the system symbol is available
+        sys_available: $sys_available:meta,
+        // Assembly implementations, if any.
+        asm: [
+            $(
+                #[$asm_meta:meta]
+                asm!($($asm_tt:tt)*)
+            );*
+            $(;)?
+        ]
+        $(,)?
+    ) => {paste::paste! {
+        #[allow(dead_code)] // either binding may go unused, depending on the cfgs below
+        extern "C" {
+            /// Assembly function name
+            fn [<$name _asm>]($($arg: $arg_ty),*) -> $ret_ty;
+
+            /// Binding for the system function
+            fn $sys_fn($($arg: $arg_ty),*) -> $ret_ty;
+        }
+
+        $(
+            #[$asm_meta]
+            #[cfg(not(target_vendor = "apple"))]
+            core::arch::global_asm!(
+                concat!(".global ", stringify!([<$name _asm>])),
+                concat!(stringify!([<$name _asm>]), ":"),
+                $($asm_tt)*
+            );
+
+            #[$asm_meta]
+            #[cfg(target_vendor = "apple")]
+            core::arch::global_asm!(
+                // mac targets have a leading `_` in assembly symbol names
+                concat!(".global _", stringify!([<$name _asm>])),
+                concat!("_", stringify!([<$name _asm>]), ":"),
+                $($asm_tt)*
+            );
+        )*
+
+        fn $name(c: &mut Criterion) {
+            use core::hint::black_box;
+            use compiler_builtins::float::Float;
+            use $crate::bench::BenchType;
+
+            #[inline(never)] // equalize with external calls
+            fn crate_fn($($arg: $arg_ty),*) -> $ret_ty {
+                $crate_fn( $($arg),* )
+            }
+
+            #[inline(always)] // already a branch
+            #[cfg($sys_available)]
+            fn sys_fn($($arg: $arg_ty),*) -> $ret_ty {
+                unsafe { $sys_fn( $($arg),* ) } // relies on the extern declaration above matching the symbol
+            }
+
+            #[inline(always)] // already a branch; `never` would add a second call on top of the extern one
+            fn asm_fn($($arg: $arg_ty),*) -> $ret_ty {
+                unsafe { [<$name _asm>]( $($arg),* ) } // only reached when an `asm!` block's cfg matched
+            }
+
+            let testvec = <($($arg_ty),*)>::make_testvec($crate::bench::CHECK_ITER_ITEMS);
+            let benchvec = <($($arg_ty),*)>::make_testvec($crate::bench::BENCH_ITER_ITEMS);
+            let title = stringify!($name);
+
+            // Verify math lines up
+
+            #[cfg($sys_available)]
+            for ($($arg),*) in testvec.iter().copied() {
+                if $crate::bench::SKIP_SYS_CHECKS.contains(&title) {
+                    continue;
+                }
+
+                let crate_res = crate_fn($($arg),*);
+                let sys_res = sys_fn($($arg),*);
+                assert!(
+                    $ret_ty::check_eq(crate_res, sys_res),
+                    "{title}{:?}: crate: {crate_res:?}, sys: {sys_res:?}",
+                    ($($arg),* ,)
+                );
+            }
+
+            // use a binding to get around nested macro repetition
+            let do_asm_check = || {
+                for ($($arg),*) in testvec.iter().copied() {
+                    // FIXME: these fail for float multiplication
+                    // <https://github.com/rust-lang/compiler-builtins/issues/616>
+                    if title.contains("mul")
+                        // cmp is skipped because builtins do spaceship but assembly does
+                        // a single operation.
+                        || title.contains("cmp") {
+                        continue;
+                    }
+
+                    let crate_res = crate_fn($($arg),*);
+                    let asm_res = asm_fn($($arg),*);
+
+                    assert!(
+                        $ret_ty::check_eq(crate_res, asm_res),
+                        "{title}{:?}: crate: {crate_res:?}, asm: {asm_res:?}",
+                        ($($arg),* ,)
+                    );
+                }
+            };
+            $(
+                #[$asm_meta] // runs once per `asm!` block whose cfg matches the target
+                do_asm_check();
+            )*
+
+            c.bench_function(&format!("{title} compiler-builtins"), |b| {
+                b.iter(|| {
+                    for ($($arg),*) in benchvec.iter().copied() {
+                        black_box(crate_fn( $(black_box($arg)),* ));
+                    }
+                })
+            });
+
+            #[cfg($sys_available)]
+            c.bench_function(&format!("{title} system"), |b| {
+                b.iter(|| {
+                    for ($($arg),*) in benchvec.iter().copied() {
+                        black_box(sys_fn( $(black_box($arg)),* ));
+                    }
+                })
+            });
+
+            // use a binding to get around nested macro repetition
+            let mut do_asm_bench = || {
+                c.bench_function(&format!(
+                    "{title} assembly {} {}", std::env::consts::ARCH, std::env::consts::FAMILY
+                ), |b| {
+                    b.iter(|| {
+                        for ($($arg),*) in benchvec.iter().copied() {
+                            black_box(asm_fn( $(black_box($arg)),* ));
+                        }
+                    })
+                });
+            };
+            $(
+                #[$asm_meta] // runs once per `asm!` block whose cfg matches the target
+                do_asm_bench();
+            )*
+        }
+    }};
+
+    (@coalesce $a:ty, $b:ty) => { $a }; // NOTE(review): the `@` arms are not used by the main arm
+    (@coalesce , $b:ty) => { $b };
+
+    // Default to float comparison
+    (@eq $f_ty:ty,) => {
+        <$f_ty as Float>::eq_repr
+    };
+    // Use normal eq if the return type is not a float
+    (@eq $f_ty:ty, $ret_ty:ty) => {
+        |a: $ret_ty, b: $ret_ty| a == b
+    };
+
+}
+
+/// A type used as either an input or output to/from a benchmark function.
+pub trait BenchType: Sized {
+    fn make_testvec(len: u32) -> Vec<Self>; // build fuzzed inputs; `len` is forwarded to the crate's fuzz helpers
+    fn check_eq(a: Self, b: Self) -> bool; // whether two results count as equal (unimplemented for pairs)
+}
+
+/// Implements `BenchType` for each listed scalar type `T` and for the pair `(T, T)`.
+macro_rules! impl_benchtype {
+    (float $($float:ty),+) => {$(
+        impl BenchType for $float {
+            fn make_testvec(len: u32) -> Vec<Self> {
+                // The fuzz helpers take a `Fn`, so collect through interior mutability.
+                let sink = RefCell::new(Vec::new());
+                crate::fuzz_float(len, |x| sink.borrow_mut().push(x));
+                sink.into_inner()
+            }
+            fn check_eq(a: Self, b: Self) -> bool {
+                <Self as Float>::eq_repr(a, b)
+            }
+        }
+
+        impl BenchType for ($float, $float) {
+            fn make_testvec(len: u32) -> Vec<Self> {
+                // The fuzz helpers take a `Fn`, so collect through interior mutability.
+                let sink = RefCell::new(Vec::new());
+                crate::fuzz_float_2(len, |lhs, rhs| sink.borrow_mut().push((lhs, rhs)));
+                sink.into_inner()
+            }
+
+            fn check_eq(_a: Self, _b: Self) -> bool {
+                unimplemented!()
+            }
+        }
+    )*};
+    (int $($int:ty),+) => {$(
+        impl BenchType for $int {
+            fn make_testvec(len: u32) -> Vec<Self> {
+                // The fuzz helpers take a `Fn`, so collect through interior mutability.
+                let sink = RefCell::new(Vec::new());
+                crate::fuzz(len, |x| sink.borrow_mut().push(x));
+                sink.into_inner()
+            }
+
+            fn check_eq(a: Self, b: Self) -> bool {
+                a.eq(&b)
+            }
+        }
+
+        impl BenchType for ($int, $int) {
+            fn make_testvec(len: u32) -> Vec<Self> {
+                // The fuzz helpers take a `Fn`, so collect through interior mutability.
+                let sink = RefCell::new(Vec::new());
+                crate::fuzz_2(len, |lhs, rhs| sink.borrow_mut().push((lhs, rhs)));
+                sink.into_inner()
+            }
+
+            fn check_eq(_a: Self, _b: Self) -> bool {
+                unimplemented!()
+            }
+        }
+    )*};
+}
+
+#[cfg(not(feature = "no-f16-f128"))] // applies only to the f16/f128 line directly below
+impl_benchtype!(float f16, f128);
+impl_benchtype!(float f32, f64);
+impl_benchtype!(int i16, i32, i64, i128); // each invocation also implements the `(T, T)` pair
+impl_benchtype!(int u16, u32, u64, u128);
diff --git a/testcrate/src/lib.rs b/testcrate/src/lib.rs
index 1f3a4b82..66a684d3 100644
--- a/testcrate/src/lib.rs
+++ b/testcrate/src/lib.rs
@@ -13,6 +13,11 @@
 //! Some floating point tests are disabled for specific architectures, because they do not have
 //! correct rounding.
 #![no_std]
+#![cfg_attr(not(feature = "no-f16-f128"), feature(f128))]
+#![cfg_attr(not(feature = "no-f16-f128"), feature(f16))]
+
+pub mod bench;
+extern crate alloc;
 
 use compiler_builtins::float::Float;
 use compiler_builtins::int::{Int, MinInt};