From c98d136ca2670c4c1acc0529e66df3a14f4bc97b Mon Sep 17 00:00:00 2001 From: Brendan Hansknecht Date: Mon, 18 Sep 2023 09:46:22 -0700 Subject: [PATCH 1/5] ensure correct assembly generation without extra data movement --- .../builtins/bitcode/benchmark/dec.zig | 53 +++++++++---------- 1 file changed, 26 insertions(+), 27 deletions(-) diff --git a/crates/compiler/builtins/bitcode/benchmark/dec.zig b/crates/compiler/builtins/bitcode/benchmark/dec.zig index 05d41ce8c89..c92ed6469c6 100644 --- a/crates/compiler/builtins/bitcode/benchmark/dec.zig +++ b/crates/compiler/builtins/bitcode/benchmark/dec.zig @@ -24,12 +24,12 @@ pub fn main() !void { try stdout.print("Warning: Timer seems to step in units of 41ns\n\n", .{}); timer = try Timer.start(); - const n = 10000; + const n = 1000; // This number are very close to 1 to avoid over and underflow. const str1 = "1.00123"; const f1 = 1.00123; - const dec1 = RocDec.fromStr(RocStr.init(str1, 3)).?; + const dec1 = RocDec.fromStr(RocStr.init(str1, str1.len)).?; try stdout.print("Dec:\n", .{}); try stdout.print("{} additions took ", .{n}); @@ -78,15 +78,16 @@ pub fn main() !void { try stdout.print("asin: {d:0.2}\n", .{@intToFloat(f64, decAsin) / @intToFloat(f64, f64Asin)}); } -fn avg_runs(comptime T: type, comptime n: usize, op: fn (T, T) T, v: T) !u64 { +fn avg_runs(comptime T: type, comptime n: usize, comptime op: fn (T, T) T, v: T) !u64 { const stdout = std.io.getStdOut().writer(); - const repeats = 1000; + const repeats = 10000; var runs = [_]u64{0} ** repeats; var i: usize = 0; while (i < repeats) : (i += 1) { - runs[i] = run(T, n, op, v); + // Never inline run to ensure it doesn't optimize for the value of `v`. + runs[i] = callWrapper(u64, .never_inline, run, .{ T, n, op, v }); } std.sort.sort(u64, &runs, {}, comptime std.sort.asc(u64)); @@ -99,43 +100,41 @@ fn avg_runs(comptime T: type, comptime n: usize, op: fn (T, T) T, v: T) !u64 { return median; } -fn run(comptime T: type, comptime n: usize, op: fn (T, T) T, v: T) u64 { +fn run(comptime T: type, comptime n: usize, comptime op: fn (T, T) T, v: T) u64 { var a = v; timer.reset(); // Split into outer and inner loop to avoid breaking comptime. - comptime var outer = n / 500; - comptime var inner = std.math.min(n, 500); + const max_inline = 100; + comptime var outer = n / max_inline; + comptime var inner = std.math.min(n, max_inline); var i: usize = 0; while (i < outer) : (i += 1) { comptime var j = 0; inline while (j < inner) : (j += 1) { - a = op(a, v); - - // Clobber a to avoid optimizations and removal of dead code. - asm volatile ("" - : - : [a] "r,m" (&a), - : "memory" - ); + a = callWrapper(T, .always_inline, op, .{ a, v }); } } - comptime var rem = n % 500; - i = 0; - inline while (i < rem) : (i += 1) { - a = op(a, v); - - // Clobber a to avoid optimizations and removal of dead code. - asm volatile ("" - : - : [a] "r,m" (&a), - : "memory" - ); + const rem = n % max_inline; + comptime var j = 0; + inline while (j < rem) : (j += 1) { + a = callWrapper(T, .always_inline, op, .{ a, v }); } + // Clobber `a` to avoid removal as dead code. + asm volatile ("" + : + : [a] "r,m" (&a), + : "memory" + ); return timer.read(); } +// This is needed to work around a bug with using `@call` in loops. +inline fn callWrapper(comptime T: type, call_modifier: anytype, comptime func: anytype, params: anytype) T { + return @call(.{ .modifier = call_modifier }, func, params); +} + fn addF64(x: f64, y: f64) f64 { return x + y; } From a3ee58155cbadcfdf21d196616720dfa8f2b320c Mon Sep 17 00:00:00 2001 From: Brendan Hansknecht Date: Mon, 18 Sep 2023 10:02:39 -0700 Subject: [PATCH 2/5] add other trig functions to dec benchmark --- .../builtins/bitcode/benchmark/dec.zig | 52 +++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/crates/compiler/builtins/bitcode/benchmark/dec.zig b/crates/compiler/builtins/bitcode/benchmark/dec.zig index c92ed6469c6..92fa47c5da0 100644 --- a/crates/compiler/builtins/bitcode/benchmark/dec.zig +++ b/crates/compiler/builtins/bitcode/benchmark/dec.zig @@ -47,9 +47,21 @@ pub fn main() !void { try stdout.print("{} sin took ", .{n}); const decSin = try avg_runs(RocDec, n, sinDec, dec1); + try stdout.print("{} cos took ", .{n}); + const decCos = try avg_runs(RocDec, n, cosDec, dec1); + + try stdout.print("{} tan took ", .{n}); + const decTan = try avg_runs(RocDec, n, tanDec, dec1); + try stdout.print("{} asin took ", .{n}); const decAsin = try avg_runs(RocDec, n, asinDec, dec1); + try stdout.print("{} acos took ", .{n}); + const decAcos = try avg_runs(RocDec, n, acosDec, dec1); + + try stdout.print("{} atan took ", .{n}); + const decAtan = try avg_runs(RocDec, n, atanDec, dec1); + try stdout.print("\n\nF64:\n", .{}); try stdout.print("{} additions took ", .{n}); const f64Add = try avg_runs(f64, n, addF64, f1); @@ -66,16 +78,32 @@ pub fn main() !void { try stdout.print("{} sin took ", .{n}); const f64Sin = try avg_runs(f64, n, sinF64, f1); + try stdout.print("{} cos took ", .{n}); + const f64Cos = try avg_runs(f64, n, cosF64, f1); + + try stdout.print("{} tan took ", .{n}); + const f64Tan = try avg_runs(f64, n, tanF64, f1); + try stdout.print("{} asin took ", .{n}); const f64Asin = try avg_runs(f64, n, asinF64, f1); + try stdout.print("{} acos took ", .{n}); + const f64Acos = try avg_runs(f64, n, acosF64, f1); + + try stdout.print("{} atan took ", .{n}); + const f64Atan = try avg_runs(f64, n, atanF64, f1); + try stdout.print("\n\nDec/F64:\n", .{}); try stdout.print("addition: {d:0.2}\n", .{@intToFloat(f64, decAdd) / @intToFloat(f64, f64Add)}); try stdout.print("subtraction: {d:0.2}\n", .{@intToFloat(f64, decSub) / @intToFloat(f64, f64Sub)}); try stdout.print("multiplication: {d:0.2}\n", .{@intToFloat(f64, decMul) / @intToFloat(f64, f64Mul)}); try stdout.print("division: {d:0.2}\n", .{@intToFloat(f64, decDiv) / @intToFloat(f64, f64Div)}); try stdout.print("sin: {d:0.2}\n", .{@intToFloat(f64, decSin) / @intToFloat(f64, f64Sin)}); + try stdout.print("cos: {d:0.2}\n", .{@intToFloat(f64, decCos) / @intToFloat(f64, f64Cos)}); + try stdout.print("tan: {d:0.2}\n", .{@intToFloat(f64, decTan) / @intToFloat(f64, f64Tan)}); try stdout.print("asin: {d:0.2}\n", .{@intToFloat(f64, decAsin) / @intToFloat(f64, f64Asin)}); + try stdout.print("acos: {d:0.2}\n", .{@intToFloat(f64, decAcos) / @intToFloat(f64, f64Acos)}); + try stdout.print("atan: {d:0.2}\n", .{@intToFloat(f64, decAtan) / @intToFloat(f64, f64Atan)}); } fn avg_runs(comptime T: type, comptime n: usize, comptime op: fn (T, T) T, v: T) !u64 { @@ -150,13 +178,37 @@ fn divF64(x: f64, y: f64) f64 { fn sinF64(x: f64, _: f64) f64 { return std.math.sin(x); } +fn cosF64(x: f64, _: f64) f64 { + return std.math.cos(x); +} +fn tanF64(x: f64, _: f64) f64 { + return std.math.tan(x); +} fn asinF64(x: f64, _: f64) f64 { return std.math.asin(x); } +fn acosF64(x: f64, _: f64) f64 { + return std.math.acos(x); +} +fn atanF64(x: f64, _: f64) f64 { + return std.math.atan(x); +} fn sinDec(x: RocDec, _: RocDec) RocDec { return x.sin(); } +fn cosDec(x: RocDec, _: RocDec) RocDec { + return x.cos(); +} +fn tanDec(x: RocDec, _: RocDec) RocDec { + return x.tan(); +} fn asinDec(x: RocDec, _: RocDec) RocDec { return x.asin(); } +fn acosDec(x: RocDec, _: RocDec) RocDec { + return x.acos(); +} +fn atanDec(x: RocDec, _: RocDec) RocDec { + return x.atan(); +} From aee54a44dcab6ff84eff0f86911a3759bc791b74 Mon Sep 17 00:00:00 2001 From: Brendan Hansknecht Date: Mon, 18 Sep 2023 10:34:59 -0700 Subject: [PATCH 3/5] fix asin and acos benchmark --- .../builtins/bitcode/benchmark/dec.zig | 25 +++++++++++++------ 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/crates/compiler/builtins/bitcode/benchmark/dec.zig b/crates/compiler/builtins/bitcode/benchmark/dec.zig index 92fa47c5da0..ae119477f6b 100644 --- a/crates/compiler/builtins/bitcode/benchmark/dec.zig +++ b/crates/compiler/builtins/bitcode/benchmark/dec.zig @@ -27,9 +27,12 @@ pub fn main() !void { const n = 1000; // This number are very close to 1 to avoid over and underflow. - const str1 = "1.00123"; const f1 = 1.00123; - const dec1 = RocDec.fromStr(RocStr.init(str1, str1.len)).?; + const dec1 = RocDec.fromF64(f1).?; + + // `asin` and `acos` have a limited range, so they will use this value. + const f2 = 0.00130000847; + const dec2 = RocDec.fromF64(f2).?; try stdout.print("Dec:\n", .{}); try stdout.print("{} additions took ", .{n}); @@ -54,10 +57,10 @@ pub fn main() !void { const decTan = try avg_runs(RocDec, n, tanDec, dec1); try stdout.print("{} asin took ", .{n}); - const decAsin = try avg_runs(RocDec, n, asinDec, dec1); + const decAsin = try avg_runs(RocDec, n, asinDec, dec2); try stdout.print("{} acos took ", .{n}); - const decAcos = try avg_runs(RocDec, n, acosDec, dec1); + const decAcos = try avg_runs(RocDec, n, acosDec, dec2); try stdout.print("{} atan took ", .{n}); const decAtan = try avg_runs(RocDec, n, atanDec, dec1); @@ -85,10 +88,10 @@ pub fn main() !void { const f64Tan = try avg_runs(f64, n, tanF64, f1); try stdout.print("{} asin took ", .{n}); - const f64Asin = try avg_runs(f64, n, asinF64, f1); + const f64Asin = try avg_runs(f64, n, asinF64, f2); try stdout.print("{} acos took ", .{n}); - const f64Acos = try avg_runs(f64, n, acosF64, f1); + const f64Acos = try avg_runs(f64, n, acosF64, f2); try stdout.print("{} atan took ", .{n}); const f64Atan = try avg_runs(f64, n, atanF64, f1); @@ -187,8 +190,11 @@ fn tanF64(x: f64, _: f64) f64 { fn asinF64(x: f64, _: f64) f64 { return std.math.asin(x); } +const pi_over_2 = std.math.pi / 2.0; fn acosF64(x: f64, _: f64) f64 { - return std.math.acos(x); + // acos is only stable if we subtract pi/2. + // The perf should be essentially the same because subtraction is much faster than acos. + return std.math.acos(x) - pi_over_2; } fn atanF64(x: f64, _: f64) f64 { return std.math.atan(x); @@ -206,8 +212,11 @@ fn tanDec(x: RocDec, _: RocDec) RocDec { fn asinDec(x: RocDec, _: RocDec) RocDec { return x.asin(); } +const pi_over_2_dec = RocDec.fromF64(pi_over_2).?; fn acosDec(x: RocDec, _: RocDec) RocDec { - return x.acos(); + // acos is only stable if we subtract pi/2. + // The perf should be essentially the same because subtraction is much faster than acos. + return x.acos().sub(pi_over_2_dec); } fn atanDec(x: RocDec, _: RocDec) RocDec { return x.atan(); From 9edd3eaca056fe838083c24d7d84437b6f728395 Mon Sep 17 00:00:00 2001 From: Brendan Hansknecht Date: Mon, 18 Sep 2023 11:02:49 -0700 Subject: [PATCH 4/5] more benchmark cleanup and warmup phase --- .../builtins/bitcode/benchmark/dec.zig | 33 +++++++++++-------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/crates/compiler/builtins/bitcode/benchmark/dec.zig b/crates/compiler/builtins/bitcode/benchmark/dec.zig index ae119477f6b..996fb7458b1 100644 --- a/crates/compiler/builtins/bitcode/benchmark/dec.zig +++ b/crates/compiler/builtins/bitcode/benchmark/dec.zig @@ -26,6 +26,9 @@ pub fn main() !void { const n = 1000; + // Add/Sub are too fast and need a higher n. + const add_sub_n = 10000; + // This number are very close to 1 to avoid over and underflow. const f1 = 1.00123; const dec1 = RocDec.fromF64(f1).?; @@ -35,11 +38,11 @@ pub fn main() !void { const dec2 = RocDec.fromF64(f2).?; try stdout.print("Dec:\n", .{}); - try stdout.print("{} additions took ", .{n}); - const decAdd = try avg_runs(RocDec, n, RocDec.add, dec1); + try stdout.print("{} additions took ", .{add_sub_n}); + const decAdd = try avg_runs(RocDec, add_sub_n, RocDec.add, dec1); - try stdout.print("{} subtractions took ", .{n}); - const decSub = try avg_runs(RocDec, n, RocDec.sub, dec1); + try stdout.print("{} subtractions took ", .{add_sub_n}); + const decSub = try avg_runs(RocDec, add_sub_n, RocDec.sub, dec1); try stdout.print("{} multiplications took ", .{n}); const decMul = try avg_runs(RocDec, n, RocDec.mul, dec1); @@ -66,11 +69,11 @@ pub fn main() !void { const decAtan = try avg_runs(RocDec, n, atanDec, dec1); try stdout.print("\n\nF64:\n", .{}); - try stdout.print("{} additions took ", .{n}); - const f64Add = try avg_runs(f64, n, addF64, f1); + try stdout.print("{} additions took ", .{add_sub_n}); + const f64Add = try avg_runs(f64, add_sub_n, addF64, f1); - try stdout.print("{} subtractions took ", .{n}); - const f64Sub = try avg_runs(f64, n, subF64, f1); + try stdout.print("{} subtractions took ", .{add_sub_n}); + const f64Sub = try avg_runs(f64, add_sub_n, subF64, f1); try stdout.print("{} multiplications took ", .{n}); const f64Mul = try avg_runs(f64, n, mulF64, f1); @@ -112,20 +115,22 @@ pub fn main() !void { fn avg_runs(comptime T: type, comptime n: usize, comptime op: fn (T, T) T, v: T) !u64 { const stdout = std.io.getStdOut().writer(); + const warmups = 10000; const repeats = 10000; - var runs = [_]u64{0} ** repeats; + var runs = [_]u64{0} ** (warmups + repeats); var i: usize = 0; - while (i < repeats) : (i += 1) { + while (i < warmups + repeats) : (i += 1) { // Never inline run to ensure it doesn't optimize for the value of `v`. runs[i] = callWrapper(u64, .never_inline, run, .{ T, n, op, v }); } - std.sort.sort(u64, &runs, {}, comptime std.sort.asc(u64)); + var real_runs = runs[warmups..runs.len]; + std.sort.sort(u64, real_runs, {}, comptime std.sort.asc(u64)); - const median = runs[runs.len / 2]; - const highest = runs[runs.len - 1]; - const lowest = runs[0]; + const median = real_runs[real_runs.len / 2]; + const highest = real_runs[real_runs.len - 1]; + const lowest = real_runs[0]; try stdout.print("{}ns (lowest: {}ns, highest: {}ns)\n", .{ median, lowest, highest }); return median; From 5cfab36f75e666cd1d1bd096715b70da0b5fbe2e Mon Sep 17 00:00:00 2001 From: Brendan Hansknecht Date: Mon, 18 Sep 2023 11:07:37 -0700 Subject: [PATCH 5/5] update perf numbers --- crates/compiler/builtins/roc/Num.roc | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/crates/compiler/builtins/roc/Num.roc b/crates/compiler/builtins/roc/Num.roc index 890f14f49d6..72a63222272 100644 --- a/crates/compiler/builtins/roc/Num.roc +++ b/crates/compiler/builtins/roc/Num.roc @@ -514,12 +514,16 @@ F32 : Num (FloatingPoint Binary32) ## ## Here's a comparison of about how long [Dec] takes to perform a given operation compared to [F64], ## based on benchmarks on an [M1](https://en.wikipedia.org/wiki/Apple_M1) CPU: -## * [add] 0.75x -## * [sub] 0.75x -## * [mul] 4x -## * [div] 32x -## * [sin] 3x -## * [asin] 9x +## * [add] 0.6x +## * [sub] 0.6x +## * [mul] 15x +## * [div] 55x +## * [sin] 3.9x +## * [cos] 3.6x +## * [tan] 2.3x +## * [asin] 1.8x +## * [acos] 1.7x +## * [atan] 1.7x ## ## Keep in mind that arithmetic instructions are basically [the fastest thing a CPU does](http://norvig.com/21-days.html#answers), ## so (for example) a network request that takes 10 milliseconds to complete would go on this