From c98d136ca2670c4c1acc0529e66df3a14f4bc97b Mon Sep 17 00:00:00 2001
From: Brendan Hansknecht <brendan.hansknecht@gmail.com>
Date: Mon, 18 Sep 2023 09:46:22 -0700
Subject: [PATCH 1/5] ensure correct assembly generation without extra data
 movement

---
 .../builtins/bitcode/benchmark/dec.zig        | 53 +++++++++----------
 1 file changed, 26 insertions(+), 27 deletions(-)

diff --git a/crates/compiler/builtins/bitcode/benchmark/dec.zig b/crates/compiler/builtins/bitcode/benchmark/dec.zig
index 05d41ce8c89..c92ed6469c6 100644
--- a/crates/compiler/builtins/bitcode/benchmark/dec.zig
+++ b/crates/compiler/builtins/bitcode/benchmark/dec.zig
@@ -24,12 +24,12 @@ pub fn main() !void {
     try stdout.print("Warning: Timer seems to step in units of 41ns\n\n", .{});
     timer = try Timer.start();
 
-    const n = 10000;
+    const n = 1000;
 
     // This number are very close to 1 to avoid over and underflow.
     const str1 = "1.00123";
     const f1 = 1.00123;
-    const dec1 = RocDec.fromStr(RocStr.init(str1, 3)).?;
+    const dec1 = RocDec.fromStr(RocStr.init(str1, str1.len)).?;
 
     try stdout.print("Dec:\n", .{});
     try stdout.print("{} additions took ", .{n});
@@ -78,15 +78,16 @@ pub fn main() !void {
     try stdout.print("asin:           {d:0.2}\n", .{@intToFloat(f64, decAsin) / @intToFloat(f64, f64Asin)});
 }
 
-fn avg_runs(comptime T: type, comptime n: usize, op: fn (T, T) T, v: T) !u64 {
+fn avg_runs(comptime T: type, comptime n: usize, comptime op: fn (T, T) T, v: T) !u64 {
     const stdout = std.io.getStdOut().writer();
 
-    const repeats = 1000;
+    const repeats = 10000;
     var runs = [_]u64{0} ** repeats;
 
     var i: usize = 0;
     while (i < repeats) : (i += 1) {
-        runs[i] = run(T, n, op, v);
+        // Never inline run to ensure it doesn't optimize for the value of `v`.
+        runs[i] = callWrapper(u64, .never_inline, run, .{ T, n, op, v });
     }
 
     std.sort.sort(u64, &runs, {}, comptime std.sort.asc(u64));
@@ -99,43 +100,41 @@ fn avg_runs(comptime T: type, comptime n: usize, op: fn (T, T) T, v: T) !u64 {
     return median;
 }
 
-fn run(comptime T: type, comptime n: usize, op: fn (T, T) T, v: T) u64 {
+fn run(comptime T: type, comptime n: usize, comptime op: fn (T, T) T, v: T) u64 {
     var a = v;
     timer.reset();
 
     // Split into outer and inner loop to avoid breaking comptime.
-    comptime var outer = n / 500;
-    comptime var inner = std.math.min(n, 500);
+    const max_inline = 100;
+    comptime var outer = n / max_inline;
+    comptime var inner = std.math.min(n, max_inline);
     var i: usize = 0;
     while (i < outer) : (i += 1) {
         comptime var j = 0;
         inline while (j < inner) : (j += 1) {
-            a = op(a, v);
-
-            // Clobber a to avoid optimizations and removal of dead code.
-            asm volatile (""
-                :
-                : [a] "r,m" (&a),
-                : "memory"
-            );
+            a = callWrapper(T, .always_inline, op, .{ a, v });
         }
     }
-    comptime var rem = n % 500;
-    i = 0;
-    inline while (i < rem) : (i += 1) {
-        a = op(a, v);
-
-        // Clobber a to avoid optimizations and removal of dead code.
-        asm volatile (""
-            :
-            : [a] "r,m" (&a),
-            : "memory"
-        );
+    const rem = n % max_inline;
+    comptime var j = 0;
+    inline while (j < rem) : (j += 1) {
+        a = callWrapper(T, .always_inline, op, .{ a, v });
     }
 
+    // Clobber `a` to avoid removal as dead code.
+    asm volatile (""
+        :
+        : [a] "r,m" (&a),
+        : "memory"
+    );
     return timer.read();
 }
 
+// This is needed to work around a bug with using `@call` in loops.
+inline fn callWrapper(comptime T: type, call_modifier: anytype, comptime func: anytype, params: anytype) T {
+    return @call(.{ .modifier = call_modifier }, func, params);
+}
+
 fn addF64(x: f64, y: f64) f64 {
     return x + y;
 }

From a3ee58155cbadcfdf21d196616720dfa8f2b320c Mon Sep 17 00:00:00 2001
From: Brendan Hansknecht <brendan.hansknecht@gmail.com>
Date: Mon, 18 Sep 2023 10:02:39 -0700
Subject: [PATCH 2/5] add other trig functions to dec benchmark

---
 .../builtins/bitcode/benchmark/dec.zig        | 52 +++++++++++++++++++
 1 file changed, 52 insertions(+)

diff --git a/crates/compiler/builtins/bitcode/benchmark/dec.zig b/crates/compiler/builtins/bitcode/benchmark/dec.zig
index c92ed6469c6..92fa47c5da0 100644
--- a/crates/compiler/builtins/bitcode/benchmark/dec.zig
+++ b/crates/compiler/builtins/bitcode/benchmark/dec.zig
@@ -47,9 +47,21 @@ pub fn main() !void {
     try stdout.print("{} sin took ", .{n});
     const decSin = try avg_runs(RocDec, n, sinDec, dec1);
 
+    try stdout.print("{} cos took ", .{n});
+    const decCos = try avg_runs(RocDec, n, cosDec, dec1);
+
+    try stdout.print("{} tan took ", .{n});
+    const decTan = try avg_runs(RocDec, n, tanDec, dec1);
+
     try stdout.print("{} asin took ", .{n});
     const decAsin = try avg_runs(RocDec, n, asinDec, dec1);
 
+    try stdout.print("{} acos took ", .{n});
+    const decAcos = try avg_runs(RocDec, n, acosDec, dec1);
+
+    try stdout.print("{} atan took ", .{n});
+    const decAtan = try avg_runs(RocDec, n, atanDec, dec1);
+
     try stdout.print("\n\nF64:\n", .{});
     try stdout.print("{} additions took ", .{n});
     const f64Add = try avg_runs(f64, n, addF64, f1);
@@ -66,16 +78,32 @@ pub fn main() !void {
     try stdout.print("{} sin took ", .{n});
     const f64Sin = try avg_runs(f64, n, sinF64, f1);
 
+    try stdout.print("{} cos took ", .{n});
+    const f64Cos = try avg_runs(f64, n, cosF64, f1);
+
+    try stdout.print("{} tan took ", .{n});
+    const f64Tan = try avg_runs(f64, n, tanF64, f1);
+
     try stdout.print("{} asin took ", .{n});
     const f64Asin = try avg_runs(f64, n, asinF64, f1);
 
+    try stdout.print("{} acos took ", .{n});
+    const f64Acos = try avg_runs(f64, n, acosF64, f1);
+
+    try stdout.print("{} atan took ", .{n});
+    const f64Atan = try avg_runs(f64, n, atanF64, f1);
+
     try stdout.print("\n\nDec/F64:\n", .{});
     try stdout.print("addition:       {d:0.2}\n", .{@intToFloat(f64, decAdd) / @intToFloat(f64, f64Add)});
     try stdout.print("subtraction:    {d:0.2}\n", .{@intToFloat(f64, decSub) / @intToFloat(f64, f64Sub)});
     try stdout.print("multiplication: {d:0.2}\n", .{@intToFloat(f64, decMul) / @intToFloat(f64, f64Mul)});
     try stdout.print("division:       {d:0.2}\n", .{@intToFloat(f64, decDiv) / @intToFloat(f64, f64Div)});
     try stdout.print("sin:            {d:0.2}\n", .{@intToFloat(f64, decSin) / @intToFloat(f64, f64Sin)});
+    try stdout.print("cos:            {d:0.2}\n", .{@intToFloat(f64, decCos) / @intToFloat(f64, f64Cos)});
+    try stdout.print("tan:            {d:0.2}\n", .{@intToFloat(f64, decTan) / @intToFloat(f64, f64Tan)});
     try stdout.print("asin:           {d:0.2}\n", .{@intToFloat(f64, decAsin) / @intToFloat(f64, f64Asin)});
+    try stdout.print("acos:           {d:0.2}\n", .{@intToFloat(f64, decAcos) / @intToFloat(f64, f64Acos)});
+    try stdout.print("atan:           {d:0.2}\n", .{@intToFloat(f64, decAtan) / @intToFloat(f64, f64Atan)});
 }
 
 fn avg_runs(comptime T: type, comptime n: usize, comptime op: fn (T, T) T, v: T) !u64 {
@@ -150,13 +178,37 @@ fn divF64(x: f64, y: f64) f64 {
 fn sinF64(x: f64, _: f64) f64 {
     return std.math.sin(x);
 }
+fn cosF64(x: f64, _: f64) f64 {
+    return std.math.cos(x);
+}
+fn tanF64(x: f64, _: f64) f64 {
+    return std.math.tan(x);
+}
 fn asinF64(x: f64, _: f64) f64 {
     return std.math.asin(x);
 }
+fn acosF64(x: f64, _: f64) f64 {
+    return std.math.acos(x);
+}
+fn atanF64(x: f64, _: f64) f64 {
+    return std.math.atan(x);
+}
 
 fn sinDec(x: RocDec, _: RocDec) RocDec {
     return x.sin();
 }
+fn cosDec(x: RocDec, _: RocDec) RocDec {
+    return x.cos();
+}
+fn tanDec(x: RocDec, _: RocDec) RocDec {
+    return x.tan();
+}
 fn asinDec(x: RocDec, _: RocDec) RocDec {
     return x.asin();
 }
+fn acosDec(x: RocDec, _: RocDec) RocDec {
+    return x.acos();
+}
+fn atanDec(x: RocDec, _: RocDec) RocDec {
+    return x.atan();
+}

From aee54a44dcab6ff84eff0f86911a3759bc791b74 Mon Sep 17 00:00:00 2001
From: Brendan Hansknecht <brendan.hansknecht@gmail.com>
Date: Mon, 18 Sep 2023 10:34:59 -0700
Subject: [PATCH 3/5] fix asin and acos benchmark

---
 .../builtins/bitcode/benchmark/dec.zig        | 25 +++++++++++++------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/crates/compiler/builtins/bitcode/benchmark/dec.zig b/crates/compiler/builtins/bitcode/benchmark/dec.zig
index 92fa47c5da0..ae119477f6b 100644
--- a/crates/compiler/builtins/bitcode/benchmark/dec.zig
+++ b/crates/compiler/builtins/bitcode/benchmark/dec.zig
@@ -27,9 +27,12 @@ pub fn main() !void {
     const n = 1000;
 
     // This number are very close to 1 to avoid over and underflow.
-    const str1 = "1.00123";
     const f1 = 1.00123;
-    const dec1 = RocDec.fromStr(RocStr.init(str1, str1.len)).?;
+    const dec1 = RocDec.fromF64(f1).?;
+
+    // `asin` and `acos` only accept inputs in [-1, 1], so they are benchmarked with this smaller value instead.
+    const f2 = 0.00130000847;
+    const dec2 = RocDec.fromF64(f2).?;
 
     try stdout.print("Dec:\n", .{});
     try stdout.print("{} additions took ", .{n});
@@ -54,10 +57,10 @@ pub fn main() !void {
     const decTan = try avg_runs(RocDec, n, tanDec, dec1);
 
     try stdout.print("{} asin took ", .{n});
-    const decAsin = try avg_runs(RocDec, n, asinDec, dec1);
+    const decAsin = try avg_runs(RocDec, n, asinDec, dec2);
 
     try stdout.print("{} acos took ", .{n});
-    const decAcos = try avg_runs(RocDec, n, acosDec, dec1);
+    const decAcos = try avg_runs(RocDec, n, acosDec, dec2);
 
     try stdout.print("{} atan took ", .{n});
     const decAtan = try avg_runs(RocDec, n, atanDec, dec1);
@@ -85,10 +88,10 @@ pub fn main() !void {
     const f64Tan = try avg_runs(f64, n, tanF64, f1);
 
     try stdout.print("{} asin took ", .{n});
-    const f64Asin = try avg_runs(f64, n, asinF64, f1);
+    const f64Asin = try avg_runs(f64, n, asinF64, f2);
 
     try stdout.print("{} acos took ", .{n});
-    const f64Acos = try avg_runs(f64, n, acosF64, f1);
+    const f64Acos = try avg_runs(f64, n, acosF64, f2);
 
     try stdout.print("{} atan took ", .{n});
     const f64Atan = try avg_runs(f64, n, atanF64, f1);
@@ -187,8 +190,11 @@ fn tanF64(x: f64, _: f64) f64 {
 fn asinF64(x: f64, _: f64) f64 {
     return std.math.asin(x);
 }
+const pi_over_2 = std.math.pi / 2.0;
 fn acosF64(x: f64, _: f64) f64 {
-    return std.math.acos(x);
+    // Each result is fed back in as the next input; subtracting pi/2 keeps it inside acos's [-1, 1] domain.
+    // The perf should be essentially the same because subtraction is much faster than acos.
+    return std.math.acos(x) - pi_over_2;
 }
 fn atanF64(x: f64, _: f64) f64 {
     return std.math.atan(x);
@@ -206,8 +212,11 @@ fn tanDec(x: RocDec, _: RocDec) RocDec {
 fn asinDec(x: RocDec, _: RocDec) RocDec {
     return x.asin();
 }
+const pi_over_2_dec = RocDec.fromF64(pi_over_2).?;
 fn acosDec(x: RocDec, _: RocDec) RocDec {
-    return x.acos();
+    // Each result is fed back in as the next input; subtracting pi/2 keeps it inside acos's [-1, 1] domain.
+    // The perf should be essentially the same because subtraction is much faster than acos.
+    return x.acos().sub(pi_over_2_dec);
 }
 fn atanDec(x: RocDec, _: RocDec) RocDec {
     return x.atan();

From 9edd3eaca056fe838083c24d7d84437b6f728395 Mon Sep 17 00:00:00 2001
From: Brendan Hansknecht <brendan.hansknecht@gmail.com>
Date: Mon, 18 Sep 2023 11:02:49 -0700
Subject: [PATCH 4/5] more benchmark cleanup and warmup phase

---
 .../builtins/bitcode/benchmark/dec.zig        | 33 +++++++++++--------
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/crates/compiler/builtins/bitcode/benchmark/dec.zig b/crates/compiler/builtins/bitcode/benchmark/dec.zig
index ae119477f6b..996fb7458b1 100644
--- a/crates/compiler/builtins/bitcode/benchmark/dec.zig
+++ b/crates/compiler/builtins/bitcode/benchmark/dec.zig
@@ -26,6 +26,9 @@ pub fn main() !void {
 
     const n = 1000;
 
+    // Add/Sub are too fast and need a higher n.
+    const add_sub_n = 10000;
+
     // This number are very close to 1 to avoid over and underflow.
     const f1 = 1.00123;
     const dec1 = RocDec.fromF64(f1).?;
@@ -35,11 +38,11 @@ pub fn main() !void {
     const dec2 = RocDec.fromF64(f2).?;
 
     try stdout.print("Dec:\n", .{});
-    try stdout.print("{} additions took ", .{n});
-    const decAdd = try avg_runs(RocDec, n, RocDec.add, dec1);
+    try stdout.print("{} additions took ", .{add_sub_n});
+    const decAdd = try avg_runs(RocDec, add_sub_n, RocDec.add, dec1);
 
-    try stdout.print("{} subtractions took ", .{n});
-    const decSub = try avg_runs(RocDec, n, RocDec.sub, dec1);
+    try stdout.print("{} subtractions took ", .{add_sub_n});
+    const decSub = try avg_runs(RocDec, add_sub_n, RocDec.sub, dec1);
 
     try stdout.print("{} multiplications took ", .{n});
     const decMul = try avg_runs(RocDec, n, RocDec.mul, dec1);
@@ -66,11 +69,11 @@ pub fn main() !void {
     const decAtan = try avg_runs(RocDec, n, atanDec, dec1);
 
     try stdout.print("\n\nF64:\n", .{});
-    try stdout.print("{} additions took ", .{n});
-    const f64Add = try avg_runs(f64, n, addF64, f1);
+    try stdout.print("{} additions took ", .{add_sub_n});
+    const f64Add = try avg_runs(f64, add_sub_n, addF64, f1);
 
-    try stdout.print("{} subtractions took ", .{n});
-    const f64Sub = try avg_runs(f64, n, subF64, f1);
+    try stdout.print("{} subtractions took ", .{add_sub_n});
+    const f64Sub = try avg_runs(f64, add_sub_n, subF64, f1);
 
     try stdout.print("{} multiplications took ", .{n});
     const f64Mul = try avg_runs(f64, n, mulF64, f1);
@@ -112,20 +115,22 @@ pub fn main() !void {
 fn avg_runs(comptime T: type, comptime n: usize, comptime op: fn (T, T) T, v: T) !u64 {
     const stdout = std.io.getStdOut().writer();
 
+    const warmups = 10000;
     const repeats = 10000;
-    var runs = [_]u64{0} ** repeats;
+    var runs = [_]u64{0} ** (warmups + repeats);
 
     var i: usize = 0;
-    while (i < repeats) : (i += 1) {
+    while (i < warmups + repeats) : (i += 1) {
         // Never inline run to ensure it doesn't optimize for the value of `v`.
         runs[i] = callWrapper(u64, .never_inline, run, .{ T, n, op, v });
     }
 
-    std.sort.sort(u64, &runs, {}, comptime std.sort.asc(u64));
+    var real_runs = runs[warmups..runs.len];
+    std.sort.sort(u64, real_runs, {}, comptime std.sort.asc(u64));
 
-    const median = runs[runs.len / 2];
-    const highest = runs[runs.len - 1];
-    const lowest = runs[0];
+    const median = real_runs[real_runs.len / 2];
+    const highest = real_runs[real_runs.len - 1];
+    const lowest = real_runs[0];
 
     try stdout.print("{}ns (lowest: {}ns, highest: {}ns)\n", .{ median, lowest, highest });
     return median;

From 5cfab36f75e666cd1d1bd096715b70da0b5fbe2e Mon Sep 17 00:00:00 2001
From: Brendan Hansknecht <brendan.hansknecht@gmail.com>
Date: Mon, 18 Sep 2023 11:07:37 -0700
Subject: [PATCH 5/5] update perf numbers

---
 crates/compiler/builtins/roc/Num.roc | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/crates/compiler/builtins/roc/Num.roc b/crates/compiler/builtins/roc/Num.roc
index 890f14f49d6..72a63222272 100644
--- a/crates/compiler/builtins/roc/Num.roc
+++ b/crates/compiler/builtins/roc/Num.roc
@@ -514,12 +514,16 @@ F32 : Num (FloatingPoint Binary32)
 ##
 ## Here's a comparison of about how long [Dec] takes to perform a given operation compared to [F64],
 ## based on benchmarks on an [M1](https://en.wikipedia.org/wiki/Apple_M1) CPU:
-## * [add] 0.75x
-## * [sub] 0.75x
-## * [mul] 4x
-## * [div] 32x
-## * [sin] 3x
-## * [asin] 9x
+## * [add]  0.6x
+## * [sub]  0.6x
+## * [mul]  15x
+## * [div]  55x
+## * [sin]  3.9x
+## * [cos]  3.6x
+## * [tan]  2.3x
+## * [asin] 1.8x
+## * [acos] 1.7x
+## * [atan] 1.7x
 ##
 ## Keep in mind that arithmetic instructions are basically [the fastest thing a CPU does](http://norvig.com/21-days.html#answers),
 ## so (for example) a network request that takes 10 milliseconds to complete would go on this