diff --git a/src/float/conv.rs b/src/float/conv.rs
index 790c0ab9..fe996bbf 100644
--- a/src/float/conv.rs
+++ b/src/float/conv.rs
@@ -5,6 +5,7 @@
///
/// The algorithm is explained here: <https://blog.m-ou.se/floats/>
mod int_to_float {
+ #[inline(always)]
pub fn u32_to_f32_bits(i: u32) -> u32 {
if i == 0 {
return 0;
@@ -17,6 +18,7 @@ mod int_to_float {
(e << 23) + m // + not |, so the mantissa can overflow into the exponent.
}
+ #[inline(always)]
pub fn u32_to_f64_bits(i: u32) -> u64 {
if i == 0 {
return 0;
@@ -27,6 +29,7 @@ mod int_to_float {
(e << 52) + m // Bit 53 of m will overflow into e.
}
+ #[inline(always)]
pub fn u64_to_f32_bits(i: u64) -> u32 {
let n = i.leading_zeros();
let y = i.wrapping_shl(n);
@@ -37,6 +40,7 @@ mod int_to_float {
(e << 23) + m // + not |, so the mantissa can overflow into the exponent.
}
+ #[inline(always)]
pub fn u64_to_f64_bits(i: u64) -> u64 {
if i == 0 {
return 0;
@@ -49,6 +53,7 @@ mod int_to_float {
(e << 52) + m // + not |, so the mantissa can overflow into the exponent.
}
+ #[inline(always)]
pub fn u128_to_f32_bits(i: u128) -> u32 {
let n = i.leading_zeros();
let y = i.wrapping_shl(n);
@@ -59,6 +64,7 @@ mod int_to_float {
(e << 23) + m // + not |, so the mantissa can overflow into the exponent.
}
+ #[inline(always)]
pub fn u128_to_f64_bits(i: u128) -> u64 {
let n = i.leading_zeros();
let y = i.wrapping_shl(n);
diff --git a/src/float/mod.rs b/src/float/mod.rs
index fdbe9dde..20a70b0f 100644
--- a/src/float/mod.rs
+++ b/src/float/mod.rs
@@ -120,12 +120,15 @@ macro_rules! float_impl {
const IMPLICIT_BIT: Self::Int = 1 << Self::SIGNIFICAND_BITS;
const EXPONENT_MASK: Self::Int = !(Self::SIGN_MASK | Self::SIGNIFICAND_MASK);
+ #[inline(always)]
fn repr(self) -> Self::Int {
self.to_bits()
}
+ #[inline(always)]
fn signed_repr(self) -> Self::SignedInt {
self.to_bits() as Self::SignedInt
}
+ #[inline(always)]
fn eq_repr(self, rhs: Self) -> bool {
if self.is_nan() && rhs.is_nan() {
true
@@ -133,21 +136,27 @@ macro_rules! float_impl {
self.repr() == rhs.repr()
}
}
+ #[inline(always)]
fn sign(self) -> bool {
self.signed_repr() < Self::SignedInt::ZERO
}
+ #[inline(always)]
fn exp(self) -> Self::ExpInt {
((self.to_bits() & Self::EXPONENT_MASK) >> Self::SIGNIFICAND_BITS) as Self::ExpInt
}
+ #[inline(always)]
fn frac(self) -> Self::Int {
self.to_bits() & Self::SIGNIFICAND_MASK
}
+ #[inline(always)]
fn imp_frac(self) -> Self::Int {
self.frac() | Self::IMPLICIT_BIT
}
+ #[inline(always)]
fn from_repr(a: Self::Int) -> Self {
Self::from_bits(a)
}
+ #[inline(always)]
fn from_parts(sign: bool, exponent: Self::Int, significand: Self::Int) -> Self {
Self::from_repr(
((sign as Self::Int) << (Self::BITS - 1))
@@ -155,6 +164,7 @@ macro_rules! float_impl {
| (significand & Self::SIGNIFICAND_MASK),
)
}
+ #[inline(always)]
fn normalize(significand: Self::Int) -> (i32, Self::Int) {
let shift = significand
.leading_zeros()
@@ -164,6 +174,7 @@ macro_rules! float_impl {
significand << shift as Self::Int,
)
}
+ #[inline(always)]
fn is_subnormal(self) -> bool {
(self.repr() & Self::EXPONENT_MASK) == Self::Int::ZERO
}
diff --git a/src/int/mod.rs b/src/int/mod.rs
index 509f9fda..ee1742b9 100644
--- a/src/int/mod.rs
+++ b/src/int/mod.rs
@@ -151,50 +151,62 @@ macro_rules! int_impl_common {
}
};
+ #[inline(always)]
fn from_bool(b: bool) -> Self {
b as $ty
}
+ #[inline(always)]
fn logical_shr(self, other: u32) -> Self {
Self::from_unsigned(self.unsigned().wrapping_shr(other))
}
+ #[inline(always)]
fn is_zero(self) -> bool {
self == Self::ZERO
}
+ #[inline(always)]
fn wrapping_neg(self) -> Self {
<$ty>::wrapping_neg(self)
}
+ #[inline(always)]
fn wrapping_add(self, other: Self) -> Self {
<$ty>::wrapping_add(self, other)
}
+ #[inline(always)]
fn wrapping_mul(self, other: Self) -> Self {
<$ty>::wrapping_mul(self, other)
}
+ #[inline(always)]
fn wrapping_sub(self, other: Self) -> Self {
<$ty>::wrapping_sub(self, other)
}
+ #[inline(always)]
fn wrapping_shl(self, other: u32) -> Self {
<$ty>::wrapping_shl(self, other)
}
+ #[inline(always)]
fn wrapping_shr(self, other: u32) -> Self {
<$ty>::wrapping_shr(self, other)
}
+ #[inline(always)]
fn rotate_left(self, other: u32) -> Self {
<$ty>::rotate_left(self, other)
}
+ #[inline(always)]
fn overflowing_add(self, other: Self) -> (Self, bool) {
<$ty>::overflowing_add(self, other)
}
+ #[inline(always)]
fn leading_zeros(self) -> u32 {
<$ty>::leading_zeros(self)
}
diff --git a/src/int/specialized_div_rem/asymmetric.rs b/src/int/specialized_div_rem/asymmetric.rs
index 56ce188a..9398daad 100644
--- a/src/int/specialized_div_rem/asymmetric.rs
+++ b/src/int/specialized_div_rem/asymmetric.rs
@@ -14,9 +14,13 @@ macro_rules! impl_asymmetric {
$uH:ident, // unsigned integer with half the bit width of $uX
$uX:ident, // unsigned integer with half the bit width of $uD
$uD:ident // unsigned integer type for the inputs and outputs of `$fn`
+ $(, $fun_attr:meta)* // attributes for the function
) => {
/// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
/// tuple.
+ $(
+ #[$fun_attr]
+ )*
pub fn $fn(duo: $uD, div: $uD) -> ($uD, $uD) {
let n: u32 = $n_h * 2;
diff --git a/src/int/specialized_div_rem/delegate.rs b/src/int/specialized_div_rem/delegate.rs
index 330c6e4f..5e82faa4 100644
--- a/src/int/specialized_div_rem/delegate.rs
+++ b/src/int/specialized_div_rem/delegate.rs
@@ -14,9 +14,13 @@ macro_rules! impl_delegate {
$uX:ident, // unsigned integer with half the bit width of $uD.
$uD:ident, // unsigned integer type for the inputs and outputs of `$fn`
$iD:ident // signed integer type with the same bitwidth as `$uD`
+ $(, $fun_attr:meta)* // attributes for the function
) => {
/// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
/// tuple.
+ $(
+ #[$fun_attr]
+ )*
pub fn $fn(duo: $uD, div: $uD) -> ($uD, $uD) {
// The two possibility algorithm, undersubtracting long division algorithm, or any kind
// of reciprocal based algorithm will not be fastest, because they involve large
diff --git a/src/int/specialized_div_rem/mod.rs b/src/int/specialized_div_rem/mod.rs
index 760f5f5b..8c8ac15b 100644
--- a/src/int/specialized_div_rem/mod.rs
+++ b/src/int/specialized_div_rem/mod.rs
@@ -110,7 +110,8 @@ impl_normalization_shift!(
32,
u32,
i32,
- allow(dead_code)
+ allow(dead_code),
+ inline(always)
);
impl_normalization_shift!(
u64_normalization_shift,
@@ -118,7 +119,8 @@ impl_normalization_shift!(
64,
u64,
i64,
- allow(dead_code)
+ allow(dead_code),
+ inline(always)
);
/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
@@ -149,7 +151,8 @@ impl_trifecta!(
32,
u32,
u64,
- u128
+ u128,
+ inline(always)
);
// If the pointer width less than 64, then the target architecture almost certainly does not have
@@ -168,7 +171,8 @@ impl_delegate!(
u32,
u64,
u128,
- i128
+ i128,
+ inline(always)
);
/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
@@ -209,7 +213,8 @@ impl_asymmetric!(
32,
u32,
u64,
- u128
+ u128,
+ inline(always)
);
/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
@@ -255,7 +260,8 @@ impl_binary_long!(
u64_normalization_shift,
64,
u64,
- i64
+ i64,
+ inline(always)
);
/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
@@ -296,7 +302,8 @@ impl_asymmetric!(
16,
u16,
u32,
- u64
+ u64,
+ inline(always)
);
// 32 bits is the smallest division used by `compiler-builtins`, so we end with binary long division
@@ -307,5 +314,6 @@ impl_binary_long!(
32,
u32,
i32,
- allow(dead_code)
+ allow(dead_code),
+ inline(always)
);
diff --git a/src/int/specialized_div_rem/trifecta.rs b/src/int/specialized_div_rem/trifecta.rs
index 7e104053..ecb0bcd7 100644
--- a/src/int/specialized_div_rem/trifecta.rs
+++ b/src/int/specialized_div_rem/trifecta.rs
@@ -12,9 +12,13 @@ macro_rules! impl_trifecta {
$uH:ident, // unsigned integer with half the bit width of $uX
$uX:ident, // unsigned integer with half the bit width of $uD
$uD:ident // unsigned integer type for the inputs and outputs of `$unsigned_name`
+ $(, $fun_attr:meta)* // attributes for the function
) => {
/// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
/// tuple.
+ $(
+ #[$fun_attr]
+ )*
pub fn $fn(duo: $uD, div: $uD) -> ($uD, $uD) {
// This is called the trifecta algorithm because it uses three main algorithms: short
// division for small divisors, the two possibility algorithm for large divisors, and an
diff --git a/src/mem/x86_64.rs b/src/mem/x86_64.rs
index 40b67093..77a45022 100644
--- a/src/mem/x86_64.rs
+++ b/src/mem/x86_64.rs
@@ -304,6 +304,7 @@ pub unsafe fn c_string_length(mut s: *const core::ffi::c_char) -> usize {
}
/// Determine optimal parameters for a `rep` instruction.
+#[inline(always)]
fn rep_param(dest: *mut u8, mut count: usize) -> (usize, usize, usize) {
// Unaligned writes are still slow on modern processors, so align the destination address.
let pre_byte_count = ((8 - (dest as usize & 0b111)) & 0b111).min(count);