diff --git a/src/float/conv.rs b/src/float/conv.rs
index 790c0ab9..fe996bbf 100644
--- a/src/float/conv.rs
+++ b/src/float/conv.rs
@@ -5,6 +5,7 @@
///
/// The algorithm is explained here: <https://blog.m-ou.se/floats/>
mod int_to_float {
+ #[inline(always)]
pub fn u32_to_f32_bits(i: u32) -> u32 {
if i == 0 {
return 0;
@@ -17,6 +18,7 @@ mod int_to_float {
(e << 23) + m // + not |, so the mantissa can overflow into the exponent.
}
+ #[inline(always)]
pub fn u32_to_f64_bits(i: u32) -> u64 {
if i == 0 {
return 0;
@@ -27,6 +29,7 @@ mod int_to_float {
(e << 52) + m // Bit 53 of m will overflow into e.
}
+ #[inline(always)]
pub fn u64_to_f32_bits(i: u64) -> u32 {
let n = i.leading_zeros();
let y = i.wrapping_shl(n);
@@ -37,6 +40,7 @@ mod int_to_float {
(e << 23) + m // + not |, so the mantissa can overflow into the exponent.
}
+ #[inline(always)]
pub fn u64_to_f64_bits(i: u64) -> u64 {
if i == 0 {
return 0;
@@ -49,6 +53,7 @@ mod int_to_float {
(e << 52) + m // + not |, so the mantissa can overflow into the exponent.
}
+ #[inline(always)]
pub fn u128_to_f32_bits(i: u128) -> u32 {
let n = i.leading_zeros();
let y = i.wrapping_shl(n);
@@ -59,6 +64,7 @@ mod int_to_float {
(e << 23) + m // + not |, so the mantissa can overflow into the exponent.
}
+ #[inline(always)]
pub fn u128_to_f64_bits(i: u128) -> u64 {
let n = i.leading_zeros();
let y = i.wrapping_shl(n);
diff --git a/src/float/mod.rs b/src/float/mod.rs
index fdbe9dde..20a70b0f 100644
--- a/src/float/mod.rs
+++ b/src/float/mod.rs
@@ -120,12 +120,15 @@ macro_rules! float_impl {
const IMPLICIT_BIT: Self::Int = 1 << Self::SIGNIFICAND_BITS;
const EXPONENT_MASK: Self::Int = !(Self::SIGN_MASK | Self::SIGNIFICAND_MASK);
+ #[inline(always)]
fn repr(self) -> Self::Int {
self.to_bits()
}
+ #[inline(always)]
fn signed_repr(self) -> Self::SignedInt {
self.to_bits() as Self::SignedInt
}
+ #[inline(always)]
fn eq_repr(self, rhs: Self) -> bool {
if self.is_nan() && rhs.is_nan() {
true
@@ -133,21 +136,27 @@ macro_rules! float_impl {
self.repr() == rhs.repr()
}
}
+ #[inline(always)]
fn sign(self) -> bool {
self.signed_repr() < Self::SignedInt::ZERO
}
+ #[inline(always)]
fn exp(self) -> Self::ExpInt {
((self.to_bits() & Self::EXPONENT_MASK) >> Self::SIGNIFICAND_BITS) as Self::ExpInt
}
+ #[inline(always)]
fn frac(self) -> Self::Int {
self.to_bits() & Self::SIGNIFICAND_MASK
}
+ #[inline(always)]
fn imp_frac(self) -> Self::Int {
self.frac() | Self::IMPLICIT_BIT
}
+ #[inline(always)]
fn from_repr(a: Self::Int) -> Self {
Self::from_bits(a)
}
+ #[inline(always)]
fn from_parts(sign: bool, exponent: Self::Int, significand: Self::Int) -> Self {
Self::from_repr(
((sign as Self::Int) << (Self::BITS - 1))
@@ -155,6 +164,7 @@ macro_rules! float_impl {
| (significand & Self::SIGNIFICAND_MASK),
)
}
+ #[inline(always)]
fn normalize(significand: Self::Int) -> (i32, Self::Int) {
let shift = significand
.leading_zeros()
@@ -164,6 +174,7 @@ macro_rules! float_impl {
significand << shift as Self::Int,
)
}
+ #[inline(always)]
fn is_subnormal(self) -> bool {
(self.repr() & Self::EXPONENT_MASK) == Self::Int::ZERO
}
diff --git a/src/int/mod.rs b/src/int/mod.rs
index 509f9fda..ee1742b9 100644
--- a/src/int/mod.rs
+++ b/src/int/mod.rs
@@ -151,50 +151,62 @@ macro_rules! int_impl_common {
}
};
+ #[inline(always)]
fn from_bool(b: bool) -> Self {
b as $ty
}
+ #[inline(always)]
fn logical_shr(self, other: u32) -> Self {
Self::from_unsigned(self.unsigned().wrapping_shr(other))
}
+ #[inline(always)]
fn is_zero(self) -> bool {
self == Self::ZERO
}
+ #[inline(always)]
fn wrapping_neg(self) -> Self {
<$ty>::wrapping_neg(self)
}
+ #[inline(always)]
fn wrapping_add(self, other: Self) -> Self {
<$ty>::wrapping_add(self, other)
}
+ #[inline(always)]
fn wrapping_mul(self, other: Self) -> Self {
<$ty>::wrapping_mul(self, other)
}
+ #[inline(always)]
fn wrapping_sub(self, other: Self) -> Self {
<$ty>::wrapping_sub(self, other)
}
+ #[inline(always)]
fn wrapping_shl(self, other: u32) -> Self {
<$ty>::wrapping_shl(self, other)
}
+ #[inline(always)]
fn wrapping_shr(self, other: u32) -> Self {
<$ty>::wrapping_shr(self, other)
}
+ #[inline(always)]
fn rotate_left(self, other: u32) -> Self {
<$ty>::rotate_left(self, other)
}
+ #[inline(always)]
fn overflowing_add(self, other: Self) -> (Self, bool) {
<$ty>::overflowing_add(self, other)
}
+ #[inline(always)]
fn leading_zeros(self) -> u32 {
<$ty>::leading_zeros(self)
}
diff --git a/src/int/specialized_div_rem/asymmetric.rs b/src/int/specialized_div_rem/asymmetric.rs
index 56ce188a..9398daad 100644
--- a/src/int/specialized_div_rem/asymmetric.rs
+++ b/src/int/specialized_div_rem/asymmetric.rs
@@ -14,9 +14,13 @@ macro_rules! impl_asymmetric {
$uH:ident, // unsigned integer with half the bit width of $uX
$uX:ident, // unsigned integer with half the bit width of $uD
$uD:ident // unsigned integer type for the inputs and outputs of `$fn`
+ $(, $fun_attr:meta)* // attributes for the function
) => {
/// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
/// tuple.
+ $(
+ #[$fun_attr]
+ )*
pub fn $fn(duo: $uD, div: $uD) -> ($uD, $uD) {
let n: u32 = $n_h * 2;
diff --git a/src/int/specialized_div_rem/delegate.rs b/src/int/specialized_div_rem/delegate.rs
index 330c6e4f..5e82faa4 100644
--- a/src/int/specialized_div_rem/delegate.rs
+++ b/src/int/specialized_div_rem/delegate.rs
@@ -14,9 +14,13 @@ macro_rules! impl_delegate {
$uX:ident, // unsigned integer with half the bit width of $uD.
$uD:ident, // unsigned integer type for the inputs and outputs of `$fn`
$iD:ident // signed integer type with the same bitwidth as `$uD`
+ $(, $fun_attr:meta)* // attributes for the function
) => {
/// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
/// tuple.
+ $(
+ #[$fun_attr]
+ )*
pub fn $fn(duo: $uD, div: $uD) -> ($uD, $uD) {
// The two possibility algorithm, undersubtracting long division algorithm, or any kind
// of reciprocal based algorithm will not be fastest, because they involve large
diff --git a/src/int/specialized_div_rem/mod.rs b/src/int/specialized_div_rem/mod.rs
index 760f5f5b..8c8ac15b 100644
--- a/src/int/specialized_div_rem/mod.rs
+++ b/src/int/specialized_div_rem/mod.rs
@@ -110,7 +110,8 @@ impl_normalization_shift!(
32,
u32,
i32,
- allow(dead_code)
+ allow(dead_code),
+ inline(always)
);
impl_normalization_shift!(
u64_normalization_shift,
@@ -118,7 +119,8 @@ impl_normalization_shift!(
64,
u64,
i64,
- allow(dead_code)
+ allow(dead_code),
+ inline(always)
);
/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
@@ -149,7 +151,8 @@ impl_trifecta!(
32,
u32,
u64,
- u128
+ u128,
+ inline(always)
);
// If the pointer width less than 64, then the target architecture almost certainly does not have
@@ -168,7 +171,8 @@ impl_delegate!(
u32,
u64,
u128,
- i128
+ i128,
+ inline(always)
);
/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
@@ -209,7 +213,8 @@ impl_asymmetric!(
32,
u32,
u64,
- u128
+ u128,
+ inline(always)
);
/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
@@ -255,7 +260,8 @@ impl_binary_long!(
u64_normalization_shift,
64,
u64,
- i64
+ i64,
+ inline(always)
);
/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
@@ -296,7 +302,8 @@ impl_asymmetric!(
16,
u16,
u32,
- u64
+ u64,
+ inline(always)
);
// 32 bits is the smallest division used by `compiler-builtins`, so we end with binary long division
@@ -307,5 +314,6 @@ impl_binary_long!(
32,
u32,
i32,
- allow(dead_code)
+ allow(dead_code),
+ inline(always)
);
diff --git a/src/int/specialized_div_rem/trifecta.rs b/src/int/specialized_div_rem/trifecta.rs
index 7e104053..ecb0bcd7 100644
--- a/src/int/specialized_div_rem/trifecta.rs
+++ b/src/int/specialized_div_rem/trifecta.rs
@@ -12,9 +12,13 @@ macro_rules! impl_trifecta {
$uH:ident, // unsigned integer with half the bit width of $uX
$uX:ident, // unsigned integer with half the bit width of $uD
$uD:ident // unsigned integer type for the inputs and outputs of `$unsigned_name`
+ $(, $fun_attr:meta)* // attributes for the function
) => {
/// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
/// tuple.
+ $(
+ #[$fun_attr]
+ )*
pub fn $fn(duo: $uD, div: $uD) -> ($uD, $uD) {
// This is called the trifecta algorithm because it uses three main algorithms: short
// division for small divisors, the two possibility algorithm for large divisors, and an
diff --git a/src/mem/x86_64.rs b/src/mem/x86_64.rs
index 40b67093..77a45022 100644
--- a/src/mem/x86_64.rs
+++ b/src/mem/x86_64.rs
@@ -304,6 +304,7 @@ pub unsafe fn c_string_length(mut s: *const core::ffi::c_char) -> usize {
}
/// Determine optimal parameters for a `rep` instruction.
+#[inline(always)]
fn rep_param(dest: *mut u8, mut count: usize) -> (usize, usize, usize) {
// Unaligned writes are still slow on modern processors, so align the destination address.
let pre_byte_count = ((8 - (dest as usize & 0b111)) & 0b111).min(count);