diff --git a/src/float/conv.rs b/src/float/conv.rs
index 790c0ab9..fe996bbf 100644
--- a/src/float/conv.rs
+++ b/src/float/conv.rs
@@ -5,6 +5,7 @@
 ///
 /// The algorithm is explained here: <https://blog.m-ou.se/floats/>
 mod int_to_float {
+    #[inline(always)]
     pub fn u32_to_f32_bits(i: u32) -> u32 {
         if i == 0 {
             return 0;
@@ -17,6 +18,7 @@ mod int_to_float {
         (e << 23) + m // + not |, so the mantissa can overflow into the exponent.
     }
 
+    #[inline(always)]
     pub fn u32_to_f64_bits(i: u32) -> u64 {
         if i == 0 {
             return 0;
@@ -27,6 +29,7 @@ mod int_to_float {
         (e << 52) + m // Bit 53 of m will overflow into e.
     }
 
+    #[inline(always)]
     pub fn u64_to_f32_bits(i: u64) -> u32 {
         let n = i.leading_zeros();
         let y = i.wrapping_shl(n);
@@ -37,6 +40,7 @@ mod int_to_float {
         (e << 23) + m // + not |, so the mantissa can overflow into the exponent.
     }
 
+    #[inline(always)]
     pub fn u64_to_f64_bits(i: u64) -> u64 {
         if i == 0 {
             return 0;
@@ -49,6 +53,7 @@ mod int_to_float {
         (e << 52) + m // + not |, so the mantissa can overflow into the exponent.
     }
 
+    #[inline(always)]
     pub fn u128_to_f32_bits(i: u128) -> u32 {
         let n = i.leading_zeros();
         let y = i.wrapping_shl(n);
@@ -59,6 +64,7 @@ mod int_to_float {
         (e << 23) + m // + not |, so the mantissa can overflow into the exponent.
     }
 
+    #[inline(always)]
     pub fn u128_to_f64_bits(i: u128) -> u64 {
         let n = i.leading_zeros();
         let y = i.wrapping_shl(n);
diff --git a/src/float/mod.rs b/src/float/mod.rs
index fdbe9dde..20a70b0f 100644
--- a/src/float/mod.rs
+++ b/src/float/mod.rs
@@ -120,12 +120,15 @@ macro_rules! float_impl {
             const IMPLICIT_BIT: Self::Int = 1 << Self::SIGNIFICAND_BITS;
             const EXPONENT_MASK: Self::Int = !(Self::SIGN_MASK | Self::SIGNIFICAND_MASK);
 
+            #[inline(always)]
             fn repr(self) -> Self::Int {
                 self.to_bits()
             }
+            #[inline(always)]
             fn signed_repr(self) -> Self::SignedInt {
                 self.to_bits() as Self::SignedInt
             }
+            #[inline(always)]
             fn eq_repr(self, rhs: Self) -> bool {
                 if self.is_nan() && rhs.is_nan() {
                     true
@@ -133,21 +136,27 @@ macro_rules! float_impl {
                     self.repr() == rhs.repr()
                 }
             }
+            #[inline(always)]
             fn sign(self) -> bool {
                 self.signed_repr() < Self::SignedInt::ZERO
             }
+            #[inline(always)]
             fn exp(self) -> Self::ExpInt {
                 ((self.to_bits() & Self::EXPONENT_MASK) >> Self::SIGNIFICAND_BITS) as Self::ExpInt
             }
+            #[inline(always)]
             fn frac(self) -> Self::Int {
                 self.to_bits() & Self::SIGNIFICAND_MASK
             }
+            #[inline(always)]
             fn imp_frac(self) -> Self::Int {
                 self.frac() | Self::IMPLICIT_BIT
             }
+            #[inline(always)]
             fn from_repr(a: Self::Int) -> Self {
                 Self::from_bits(a)
             }
+            #[inline(always)]
             fn from_parts(sign: bool, exponent: Self::Int, significand: Self::Int) -> Self {
                 Self::from_repr(
                     ((sign as Self::Int) << (Self::BITS - 1))
@@ -155,6 +164,7 @@ macro_rules! float_impl {
                         | (significand & Self::SIGNIFICAND_MASK),
                 )
             }
+            #[inline(always)]
             fn normalize(significand: Self::Int) -> (i32, Self::Int) {
                 let shift = significand
                     .leading_zeros()
@@ -164,6 +174,7 @@ macro_rules! float_impl {
                     significand << shift as Self::Int,
                 )
             }
+            #[inline(always)]
             fn is_subnormal(self) -> bool {
                 (self.repr() & Self::EXPONENT_MASK) == Self::Int::ZERO
             }
diff --git a/src/int/mod.rs b/src/int/mod.rs
index 509f9fda..ee1742b9 100644
--- a/src/int/mod.rs
+++ b/src/int/mod.rs
@@ -151,50 +151,62 @@ macro_rules! int_impl_common {
             }
         };
 
+        #[inline(always)]
         fn from_bool(b: bool) -> Self {
             b as $ty
         }
 
+        #[inline(always)]
         fn logical_shr(self, other: u32) -> Self {
             Self::from_unsigned(self.unsigned().wrapping_shr(other))
         }
 
+        #[inline(always)]
        fn is_zero(self) -> bool {
            self == Self::ZERO
        }
 
+        #[inline(always)]
        fn wrapping_neg(self) -> Self {
            <$ty>::wrapping_neg(self)
        }
 
+        #[inline(always)]
        fn wrapping_add(self, other: Self) -> Self {
            <$ty>::wrapping_add(self, other)
        }
 
+        #[inline(always)]
        fn wrapping_mul(self, other: Self) -> Self {
            <$ty>::wrapping_mul(self, other)
        }
 
+        #[inline(always)]
        fn wrapping_sub(self, other: Self) -> Self {
            <$ty>::wrapping_sub(self, other)
        }
 
+        #[inline(always)]
        fn wrapping_shl(self, other: u32) -> Self {
            <$ty>::wrapping_shl(self, other)
        }
 
+        #[inline(always)]
        fn wrapping_shr(self, other: u32) -> Self {
            <$ty>::wrapping_shr(self, other)
        }
 
+        #[inline(always)]
        fn rotate_left(self, other: u32) -> Self {
            <$ty>::rotate_left(self, other)
        }
 
+        #[inline(always)]
        fn overflowing_add(self, other: Self) -> (Self, bool) {
            <$ty>::overflowing_add(self, other)
        }
 
+        #[inline(always)]
        fn leading_zeros(self) -> u32 {
            <$ty>::leading_zeros(self)
        }
diff --git a/src/int/specialized_div_rem/asymmetric.rs b/src/int/specialized_div_rem/asymmetric.rs
index 56ce188a..9398daad 100644
--- a/src/int/specialized_div_rem/asymmetric.rs
+++ b/src/int/specialized_div_rem/asymmetric.rs
@@ -14,9 +14,13 @@ macro_rules! impl_asymmetric {
         $uH:ident, // unsigned integer with half the bit width of $uX
         $uX:ident, // unsigned integer with half the bit width of $uD
         $uD:ident // unsigned integer type for the inputs and outputs of `$fn`
+        $(, $fun_attr:meta)* // attributes for the function
     ) => {
         /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
         /// tuple.
+        $(
+            #[$fun_attr]
+        )*
         pub fn $fn(duo: $uD, div: $uD) -> ($uD, $uD) {
             let n: u32 = $n_h * 2;
 
diff --git a/src/int/specialized_div_rem/delegate.rs b/src/int/specialized_div_rem/delegate.rs
index 330c6e4f..5e82faa4 100644
--- a/src/int/specialized_div_rem/delegate.rs
+++ b/src/int/specialized_div_rem/delegate.rs
@@ -14,9 +14,13 @@ macro_rules! impl_delegate {
         $uX:ident, // unsigned integer with half the bit width of $uD.
         $uD:ident, // unsigned integer type for the inputs and outputs of `$fn`
         $iD:ident // signed integer type with the same bitwidth as `$uD`
+        $(, $fun_attr:meta)* // attributes for the function
     ) => {
         /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
         /// tuple.
+        $(
+            #[$fun_attr]
+        )*
         pub fn $fn(duo: $uD, div: $uD) -> ($uD, $uD) {
             // The two possibility algorithm, undersubtracting long division algorithm, or any kind
             // of reciprocal based algorithm will not be fastest, because they involve large
diff --git a/src/int/specialized_div_rem/mod.rs b/src/int/specialized_div_rem/mod.rs
index 760f5f5b..8c8ac15b 100644
--- a/src/int/specialized_div_rem/mod.rs
+++ b/src/int/specialized_div_rem/mod.rs
@@ -110,7 +110,8 @@ impl_normalization_shift!(
     32,
     u32,
     i32,
-    allow(dead_code)
+    allow(dead_code),
+    inline(always)
 );
 impl_normalization_shift!(
     u64_normalization_shift,
@@ -118,7 +119,8 @@ impl_normalization_shift!(
     64,
     u64,
     i64,
-    allow(dead_code)
+    allow(dead_code),
+    inline(always)
 );
 
 /// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
@@ -149,7 +151,8 @@ impl_trifecta!(
     32,
     u32,
     u64,
-    u128
+    u128,
+    inline(always)
 );
 
 // If the pointer width less than 64, then the target architecture almost certainly does not have
@@ -168,7 +171,8 @@ impl_delegate!(
     u32,
     u64,
     u128,
-    i128
+    i128,
+    inline(always)
 );
 
 /// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
@@ -209,7 +213,8 @@ impl_asymmetric!(
     32,
     u32,
     u64,
-    u128
+    u128,
+    inline(always)
 );
 
 /// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
@@ -255,7 +260,8 @@ impl_binary_long!(
     u64_normalization_shift,
     64,
     u64,
-    i64
+    i64,
+    inline(always)
 );
 
 /// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
@@ -296,7 +302,8 @@ impl_asymmetric!(
     16,
     u16,
     u32,
-    u64
+    u64,
+    inline(always)
 );
 
 // 32 bits is the smallest division used by `compiler-builtins`, so we end with binary long division
@@ -307,5 +314,6 @@ impl_binary_long!(
     32,
     u32,
     i32,
-    allow(dead_code)
+    allow(dead_code),
+    inline(always)
 );
diff --git a/src/int/specialized_div_rem/trifecta.rs b/src/int/specialized_div_rem/trifecta.rs
index 7e104053..ecb0bcd7 100644
--- a/src/int/specialized_div_rem/trifecta.rs
+++ b/src/int/specialized_div_rem/trifecta.rs
@@ -12,9 +12,13 @@ macro_rules! impl_trifecta {
         $uH:ident, // unsigned integer with half the bit width of $uX
         $uX:ident, // unsigned integer with half the bit width of $uD
         $uD:ident // unsigned integer type for the inputs and outputs of `$unsigned_name`
+        $(, $fun_attr:meta)* // attributes for the function
     ) => {
         /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
         /// tuple.
+        $(
+            #[$fun_attr]
+        )*
         pub fn $fn(duo: $uD, div: $uD) -> ($uD, $uD) {
             // This is called the trifecta algorithm because it uses three main algorithms: short
             // division for small divisors, the two possibility algorithm for large divisors, and an
diff --git a/src/mem/x86_64.rs b/src/mem/x86_64.rs
index 40b67093..77a45022 100644
--- a/src/mem/x86_64.rs
+++ b/src/mem/x86_64.rs
@@ -304,6 +304,7 @@ pub unsafe fn c_string_length(mut s: *const core::ffi::c_char) -> usize {
 }
 
 /// Determine optimal parameters for a `rep` instruction.
+#[inline(always)]
 fn rep_param(dest: *mut u8, mut count: usize) -> (usize, usize, usize) {
     // Unaligned writes are still slow on modern processors, so align the destination address.
     let pre_byte_count = ((8 - (dest as usize & 0b111)) & 0b111).min(count);
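
A note on the `int_to_float` routines touched above: the closing `(e << 23) + m` uses `+` rather than `|` so that a mantissa that rounds up past 24 bits carries into the exponent field. Below is a standalone sketch of the `u32 -> f32` path for experimenting with that behavior; the lines not visible in the hunk context are reconstructed and assumed to match the version of `conv.rs` this diff targets.

```rust
// Round-to-nearest-even u32 -> f32 conversion in the style of
// `int_to_float::u32_to_f32_bits`. Illustrative reconstruction; only the
// zero check and the final line are confirmed by the hunks above.
fn u32_to_f32_bits(i: u32) -> u32 {
    if i == 0 {
        return 0;
    }
    let n = i.leading_zeros();
    let a = (i << n) >> 8; // Significant bits, with bit 24 still intact.
    let b = (i << n) << 24; // Insignificant bits, only relevant for rounding.
    let m = a + ((b - (b >> 31 & !a)) >> 31); // Round up, breaking ties to even.
    let e = 157 - n; // Exponent plus 127, minus one.
    (e << 23) + m // + not |, so the mantissa can overflow into the exponent.
}

fn main() {
    // The carry case: u32::MAX rounds up to 2^32. The mantissa overflows its
    // 23-bit field and the carry bumps the exponent, giving the correct bits.
    assert_eq!(f32::from_bits(u32_to_f32_bits(u32::MAX)), 4294967296.0);
    // Spot-check against the compiler's own `as` conversion, which also
    // rounds to nearest, ties to even.
    for i in [1u32, 2, 3, 16_777_217, 0x8000_0001, u32::MAX] {
        assert_eq!(f32::from_bits(u32_to_f32_bits(i)), i as f32);
    }
}
```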
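The `$(, $fun_attr:meta)*` additions to `impl_asymmetric!`, `impl_delegate!`, and `impl_trifecta!` all follow one pattern: capture any trailing `meta` fragments and re-emit each as `#[$fun_attr]` on the generated function, which is how `specialized_div_rem/mod.rs` now threads `inline(always)` (alongside the existing `allow(dead_code)`) into the expansions. A stripped-down sketch of the pattern, with illustrative names:

```rust
// Minimal version of the attribute-forwarding pattern used by the division
// macros above. `impl_div` and `u32_div_rem` are made-up names.
macro_rules! impl_div {
    (
        $fn:ident, // name of the generated function
        $uD:ident // unsigned integer type for the inputs and outputs of `$fn`
        $(, $fun_attr:meta)* // attributes for the function
    ) => {
        $(
            #[$fun_attr]
        )*
        pub fn $fn(duo: $uD, div: $uD) -> ($uD, $uD) {
            (duo / div, duo % div)
        }
    };
}

// Expands to `#[allow(dead_code)] #[inline(always)] pub fn u32_div_rem ...`.
impl_div!(u32_div_rem, u32, allow(dead_code), inline(always));

fn main() {
    assert_eq!(u32_div_rem(7, 3), (2, 1));
}
```

Because the repetition is `$(, ...)*` and matches zero occurrences, call sites that pass no attributes keep compiling unchanged; the existing invocations only needed a comma and the new fragment appended.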
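Finally, on `rep_param` in `src/mem/x86_64.rs`: the line visible in the hunk computes how many single bytes to copy before the destination pointer reaches 8-byte alignment, capped by the total count, since the `rep`-based loops want an aligned destination. The arithmetic, lifted into a standalone function for illustration (`pre_byte_count` as a free function is hypothetical; in the crate it is a local inside `rep_param`):

```rust
// Bytes to copy one at a time until `dest` is 8-byte aligned, never more
// than `count`. `dest & 0b111` is the misalignment; the outer `& 0b111`
// maps an already-aligned pointer to 0 instead of 8.
fn pre_byte_count(dest: usize, count: usize) -> usize {
    ((8 - (dest & 0b111)) & 0b111).min(count)
}

fn main() {
    assert_eq!(pre_byte_count(0x1000, 100), 0); // already aligned
    assert_eq!(pre_byte_count(0x1005, 100), 3); // 3 bytes to the next boundary
    assert_eq!(pre_byte_count(0x1005, 2), 2); // capped by a short count
}
```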