1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131
//! Defines rounding schemes for floating-point numbers.
#![doc(hidden)]
use crate::extended_float::ExtendedFloat;
use crate::mask::{lower_n_halfway, lower_n_mask};
use crate::num::Float;
// ROUNDING
// --------
/// Round an extended-precision float to the nearest machine float.
///
/// Shifts the significant digits into place, adjusts the exponent,
/// so it can be easily converted to a native float.
#[cfg_attr(not(feature = "compact"), inline)]
pub fn round<F, Cb>(fp: &mut ExtendedFloat, cb: Cb)
where
F: Float,
Cb: Fn(&mut ExtendedFloat, i32),
{
let fp_inf = ExtendedFloat {
mant: 0,
exp: F::INFINITE_POWER,
};
// Calculate our shift in significant digits.
let mantissa_shift = 64 - F::MANTISSA_SIZE - 1;
// Check for a denormal float, if after the shift the exponent is negative.
if -fp.exp >= mantissa_shift {
// Have a denormal float that isn't a literal 0.
// The extra 1 is to adjust for the denormal float, which is
// `1 - F::EXPONENT_BIAS`. This works as before, because our
// old logic rounded to `F::DENORMAL_EXPONENT` (now 1), and then
// checked if `exp == F::DENORMAL_EXPONENT` and no hidden mask
// bit was set. Here, we handle that here, rather than later.
//
// This might round-down to 0, but shift will be at **max** 65,
// for halfway cases rounding towards 0.
let shift = -fp.exp + 1;
debug_assert!(shift <= 65);
cb(fp, shift.min(64));
// Check for round-up: if rounding-nearest carried us to the hidden bit.
fp.exp = (fp.mant >= F::HIDDEN_BIT_MASK) as i32;
return;
}
// The float is normal, round to the hidden bit.
cb(fp, mantissa_shift);
// Check if we carried, and if so, shift the bit to the hidden bit.
let carry_mask = F::CARRY_MASK;
if fp.mant & carry_mask == carry_mask {
fp.mant >>= 1;
fp.exp += 1;
}
// Handle if we carried and check for overflow again.
if fp.exp >= F::INFINITE_POWER {
// Exponent is above largest normal value, must be infinite.
*fp = fp_inf;
return;
}
// Remove the hidden bit.
fp.mant &= F::MANTISSA_MASK;
}
/// Shift right N-bytes and round towards a direction.
///
/// Callback should take the following parameters:
/// 1. is_odd
/// 1. is_halfway
/// 1. is_above
#[cfg_attr(not(feature = "compact"), inline)]
pub fn round_nearest_tie_even<Cb>(fp: &mut ExtendedFloat, shift: i32, cb: Cb)
where
// is_odd, is_halfway, is_above
Cb: Fn(bool, bool, bool) -> bool,
{
// Ensure we've already handled denormal values that underflow.
debug_assert!(shift <= 64);
// Extract the truncated bits using mask.
// Calculate if the value of the truncated bits are either above
// the mid-way point, or equal to it.
//
// For example, for 4 truncated bytes, the mask would be 0b1111
// and the midway point would be 0b1000.
let mask = lower_n_mask(shift as u64);
let halfway = lower_n_halfway(shift as u64);
let truncated_bits = fp.mant & mask;
let is_above = truncated_bits > halfway;
let is_halfway = truncated_bits == halfway;
// Bit shift so the leading bit is in the hidden bit.
// This optimixes pretty well:
// ```text
// mov ecx, esi
// shr rdi, cl
// xor eax, eax
// cmp esi, 64
// cmovne rax, rdi
// ret
// ```
fp.mant = match shift == 64 {
true => 0,
false => fp.mant >> shift,
};
fp.exp += shift;
// Extract the last bit after shifting (and determine if it is odd).
let is_odd = fp.mant & 1 == 1;
// Calculate if we need to roundup.
// We need to roundup if we are above halfway, or if we are odd
// and at half-way (need to tie-to-even). Avoid the branch here.
fp.mant += cb(is_odd, is_halfway, is_above) as u64;
}
/// Round our significant digits into place, truncating them.
#[cfg_attr(not(feature = "compact"), inline)]
pub fn round_down(fp: &mut ExtendedFloat, shift: i32) {
// Might have a shift greater than 64 if we have an error.
fp.mant = match shift == 64 {
true => 0,
false => fp.mant >> shift,
};
fp.exp += shift;
}