wide/
f64x4_.rs

1use super::*;
2
3pick! {
4  if #[cfg(target_feature="avx")] {
5    #[derive(Default, Clone, Copy, PartialEq)]
6    #[repr(C, align(32))]
7    pub struct f64x4 { pub(crate) avx: m256d }
8  } else {
9    #[derive(Default, Clone, Copy, PartialEq)]
10    #[repr(C, align(32))]
11    pub struct f64x4 { pub(crate) a: f64x2, pub(crate) b: f64x2 }
12  }
13}
14
15macro_rules! const_f64_as_f64x4 {
16  ($i:ident, $f:expr) => {
17    #[allow(non_upper_case_globals)]
18    pub const $i: f64x4 = f64x4::new([$f; 4]);
19  };
20}
21
22impl f64x4 {
23  const_f64_as_f64x4!(ONE, 1.0);
24  const_f64_as_f64x4!(ZERO, 0.0);
25  const_f64_as_f64x4!(HALF, 0.5);
26  const_f64_as_f64x4!(E, core::f64::consts::E);
27  const_f64_as_f64x4!(FRAC_1_PI, core::f64::consts::FRAC_1_PI);
28  const_f64_as_f64x4!(FRAC_2_PI, core::f64::consts::FRAC_2_PI);
29  const_f64_as_f64x4!(FRAC_2_SQRT_PI, core::f64::consts::FRAC_2_SQRT_PI);
30  const_f64_as_f64x4!(FRAC_1_SQRT_2, core::f64::consts::FRAC_1_SQRT_2);
31  const_f64_as_f64x4!(FRAC_PI_2, core::f64::consts::FRAC_PI_2);
32  const_f64_as_f64x4!(FRAC_PI_3, core::f64::consts::FRAC_PI_3);
33  const_f64_as_f64x4!(FRAC_PI_4, core::f64::consts::FRAC_PI_4);
34  const_f64_as_f64x4!(FRAC_PI_6, core::f64::consts::FRAC_PI_6);
35  const_f64_as_f64x4!(FRAC_PI_8, core::f64::consts::FRAC_PI_8);
36  const_f64_as_f64x4!(LN_2, core::f64::consts::LN_2);
37  const_f64_as_f64x4!(LN_10, core::f64::consts::LN_10);
38  const_f64_as_f64x4!(LOG2_E, core::f64::consts::LOG2_E);
39  const_f64_as_f64x4!(LOG10_E, core::f64::consts::LOG10_E);
40  const_f64_as_f64x4!(LOG10_2, core::f64::consts::LOG10_2);
41  const_f64_as_f64x4!(LOG2_10, core::f64::consts::LOG2_10);
42  const_f64_as_f64x4!(PI, core::f64::consts::PI);
43  const_f64_as_f64x4!(SQRT_2, core::f64::consts::SQRT_2);
44  const_f64_as_f64x4!(TAU, core::f64::consts::TAU);
45}
46
47unsafe impl Zeroable for f64x4 {}
48unsafe impl Pod for f64x4 {}
49
50impl AlignTo for f64x4 {
51  type Elem = f64;
52}
53
54impl Add for f64x4 {
55  type Output = Self;
56  #[inline]
57  fn add(self, rhs: Self) -> Self::Output {
58    pick! {
59      if #[cfg(target_feature="avx")] {
60        Self { avx: add_m256d(self.avx, rhs.avx) }
61      } else {
62        Self {
63          a : self.a.add(rhs.a),
64          b : self.b.add(rhs.b),
65        }
66      }
67    }
68  }
69}
70
71impl Sub for f64x4 {
72  type Output = Self;
73  #[inline]
74  fn sub(self, rhs: Self) -> Self::Output {
75    pick! {
76      if #[cfg(target_feature="avx")] {
77        Self { avx: sub_m256d(self.avx, rhs.avx) }
78      } else {
79        Self {
80          a : self.a.sub(rhs.a),
81          b : self.b.sub(rhs.b),
82        }
83      }
84    }
85  }
86}
87
88impl Mul for f64x4 {
89  type Output = Self;
90  #[inline]
91  fn mul(self, rhs: Self) -> Self::Output {
92    pick! {
93      if #[cfg(target_feature="avx")] {
94        Self { avx: mul_m256d(self.avx, rhs.avx) }
95      } else {
96        Self {
97          a : self.a.mul(rhs.a),
98          b : self.b.mul(rhs.b),
99        }
100      }
101    }
102  }
103}
104
105impl Div for f64x4 {
106  type Output = Self;
107  #[inline]
108  fn div(self, rhs: Self) -> Self::Output {
109    pick! {
110      if #[cfg(target_feature="avx")] {
111        Self { avx: div_m256d(self.avx, rhs.avx) }
112      } else {
113        Self {
114          a : self.a.div(rhs.a),
115          b : self.b.div(rhs.b),
116        }
117      }
118    }
119  }
120}
121
122impl Neg for f64x4 {
123  type Output = Self;
124  #[inline]
125  fn neg(self) -> Self::Output {
126    pick! {
127      if #[cfg(target_feature="avx")] {
128        Self { avx: bitxor_m256d(self.avx, Self::splat(-0.0).avx) }
129      } else {
130        Self {
131          a : self.a.neg(),
132          b : self.b.neg(),
133        }
134      }
135    }
136  }
137}
138
139impl Add<f64> for f64x4 {
140  type Output = Self;
141  #[inline]
142  fn add(self, rhs: f64) -> Self::Output {
143    self.add(Self::splat(rhs))
144  }
145}
146
147impl Sub<f64> for f64x4 {
148  type Output = Self;
149  #[inline]
150  fn sub(self, rhs: f64) -> Self::Output {
151    self.sub(Self::splat(rhs))
152  }
153}
154
155impl Mul<f64> for f64x4 {
156  type Output = Self;
157  #[inline]
158  fn mul(self, rhs: f64) -> Self::Output {
159    self.mul(Self::splat(rhs))
160  }
161}
162
163impl Div<f64> for f64x4 {
164  type Output = Self;
165  #[inline]
166  fn div(self, rhs: f64) -> Self::Output {
167    self.div(Self::splat(rhs))
168  }
169}
170
171impl Add<f64x4> for f64 {
172  type Output = f64x4;
173  #[inline]
174  fn add(self, rhs: f64x4) -> Self::Output {
175    f64x4::splat(self).add(rhs)
176  }
177}
178
179impl Sub<f64x4> for f64 {
180  type Output = f64x4;
181  #[inline]
182  fn sub(self, rhs: f64x4) -> Self::Output {
183    f64x4::splat(self).sub(rhs)
184  }
185}
186
187impl Mul<f64x4> for f64 {
188  type Output = f64x4;
189  #[inline]
190  fn mul(self, rhs: f64x4) -> Self::Output {
191    f64x4::splat(self).mul(rhs)
192  }
193}
194
195impl Div<f64x4> for f64 {
196  type Output = f64x4;
197  #[inline]
198  fn div(self, rhs: f64x4) -> Self::Output {
199    f64x4::splat(self).div(rhs)
200  }
201}
202
203impl BitAnd for f64x4 {
204  type Output = Self;
205  #[inline]
206  fn bitand(self, rhs: Self) -> Self::Output {
207    pick! {
208      if #[cfg(target_feature="avx")] {
209        Self { avx: bitand_m256d(self.avx, rhs.avx) }
210      } else {
211        Self {
212          a : self.a.bitand(rhs.a),
213          b : self.b.bitand(rhs.b),
214        }
215      }
216    }
217  }
218}
219
220impl BitOr for f64x4 {
221  type Output = Self;
222  #[inline]
223  fn bitor(self, rhs: Self) -> Self::Output {
224    pick! {
225      if #[cfg(target_feature="avx")] {
226        Self { avx: bitor_m256d(self.avx, rhs.avx) }
227      } else {
228        Self {
229          a : self.a.bitor(rhs.a),
230          b : self.b.bitor(rhs.b),
231        }
232      }
233    }
234  }
235}
236
237impl BitXor for f64x4 {
238  type Output = Self;
239  #[inline]
240  fn bitxor(self, rhs: Self) -> Self::Output {
241    pick! {
242      if #[cfg(target_feature="avx")] {
243        Self { avx: bitxor_m256d(self.avx, rhs.avx) }
244      } else {
245        Self {
246          a : self.a.bitxor(rhs.a),
247          b : self.b.bitxor(rhs.b),
248        }
249      }
250    }
251  }
252}
253
254impl CmpEq for f64x4 {
255  type Output = Self;
256  #[inline]
257  fn simd_eq(self, rhs: Self) -> Self::Output {
258    pick! {
259      if #[cfg(target_feature="avx")]{
260        Self { avx: cmp_op_mask_m256d::<{cmp_op!(EqualOrdered)}>(self.avx, rhs.avx) }
261      } else {
262        Self {
263          a : self.a.simd_eq(rhs.a),
264          b : self.b.simd_eq(rhs.b),
265        }
266      }
267    }
268  }
269}
270
271impl CmpGe for f64x4 {
272  type Output = Self;
273  #[inline]
274  fn simd_ge(self, rhs: Self) -> Self::Output {
275    pick! {
276      if #[cfg(target_feature="avx")]{
277        Self { avx: cmp_op_mask_m256d::<{cmp_op!(GreaterEqualOrdered)}>(self.avx, rhs.avx) }
278      } else {
279        Self {
280          a : self.a.simd_ge(rhs.a),
281          b : self.b.simd_ge(rhs.b),
282        }
283      }
284    }
285  }
286}
287
288impl CmpGt for f64x4 {
289  type Output = Self;
290  #[inline]
291  fn simd_gt(self, rhs: Self) -> Self::Output {
292    pick! {
293      if #[cfg(target_feature="avx")]{
294        Self { avx: cmp_op_mask_m256d::<{cmp_op!( GreaterThanOrdered)}>(self.avx, rhs.avx) }
295      } else {
296        Self {
297          a : self.a.simd_gt(rhs.a),
298          b : self.b.simd_gt(rhs.b),
299        }
300      }
301    }
302  }
303}
304
305impl CmpNe for f64x4 {
306  type Output = Self;
307  #[inline]
308  fn simd_ne(self, rhs: Self) -> Self::Output {
309    pick! {
310      if #[cfg(target_feature="avx")]{
311        Self { avx: cmp_op_mask_m256d::<{cmp_op!(NotEqualOrdered)}>(self.avx, rhs.avx) }
312      } else {
313        Self {
314          a : self.a.simd_ne(rhs.a),
315          b : self.b.simd_ne(rhs.b),
316        }
317      }
318    }
319  }
320}
321
322impl CmpLe for f64x4 {
323  type Output = Self;
324  #[inline]
325  fn simd_le(self, rhs: Self) -> Self::Output {
326    pick! {
327      if #[cfg(target_feature="avx")]{
328        Self { avx: cmp_op_mask_m256d::<{cmp_op!(LessEqualOrdered)}>(self.avx, rhs.avx) }
329      } else {
330        Self {
331          a : self.a.simd_le(rhs.a),
332          b : self.b.simd_le(rhs.b),
333        }
334      }
335    }
336  }
337}
338
339impl CmpLt for f64x4 {
340  type Output = Self;
341  #[inline]
342  fn simd_lt(self, rhs: Self) -> Self::Output {
343    pick! {
344      if #[cfg(target_feature="avx")]{
345        Self { avx: cmp_op_mask_m256d::<{cmp_op!(LessThanOrdered)}>(self.avx, rhs.avx) }
346      } else {
347        Self {
348          a : self.a.simd_lt(rhs.a),
349          b : self.b.simd_lt(rhs.b),
350        }
351      }
352    }
353  }
354}
355
356impl f64x4 {
357  #[inline]
358  #[must_use]
359  pub const fn new(array: [f64; 4]) -> Self {
360    unsafe { core::mem::transmute(array) }
361  }
362  #[inline]
363  #[must_use]
364  pub fn blend(self, t: Self, f: Self) -> Self {
365    pick! {
366      if #[cfg(target_feature="avx")] {
367        Self { avx: blend_varying_m256d(f.avx, t.avx, self.avx) }
368      } else {
369        Self {
370          a : self.a.blend(t.a, f.a),
371          b : self.b.blend(t.b, f.b),
372        }
373      }
374    }
375  }
376
377  #[inline]
378  #[must_use]
379  pub fn abs(self) -> Self {
380    pick! {
381      if #[cfg(target_feature="avx")] {
382        let non_sign_bits = f64x4::from(f64::from_bits(i64::MAX as u64));
383        self & non_sign_bits
384      } else {
385        Self {
386          a : self.a.abs(),
387          b : self.b.abs(),
388        }
389      }
390    }
391  }
392
393  #[inline]
394  #[must_use]
395  pub fn floor(self) -> Self {
396    pick! {
397      if #[cfg(target_feature="avx")] {
398        Self { avx: floor_m256d(self.avx) }
399      } else {
400        Self {
401          a : self.a.floor(),
402          b : self.b.floor(),
403        }
404      }
405    }
406  }
407  #[inline]
408  #[must_use]
409  pub fn ceil(self) -> Self {
410    pick! {
411      if #[cfg(target_feature="avx")] {
412        Self { avx: ceil_m256d(self.avx) }
413      } else {
414        Self {
415          a : self.a.ceil(),
416          b : self.b.ceil(),
417        }
418      }
419    }
420  }
421
422  /// Calculates the lanewise maximum of both vectors. This is a faster
423  /// implementation than `max`, but it doesn't specify any behavior if NaNs are
424  /// involved.
425  #[inline]
426  #[must_use]
427  pub fn fast_max(self, rhs: Self) -> Self {
428    pick! {
429      if #[cfg(target_feature="avx")] {
430        Self { avx: max_m256d(self.avx, rhs.avx) }
431      } else {
432        Self {
433          a : self.a.fast_max(rhs.a),
434          b : self.b.fast_max(rhs.b),
435        }
436      }
437    }
438  }
439
440  /// Calculates the lanewise maximum of both vectors. If either lane is NaN,
441  /// the other lane gets chosen. Use `fast_max` for a faster implementation
442  /// that doesn't handle NaNs.
443  #[inline]
444  #[must_use]
445  pub fn max(self, rhs: Self) -> Self {
446    pick! {
447      if #[cfg(target_feature="avx")] {
448        // max_m256d seems to do rhs < self ? self : rhs. So if there's any NaN
449        // involved, it chooses rhs, so we need to specifically check rhs for
450        // NaN.
451        rhs.is_nan().blend(self, Self { avx: max_m256d(self.avx, rhs.avx) })
452      } else {
453        Self {
454          a : self.a.max(rhs.a),
455          b : self.b.max(rhs.b),
456        }
457      }
458    }
459  }
460
461  /// Calculates the lanewise minimum of both vectors. This is a faster
462  /// implementation than `min`, but it doesn't specify any behavior if NaNs are
463  /// involved.
464  #[inline]
465  #[must_use]
466  pub fn fast_min(self, rhs: Self) -> Self {
467    pick! {
468      if #[cfg(target_feature="avx")] {
469        Self { avx: min_m256d(self.avx, rhs.avx) }
470      } else {
471        Self {
472          a : self.a.fast_min(rhs.a),
473          b : self.b.fast_min(rhs.b),
474        }
475      }
476    }
477  }
478
479  /// Calculates the lanewise minimum of both vectors. If either lane is NaN,
480  /// the other lane gets chosen. Use `fast_min` for a faster implementation
481  /// that doesn't handle NaNs.
482  #[inline]
483  #[must_use]
484  pub fn min(self, rhs: Self) -> Self {
485    pick! {
486      if #[cfg(target_feature="avx")] {
487        // min_m256d seems to do rhs < self ? self : rhs. So if there's any NaN
488        // involved, it chooses rhs, so we need to specifically check rhs for
489        // NaN.
490        rhs.is_nan().blend(self, Self { avx: min_m256d(self.avx, rhs.avx) })
491      } else {
492        Self {
493          a : self.a.min(rhs.a),
494          b : self.b.min(rhs.b),
495        }
496      }
497    }
498  }
499
500  #[inline]
501  #[must_use]
502  pub fn is_nan(self) -> Self {
503    pick! {
504      if #[cfg(target_feature="avx")] {
505        Self { avx: cmp_op_mask_m256d::<{cmp_op!(Unordered)}>(self.avx, self.avx ) }
506      } else {
507        Self {
508          a : self.a.is_nan(),
509          b : self.b.is_nan(),
510        }
511      }
512    }
513  }
514
515  #[inline]
516  #[must_use]
517  pub fn is_finite(self) -> Self {
518    let shifted_exp_mask = u64x4::from(0xFFE0000000000000);
519    let u: u64x4 = cast(self);
520    let shift_u = u << 1_u64;
521    let out = !(shift_u & shifted_exp_mask).simd_eq(shifted_exp_mask);
522    cast(out)
523  }
524
525  #[inline]
526  #[must_use]
527  pub fn is_inf(self) -> Self {
528    let shifted_inf = u64x4::from(0xFFE0000000000000);
529    let u: u64x4 = cast(self);
530    let shift_u = u << 1_u64;
531    let out = (shift_u).simd_eq(shifted_inf);
532    cast(out)
533  }
534
535  #[inline]
536  #[must_use]
537  pub fn round(self) -> Self {
538    pick! {
539      if #[cfg(target_feature="avx")] {
540        Self { avx: round_m256d::<{round_op!(Nearest)}>(self.avx) }
541      } else {
542        Self {
543          a : self.a.round(),
544          b : self.b.round(),
545        }
546      }
547    }
548  }
549
550  #[inline]
551  #[must_use]
552  pub fn round_int(self) -> i64x4 {
553    // NOTE:No optimization for this currently available so delegate to LLVM
554    let rounded: [f64; 4] = cast(self.round());
555    cast([
556      rounded[0] as i64,
557      rounded[1] as i64,
558      rounded[2] as i64,
559      rounded[3] as i64,
560    ])
561  }
562
563  /// Performs a multiply-add operation: `self * m + a`
564  ///
565  /// When hardware FMA support is available, this computes the result with a
566  /// single rounding operation. Without FMA support, it falls back to separate
567  /// multiply and add operations with two roundings.
568  ///
569  /// # Platform-specific behavior
570  /// - On `x86`/`x86_64` with AVX+FMA: Uses 256-bit `vfmadd` (single rounding,
571  ///   best accuracy)
572  /// - On `x86`/`x86_64` with AVX only: Uses `(self * m) + a` (two roundings)
573  /// - Other platforms: Delegates to [`f64x2`] (may use NEON FMA or fallback)
574  ///
575  /// # Examples
576  /// ```
577  /// # use wide::f64x4;
578  /// let a = f64x4::from([1.0, 2.0, 3.0, 4.0]);
579  /// let b = f64x4::from([2.0; 4]);
580  /// let c = f64x4::from([10.0; 4]);
581  ///
582  /// let result = a.mul_add(b, c);
583  ///
584  /// let expected = f64x4::from([12.0, 14.0, 16.0, 18.0]);
585  /// assert_eq!(result, expected);
586  /// ```
587  #[inline]
588  #[must_use]
589  pub fn mul_add(self, m: Self, a: Self) -> Self {
590    pick! {
591      if #[cfg(all(target_feature="avx",target_feature="fma"))] {
592        Self { avx: fused_mul_add_m256d(self.avx, m.avx, a.avx) }
593      } else if #[cfg(target_feature="avx")] {
594        // still want to use 256 bit ops
595        (self * m) + a
596      } else {
597        Self {
598          a : self.a.mul_add(m.a, a.a),
599          b : self.b.mul_add(m.b, a.b),
600        }
601      }
602    }
603  }
604
605  /// Performs a multiply-subtract operation: `self * m - s`
606  ///
607  /// When hardware FMA support is available, this computes the result with a
608  /// single rounding operation. Without FMA support, it falls back to separate
609  /// multiply and subtract operations with two roundings.
610  ///
611  /// # Platform-specific behavior
612  /// - On `x86`/`x86_64` with AVX+FMA: Uses 256-bit `vfmsub` (single rounding,
613  ///   best accuracy)
614  /// - On `x86`/`x86_64` with AVX only: Uses `(self * m) - s` (two roundings)
615  /// - Other platforms: Delegates to [`f64x2`] (may use NEON FMA or fallback)
616  ///
617  /// # Examples
618  /// ```
619  /// # use wide::f64x4;
620  /// let a = f64x4::from([10.0; 4]);
621  /// let b = f64x4::from([2.0; 4]);
622  /// let c = f64x4::from([5.0; 4]);
623  ///
624  /// let result = a.mul_sub(b, c);
625  ///
626  /// let expected = f64x4::from([15.0; 4]);
627  /// assert_eq!(result, expected);
628  /// ```
629  #[inline]
630  #[must_use]
631  pub fn mul_sub(self, m: Self, s: Self) -> Self {
632    pick! {
633      if #[cfg(all(target_feature="avx",target_feature="fma"))] {
634        Self { avx: fused_mul_sub_m256d(self.avx, m.avx, s.avx) }
635      } else if #[cfg(target_feature="avx")] {
636        // still want to use 256 bit ops
637        (self * m) - s
638      } else {
639        Self {
640          a : self.a.mul_sub(m.a, s.a),
641          b : self.b.mul_sub(m.b, s.b),
642        }
643      }
644    }
645  }
646
647  /// Performs a negative multiply-add operation: `a - (self * m)`
648  ///
649  /// When hardware FMA support is available, this computes the result with a
650  /// single rounding operation. Without FMA support, it falls back to separate
651  /// operations with two roundings.
652  ///
653  /// # Platform-specific behavior
654  /// - On `x86`/`x86_64` with AVX+FMA: Uses 256-bit `vfnmadd` (single rounding,
655  ///   best accuracy)
656  /// - On `x86`/`x86_64` with AVX only: Uses `a - (self * m)` (two roundings)
657  /// - Other platforms: Delegates to [`f64x2`] (may use NEON FMA or fallback)
658  ///
659  /// # Examples
660  /// ```
661  /// # use wide::f64x4;
662  /// let a = f64x4::from([3.0; 4]);
663  /// let b = f64x4::from([2.0; 4]);
664  /// let c = f64x4::from([10.0; 4]);
665  ///
666  /// let result = a.mul_neg_add(b, c);
667  ///
668  /// let expected = f64x4::from([4.0; 4]);
669  /// assert_eq!(result, expected);
670  /// ```
671  #[inline]
672  #[must_use]
673  pub fn mul_neg_add(self, m: Self, a: Self) -> Self {
674    pick! {
675      if #[cfg(all(target_feature="avx",target_feature="fma"))] {
676        Self { avx: fused_mul_neg_add_m256d(self.avx, m.avx, a.avx) }
677      } else if #[cfg(target_feature="avx")] {
678        // still want to use 256 bit ops
679        a - (self * m)
680      } else {
681        Self {
682          a : self.a.mul_neg_add(m.a, a.a),
683          b : self.b.mul_neg_add(m.b, a.b),
684        }
685      }
686    }
687  }
688
689  /// Performs a negative multiply-subtract operation: `-(self * m) - s`
690  ///
691  /// When hardware FMA support is available, this computes the result with a
692  /// single rounding operation. Without FMA support, it falls back to separate
693  /// operations with two roundings.
694  ///
695  /// # Platform-specific behavior
696  /// - On `x86`/`x86_64` with AVX+FMA: Uses 256-bit `vfnmsub` (single rounding,
697  ///   best accuracy)
698  /// - On `x86`/`x86_64` with AVX only: Uses `-(self * m) - s` (two roundings)
699  /// - Other platforms: Delegates to [`f64x2`] (may use NEON FMA or fallback)
700  ///
701  /// # Examples
702  /// ```
703  /// # use wide::f64x4;
704  /// let a = f64x4::from([3.0; 4]);
705  /// let b = f64x4::from([2.0; 4]);
706  /// let c = f64x4::from([1.0; 4]);
707  ///
708  /// let result = a.mul_neg_sub(b, c);
709  ///
710  /// let expected = f64x4::from([-7.0; 4]);
711  /// assert_eq!(result, expected);
712  /// ```
713  #[inline]
714  #[must_use]
715  pub fn mul_neg_sub(self, m: Self, s: Self) -> Self {
716    pick! {
717       if #[cfg(all(target_feature="avx",target_feature="fma"))] {
718         Self { avx: fused_mul_neg_sub_m256d(self.avx, m.avx, s.avx) }
719        } else if #[cfg(target_feature="avx")] {
720          // still want to use 256 bit ops
721          -(self * m) - s
722        } else {
723         Self {
724           a : self.a.mul_neg_sub(m.a, s.a),
725           b : self.b.mul_neg_sub(m.b, s.b),
726         }
727       }
728    }
729  }
730
731  #[inline]
732  #[must_use]
733  pub fn flip_signs(self, signs: Self) -> Self {
734    self ^ (signs & Self::from(-0.0))
735  }
736
737  #[inline]
738  #[must_use]
739  pub fn copysign(self, sign: Self) -> Self {
740    let magnitude_mask = Self::from(f64::from_bits(u64::MAX >> 1));
741    (self & magnitude_mask) | (sign & Self::from(-0.0))
742  }
743
744  #[inline]
745  pub fn asin_acos(self) -> (Self, Self) {
746    // Based on the Agner Fog "vector class library":
747    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
748    const_f64_as_f64x4!(R4asin, 2.967721961301243206100E-3);
749    const_f64_as_f64x4!(R3asin, -5.634242780008963776856E-1);
750    const_f64_as_f64x4!(R2asin, 6.968710824104713396794E0);
751    const_f64_as_f64x4!(R1asin, -2.556901049652824852289E1);
752    const_f64_as_f64x4!(R0asin, 2.853665548261061424989E1);
753
754    const_f64_as_f64x4!(S3asin, -2.194779531642920639778E1);
755    const_f64_as_f64x4!(S2asin, 1.470656354026814941758E2);
756    const_f64_as_f64x4!(S1asin, -3.838770957603691357202E2);
757    const_f64_as_f64x4!(S0asin, 3.424398657913078477438E2);
758
759    const_f64_as_f64x4!(P5asin, 4.253011369004428248960E-3);
760    const_f64_as_f64x4!(P4asin, -6.019598008014123785661E-1);
761    const_f64_as_f64x4!(P3asin, 5.444622390564711410273E0);
762    const_f64_as_f64x4!(P2asin, -1.626247967210700244449E1);
763    const_f64_as_f64x4!(P1asin, 1.956261983317594739197E1);
764    const_f64_as_f64x4!(P0asin, -8.198089802484824371615E0);
765
766    const_f64_as_f64x4!(Q4asin, -1.474091372988853791896E1);
767    const_f64_as_f64x4!(Q3asin, 7.049610280856842141659E1);
768    const_f64_as_f64x4!(Q2asin, -1.471791292232726029859E2);
769    const_f64_as_f64x4!(Q1asin, 1.395105614657485689735E2);
770    const_f64_as_f64x4!(Q0asin, -4.918853881490881290097E1);
771
772    let xa = self.abs();
773
774    let big = xa.simd_ge(f64x4::splat(0.625));
775
776    let x1 = big.blend(f64x4::splat(1.0) - xa, xa * xa);
777
778    let x2 = x1 * x1;
779    let x3 = x2 * x1;
780    let x4 = x2 * x2;
781    let x5 = x4 * x1;
782
783    let do_big = big.any();
784    let do_small = !big.all();
785
786    let mut rx = f64x4::default();
787    let mut sx = f64x4::default();
788    let mut px = f64x4::default();
789    let mut qx = f64x4::default();
790
791    if do_big {
792      rx = x3.mul_add(R3asin, x2 * R2asin)
793        + x4.mul_add(R4asin, x1.mul_add(R1asin, R0asin));
794      sx =
795        x3.mul_add(S3asin, x4) + x2.mul_add(S2asin, x1.mul_add(S1asin, S0asin));
796    }
797
798    if do_small {
799      px = x3.mul_add(P3asin, P0asin)
800        + x4.mul_add(P4asin, x1 * P1asin)
801        + x5.mul_add(P5asin, x2 * P2asin);
802      qx = x4.mul_add(Q4asin, x5)
803        + x3.mul_add(Q3asin, x1 * Q1asin)
804        + x2.mul_add(Q2asin, Q0asin);
805    };
806
807    let vx = big.blend(rx, px);
808    let wx = big.blend(sx, qx);
809
810    let y1 = vx / wx * x1;
811
812    let mut z1 = f64x4::default();
813    let mut z2 = f64x4::default();
814    if do_big {
815      let xb = (x1 + x1).sqrt();
816      z1 = xb.mul_add(y1, xb);
817    }
818
819    if do_small {
820      z2 = xa.mul_add(y1, xa);
821    }
822
823    // asin
824    let z3 = f64x4::FRAC_PI_2 - z1;
825    let asin = big.blend(z3, z2);
826    let asin = asin.flip_signs(self);
827
828    // acos
829    let z3 = self.simd_lt(f64x4::ZERO).blend(f64x4::PI - z1, z1);
830    let z4 = f64x4::FRAC_PI_2 - z2.flip_signs(self);
831    let acos = big.blend(z3, z4);
832
833    (asin, acos)
834  }
835
836  #[inline]
837  pub fn acos(self) -> Self {
838    // Based on the Agner Fog "vector class library":
839    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
840    const_f64_as_f64x4!(R4asin, 2.967721961301243206100E-3);
841    const_f64_as_f64x4!(R3asin, -5.634242780008963776856E-1);
842    const_f64_as_f64x4!(R2asin, 6.968710824104713396794E0);
843    const_f64_as_f64x4!(R1asin, -2.556901049652824852289E1);
844    const_f64_as_f64x4!(R0asin, 2.853665548261061424989E1);
845
846    const_f64_as_f64x4!(S3asin, -2.194779531642920639778E1);
847    const_f64_as_f64x4!(S2asin, 1.470656354026814941758E2);
848    const_f64_as_f64x4!(S1asin, -3.838770957603691357202E2);
849    const_f64_as_f64x4!(S0asin, 3.424398657913078477438E2);
850
851    const_f64_as_f64x4!(P5asin, 4.253011369004428248960E-3);
852    const_f64_as_f64x4!(P4asin, -6.019598008014123785661E-1);
853    const_f64_as_f64x4!(P3asin, 5.444622390564711410273E0);
854    const_f64_as_f64x4!(P2asin, -1.626247967210700244449E1);
855    const_f64_as_f64x4!(P1asin, 1.956261983317594739197E1);
856    const_f64_as_f64x4!(P0asin, -8.198089802484824371615E0);
857
858    const_f64_as_f64x4!(Q4asin, -1.474091372988853791896E1);
859    const_f64_as_f64x4!(Q3asin, 7.049610280856842141659E1);
860    const_f64_as_f64x4!(Q2asin, -1.471791292232726029859E2);
861    const_f64_as_f64x4!(Q1asin, 1.395105614657485689735E2);
862    const_f64_as_f64x4!(Q0asin, -4.918853881490881290097E1);
863
864    let xa = self.abs();
865
866    let big = xa.simd_ge(f64x4::splat(0.625));
867
868    let x1 = big.blend(f64x4::splat(1.0) - xa, xa * xa);
869
870    let x2 = x1 * x1;
871    let x3 = x2 * x1;
872    let x4 = x2 * x2;
873    let x5 = x4 * x1;
874
875    let do_big = big.any();
876    let do_small = !big.all();
877
878    let mut rx = f64x4::default();
879    let mut sx = f64x4::default();
880    let mut px = f64x4::default();
881    let mut qx = f64x4::default();
882
883    if do_big {
884      rx = x3.mul_add(R3asin, x2 * R2asin)
885        + x4.mul_add(R4asin, x1.mul_add(R1asin, R0asin));
886      sx =
887        x3.mul_add(S3asin, x4) + x2.mul_add(S2asin, x1.mul_add(S1asin, S0asin));
888    }
889    if do_small {
890      px = x3.mul_add(P3asin, P0asin)
891        + x4.mul_add(P4asin, x1 * P1asin)
892        + x5.mul_add(P5asin, x2 * P2asin);
893      qx = x4.mul_add(Q4asin, x5)
894        + x3.mul_add(Q3asin, x1 * Q1asin)
895        + x2.mul_add(Q2asin, Q0asin);
896    };
897
898    let vx = big.blend(rx, px);
899    let wx = big.blend(sx, qx);
900
901    let y1 = vx / wx * x1;
902
903    let mut z1 = f64x4::default();
904    let mut z2 = f64x4::default();
905    if do_big {
906      let xb = (x1 + x1).sqrt();
907      z1 = xb.mul_add(y1, xb);
908    }
909
910    if do_small {
911      z2 = xa.mul_add(y1, xa);
912    }
913
914    // acos
915    let z3 = self.simd_lt(f64x4::ZERO).blend(f64x4::PI - z1, z1);
916    let z4 = f64x4::FRAC_PI_2 - z2.flip_signs(self);
917    let acos = big.blend(z3, z4);
918
919    acos
920  }
921  #[inline]
922  #[must_use]
923  pub fn asin(self) -> Self {
924    // Based on the Agner Fog "vector class library":
925    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
926    const_f64_as_f64x4!(R4asin, 2.967721961301243206100E-3);
927    const_f64_as_f64x4!(R3asin, -5.634242780008963776856E-1);
928    const_f64_as_f64x4!(R2asin, 6.968710824104713396794E0);
929    const_f64_as_f64x4!(R1asin, -2.556901049652824852289E1);
930    const_f64_as_f64x4!(R0asin, 2.853665548261061424989E1);
931
932    const_f64_as_f64x4!(S3asin, -2.194779531642920639778E1);
933    const_f64_as_f64x4!(S2asin, 1.470656354026814941758E2);
934    const_f64_as_f64x4!(S1asin, -3.838770957603691357202E2);
935    const_f64_as_f64x4!(S0asin, 3.424398657913078477438E2);
936
937    const_f64_as_f64x4!(P5asin, 4.253011369004428248960E-3);
938    const_f64_as_f64x4!(P4asin, -6.019598008014123785661E-1);
939    const_f64_as_f64x4!(P3asin, 5.444622390564711410273E0);
940    const_f64_as_f64x4!(P2asin, -1.626247967210700244449E1);
941    const_f64_as_f64x4!(P1asin, 1.956261983317594739197E1);
942    const_f64_as_f64x4!(P0asin, -8.198089802484824371615E0);
943
944    const_f64_as_f64x4!(Q4asin, -1.474091372988853791896E1);
945    const_f64_as_f64x4!(Q3asin, 7.049610280856842141659E1);
946    const_f64_as_f64x4!(Q2asin, -1.471791292232726029859E2);
947    const_f64_as_f64x4!(Q1asin, 1.395105614657485689735E2);
948    const_f64_as_f64x4!(Q0asin, -4.918853881490881290097E1);
949
950    let xa = self.abs();
951
952    let big = xa.simd_ge(f64x4::splat(0.625));
953
954    let x1 = big.blend(f64x4::splat(1.0) - xa, xa * xa);
955
956    let x2 = x1 * x1;
957    let x3 = x2 * x1;
958    let x4 = x2 * x2;
959    let x5 = x4 * x1;
960
961    let do_big = big.any();
962    let do_small = !big.all();
963
964    let mut rx = f64x4::default();
965    let mut sx = f64x4::default();
966    let mut px = f64x4::default();
967    let mut qx = f64x4::default();
968
969    if do_big {
970      rx = x3.mul_add(R3asin, x2 * R2asin)
971        + x4.mul_add(R4asin, x1.mul_add(R1asin, R0asin));
972      sx =
973        x3.mul_add(S3asin, x4) + x2.mul_add(S2asin, x1.mul_add(S1asin, S0asin));
974    }
975    if do_small {
976      px = x3.mul_add(P3asin, P0asin)
977        + x4.mul_add(P4asin, x1 * P1asin)
978        + x5.mul_add(P5asin, x2 * P2asin);
979      qx = x4.mul_add(Q4asin, x5)
980        + x3.mul_add(Q3asin, x1 * Q1asin)
981        + x2.mul_add(Q2asin, Q0asin);
982    };
983
984    let vx = big.blend(rx, px);
985    let wx = big.blend(sx, qx);
986
987    let y1 = vx / wx * x1;
988
989    let mut z1 = f64x4::default();
990    let mut z2 = f64x4::default();
991    if do_big {
992      let xb = (x1 + x1).sqrt();
993      z1 = xb.mul_add(y1, xb);
994    }
995
996    if do_small {
997      z2 = xa.mul_add(y1, xa);
998    }
999
1000    // asin
1001    let z3 = f64x4::FRAC_PI_2 - z1;
1002    let asin = big.blend(z3, z2);
1003    let asin = asin.flip_signs(self);
1004
1005    asin
1006  }
1007
1008  #[inline]
1009  pub fn atan(self) -> Self {
1010    // Based on the Agner Fog "vector class library":
1011    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
1012    const_f64_as_f64x4!(MORE_BITS, 6.123233995736765886130E-17);
1013    const_f64_as_f64x4!(MORE_BITS_O2, 6.123233995736765886130E-17 * 0.5);
1014    const_f64_as_f64x4!(T3PO8, core::f64::consts::SQRT_2 + 1.0);
1015
1016    const_f64_as_f64x4!(P4atan, -8.750608600031904122785E-1);
1017    const_f64_as_f64x4!(P3atan, -1.615753718733365076637E1);
1018    const_f64_as_f64x4!(P2atan, -7.500855792314704667340E1);
1019    const_f64_as_f64x4!(P1atan, -1.228866684490136173410E2);
1020    const_f64_as_f64x4!(P0atan, -6.485021904942025371773E1);
1021
1022    const_f64_as_f64x4!(Q4atan, 2.485846490142306297962E1);
1023    const_f64_as_f64x4!(Q3atan, 1.650270098316988542046E2);
1024    const_f64_as_f64x4!(Q2atan, 4.328810604912902668951E2);
1025    const_f64_as_f64x4!(Q1atan, 4.853903996359136964868E2);
1026    const_f64_as_f64x4!(Q0atan, 1.945506571482613964425E2);
1027
1028    let t = self.abs();
1029
1030    // small:  t < 0.66
1031    // medium: t <= t <= 2.4142 (1+sqrt(2))
1032    // big:    t > 2.4142
1033    let notbig = t.simd_le(T3PO8);
1034    let notsmal = t.simd_ge(Self::splat(0.66));
1035
1036    let mut s = notbig.blend(Self::FRAC_PI_4, Self::FRAC_PI_2);
1037    s = notsmal & s;
1038    let mut fac = notbig.blend(MORE_BITS_O2, MORE_BITS);
1039    fac = notsmal & fac;
1040
1041    // small:  z = t / 1.0;
1042    // medium: z = (t-1.0) / (t+1.0);
1043    // big:    z = -1.0 / t;
1044    let mut a = notbig & t;
1045    a = notsmal.blend(a - Self::ONE, a);
1046    let mut b = notbig & Self::ONE;
1047    b = notsmal.blend(b + t, b);
1048    let z = a / b;
1049
1050    let zz = z * z;
1051
1052    let px = polynomial_4!(zz, P0atan, P1atan, P2atan, P3atan, P4atan);
1053    let qx = polynomial_5n!(zz, Q0atan, Q1atan, Q2atan, Q3atan, Q4atan);
1054
1055    let mut re = (px / qx).mul_add(z * zz, z);
1056    re += s + fac;
1057
1058    // get sign bit
1059    re = (self.sign_bit()).blend(-re, re);
1060
1061    re
1062  }
1063
1064  #[inline]
1065  pub fn atan2(self, x: Self) -> Self {
1066    // Based on the Agner Fog "vector class library":
1067    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
1068    const_f64_as_f64x4!(MORE_BITS, 6.123233995736765886130E-17);
1069    const_f64_as_f64x4!(MORE_BITS_O2, 6.123233995736765886130E-17 * 0.5);
1070    const_f64_as_f64x4!(T3PO8, core::f64::consts::SQRT_2 + 1.0);
1071
1072    const_f64_as_f64x4!(P4atan, -8.750608600031904122785E-1);
1073    const_f64_as_f64x4!(P3atan, -1.615753718733365076637E1);
1074    const_f64_as_f64x4!(P2atan, -7.500855792314704667340E1);
1075    const_f64_as_f64x4!(P1atan, -1.228866684490136173410E2);
1076    const_f64_as_f64x4!(P0atan, -6.485021904942025371773E1);
1077
1078    const_f64_as_f64x4!(Q4atan, 2.485846490142306297962E1);
1079    const_f64_as_f64x4!(Q3atan, 1.650270098316988542046E2);
1080    const_f64_as_f64x4!(Q2atan, 4.328810604912902668951E2);
1081    const_f64_as_f64x4!(Q1atan, 4.853903996359136964868E2);
1082    const_f64_as_f64x4!(Q0atan, 1.945506571482613964425E2);
1083
1084    let y = self;
1085
1086    // move in first octant
1087    let x1 = x.abs();
1088    let y1 = y.abs();
1089    let swapxy = y1.simd_gt(x1);
1090    // swap x and y if y1 > x1
1091    let mut x2 = swapxy.blend(y1, x1);
1092    let mut y2 = swapxy.blend(x1, y1);
1093
1094    // check for special case: x and y are both +/- INF
1095    let both_infinite = x.is_inf() & y.is_inf();
1096    if both_infinite.any() {
1097      let minus_one = -Self::ONE;
1098      x2 = both_infinite.blend(x2 & minus_one, x2);
1099      y2 = both_infinite.blend(y2 & minus_one, y2);
1100    }
1101
1102    // x = y = 0 gives NAN here
1103    let t = y2 / x2;
1104
1105    // small:  t < 0.66
1106    // medium: t <= t <= 2.4142 (1+sqrt(2))
1107    // big:    t > 2.4142
1108    let notbig = t.simd_le(T3PO8);
1109    let notsmal = t.simd_ge(Self::splat(0.66));
1110
1111    let mut s = notbig.blend(Self::FRAC_PI_4, Self::FRAC_PI_2);
1112    s = notsmal & s;
1113    let mut fac = notbig.blend(MORE_BITS_O2, MORE_BITS);
1114    fac = notsmal & fac;
1115
1116    // small:  z = t / 1.0;
1117    // medium: z = (t-1.0) / (t+1.0);
1118    // big:    z = -1.0 / t;
1119    let mut a = notbig & t;
1120    a = notsmal.blend(a - Self::ONE, a);
1121    let mut b = notbig & Self::ONE;
1122    b = notsmal.blend(b + t, b);
1123    let z = a / b;
1124
1125    let zz = z * z;
1126
1127    let px = polynomial_4!(zz, P0atan, P1atan, P2atan, P3atan, P4atan);
1128    let qx = polynomial_5n!(zz, Q0atan, Q1atan, Q2atan, Q3atan, Q4atan);
1129
1130    let mut re = (px / qx).mul_add(z * zz, z);
1131    re += s + fac;
1132
1133    // move back in place
1134    re = swapxy.blend(Self::FRAC_PI_2 - re, re);
1135    re = ((x | y).simd_eq(Self::ZERO)).blend(Self::ZERO, re);
1136    re = (x.sign_bit()).blend(Self::PI - re, re);
1137
1138    // get sign bit
1139    re = (y.sign_bit()).blend(-re, re);
1140
1141    re
1142  }
1143
1144  #[inline]
1145  #[must_use]
1146  pub fn sin_cos(self) -> (Self, Self) {
1147    // Based on the Agner Fog "vector class library":
1148    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
1149
1150    const_f64_as_f64x4!(P0sin, -1.66666666666666307295E-1);
1151    const_f64_as_f64x4!(P1sin, 8.33333333332211858878E-3);
1152    const_f64_as_f64x4!(P2sin, -1.98412698295895385996E-4);
1153    const_f64_as_f64x4!(P3sin, 2.75573136213857245213E-6);
1154    const_f64_as_f64x4!(P4sin, -2.50507477628578072866E-8);
1155    const_f64_as_f64x4!(P5sin, 1.58962301576546568060E-10);
1156
1157    const_f64_as_f64x4!(P0cos, 4.16666666666665929218E-2);
1158    const_f64_as_f64x4!(P1cos, -1.38888888888730564116E-3);
1159    const_f64_as_f64x4!(P2cos, 2.48015872888517045348E-5);
1160    const_f64_as_f64x4!(P3cos, -2.75573141792967388112E-7);
1161    const_f64_as_f64x4!(P4cos, 2.08757008419747316778E-9);
1162    const_f64_as_f64x4!(P5cos, -1.13585365213876817300E-11);
1163
1164    const_f64_as_f64x4!(DP1, 7.853981554508209228515625E-1 * 2.);
1165    const_f64_as_f64x4!(DP2, 7.94662735614792836714E-9 * 2.);
1166    const_f64_as_f64x4!(DP3, 3.06161699786838294307E-17 * 2.);
1167
1168    const_f64_as_f64x4!(TWO_OVER_PI, 2.0 / core::f64::consts::PI);
1169
1170    let xa = self.abs();
1171
1172    let y = (xa * TWO_OVER_PI).round();
1173    let q = y.round_int();
1174
1175    let x = y.mul_neg_add(DP3, y.mul_neg_add(DP2, y.mul_neg_add(DP1, xa)));
1176
1177    let x2 = x * x;
1178    let mut s = polynomial_5!(x2, P0sin, P1sin, P2sin, P3sin, P4sin, P5sin);
1179    let mut c = polynomial_5!(x2, P0cos, P1cos, P2cos, P3cos, P4cos, P5cos);
1180    s = (x * x2).mul_add(s, x);
1181    c =
1182      (x2 * x2).mul_add(c, x2.mul_neg_add(f64x4::from(0.5), f64x4::from(1.0)));
1183
1184    let swap = !((q & i64x4::from(1)).simd_eq(i64x4::from(0)));
1185
1186    let mut overflow: f64x4 = cast(q.simd_gt(i64x4::from(0x80000000000000)));
1187    overflow &= xa.is_finite();
1188    s = overflow.blend(f64x4::from(0.0), s);
1189    c = overflow.blend(f64x4::from(1.0), c);
1190
1191    // calc sin
1192    let mut sin1 = cast::<_, f64x4>(swap).blend(c, s);
1193    let sign_sin: i64x4 = (q << 62) ^ cast::<_, i64x4>(self);
1194    sin1 = sin1.flip_signs(cast(sign_sin));
1195
1196    // calc cos
1197    let mut cos1 = cast::<_, f64x4>(swap).blend(s, c);
1198    let sign_cos: i64x4 = ((q + i64x4::from(1)) & i64x4::from(2)) << 62;
1199    cos1 ^= cast::<_, f64x4>(sign_cos);
1200
1201    (sin1, cos1)
1202  }
1203  #[inline]
1204  #[must_use]
1205  pub fn sin(self) -> Self {
1206    let (s, _) = self.sin_cos();
1207    s
1208  }
1209  #[inline]
1210  #[must_use]
1211  pub fn cos(self) -> Self {
1212    let (_, c) = self.sin_cos();
1213    c
1214  }
1215  #[inline]
1216  #[must_use]
1217  pub fn tan(self) -> Self {
1218    let (s, c) = self.sin_cos();
1219    s / c
1220  }
1221  #[inline]
1222  #[must_use]
1223  pub fn to_degrees(self) -> Self {
1224    const_f64_as_f64x4!(RAD_TO_DEG_RATIO, 180.0_f64 / core::f64::consts::PI);
1225    self * RAD_TO_DEG_RATIO
1226  }
1227  #[inline]
1228  #[must_use]
1229  pub fn to_radians(self) -> Self {
1230    const_f64_as_f64x4!(DEG_TO_RAD_RATIO, core::f64::consts::PI / 180.0_f64);
1231    self * DEG_TO_RAD_RATIO
1232  }
1233  #[inline]
1234  #[must_use]
1235  pub fn sqrt(self) -> Self {
1236    pick! {
1237      if #[cfg(target_feature="avx")] {
1238        Self { avx: sqrt_m256d(self.avx) }
1239      } else {
1240        Self {
1241          a : self.a.sqrt(),
1242          b : self.b.sqrt(),
1243        }
1244      }
1245    }
1246  }
1247  #[inline]
1248  #[must_use]
1249  #[doc(alias("movemask", "move_mask"))]
1250  pub fn to_bitmask(self) -> u32 {
1251    pick! {
1252      if #[cfg(target_feature="avx")] {
1253        move_mask_m256d(self.avx) as u32
1254      } else {
1255        (self.b.to_bitmask() << 2) | self.a.to_bitmask()
1256      }
1257    }
1258  }
1259  #[inline]
1260  #[must_use]
1261  pub fn any(self) -> bool {
1262    pick! {
1263      if #[cfg(target_feature="avx")] {
1264        move_mask_m256d(self.avx) != 0
1265      } else {
1266        self.a.any() || self.b.any()
1267      }
1268    }
1269  }
1270  #[inline]
1271  #[must_use]
1272  pub fn all(self) -> bool {
1273    pick! {
1274      if #[cfg(target_feature="avx")] {
1275        move_mask_m256d(self.avx) == 0b1111
1276      } else {
1277        self.a.all() && self.b.all()
1278      }
1279    }
1280  }
1281  #[inline]
1282  #[must_use]
1283  pub fn none(self) -> bool {
1284    !self.any()
1285  }
1286
1287  #[inline]
1288  fn vm_pow2n(self) -> Self {
1289    const_f64_as_f64x4!(pow2_52, 4503599627370496.0);
1290    const_f64_as_f64x4!(bias, 1023.0);
1291    let a = self + (bias + pow2_52);
1292    let c = cast::<_, i64x4>(a) << 52;
1293    cast::<_, f64x4>(c)
1294  }
1295
1296  /// Calculate the exponent of a packed `f64x4`
1297  #[inline]
1298  #[must_use]
1299  pub fn exp(self) -> Self {
1300    const_f64_as_f64x4!(P2, 1.0 / 2.0);
1301    const_f64_as_f64x4!(P3, 1.0 / 6.0);
1302    const_f64_as_f64x4!(P4, 1. / 24.);
1303    const_f64_as_f64x4!(P5, 1. / 120.);
1304    const_f64_as_f64x4!(P6, 1. / 720.);
1305    const_f64_as_f64x4!(P7, 1. / 5040.);
1306    const_f64_as_f64x4!(P8, 1. / 40320.);
1307    const_f64_as_f64x4!(P9, 1. / 362880.);
1308    const_f64_as_f64x4!(P10, 1. / 3628800.);
1309    const_f64_as_f64x4!(P11, 1. / 39916800.);
1310    const_f64_as_f64x4!(P12, 1. / 479001600.);
1311    const_f64_as_f64x4!(P13, 1. / 6227020800.);
1312    const_f64_as_f64x4!(LN2D_HI, 0.693145751953125);
1313    const_f64_as_f64x4!(LN2D_LO, 1.42860682030941723212E-6);
1314    let max_x = f64x4::from(708.39);
1315    let r = (self * Self::LOG2_E).round();
1316    let x = r.mul_neg_add(LN2D_HI, self);
1317    let x = r.mul_neg_add(LN2D_LO, x);
1318    let z =
1319      polynomial_13!(x, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12, P13);
1320    let n2 = Self::vm_pow2n(r);
1321    let z = (z + Self::ONE) * n2;
1322    // check for overflow
1323    let in_range = self.abs().simd_lt(max_x);
1324    let in_range = in_range & self.is_finite();
1325    in_range.blend(z, Self::ZERO)
1326  }
1327
1328  #[inline]
1329  fn exponent(self) -> f64x4 {
1330    const_f64_as_f64x4!(pow2_52, 4503599627370496.0);
1331    const_f64_as_f64x4!(bias, 1023.0);
1332    let a = cast::<_, u64x4>(self);
1333    let b = a >> 52;
1334    let c = b | cast::<_, u64x4>(pow2_52);
1335    let d = cast::<_, f64x4>(c);
1336    let e = d - (pow2_52 + bias);
1337    e
1338  }
1339
1340  #[inline]
1341  fn fraction_2(self) -> Self {
1342    let t1 = cast::<_, u64x4>(self);
1343    let t2 = cast::<_, u64x4>(
1344      (t1 & u64x4::from(0x000FFFFFFFFFFFFF)) | u64x4::from(0x3FE0000000000000),
1345    );
1346    cast::<_, f64x4>(t2)
1347  }
1348  #[inline]
1349  fn is_zero_or_subnormal(self) -> Self {
1350    let t = cast::<_, i64x4>(self);
1351    let t = t & i64x4::splat(0x7FF0000000000000);
1352    i64x4::round_float(t.simd_eq(i64x4::splat(0)))
1353  }
1354  #[inline]
1355  fn infinity() -> Self {
1356    cast::<_, f64x4>(i64x4::splat(0x7FF0000000000000))
1357  }
1358  #[inline]
1359  fn nan_log() -> Self {
1360    cast::<_, f64x4>(i64x4::splat(0x7FF8000000000000 | 0x101 << 29))
1361  }
1362  #[inline]
1363  fn nan_pow() -> Self {
1364    cast::<_, f64x4>(i64x4::splat(0x7FF8000000000000 | 0x101 << 29))
1365  }
1366  #[inline]
1367  fn sign_bit(self) -> Self {
1368    let t1 = cast::<_, i64x4>(self);
1369    let t2 = t1 >> 63;
1370    !cast::<_, f64x4>(t2).simd_eq(f64x4::ZERO)
1371  }
1372
1373  /// horizontal add of all the elements of the vector
1374  #[inline]
1375  pub fn reduce_add(self) -> f64 {
1376    pick! {
1377      if #[cfg(target_feature="avx")] {
1378        // From https://stackoverflow.com/questions/49941645/get-sum-of-values-stored-in-m256d-with-sse-avx
1379        let lo = cast_to_m128d_from_m256d(self.avx);
1380        let hi = extract_m128d_from_m256d::<1>(self.avx);
1381        let lo = add_m128d(lo,hi);
1382        let hi64 = unpack_high_m128d(lo,lo);
1383        let sum = add_m128d_s(lo,hi64);
1384        get_f64_from_m128d_s(sum)
1385      } else {
1386        self.a.reduce_add() + self.b.reduce_add()
1387      }
1388    }
1389  }
1390
1391  /// Natural log (ln(x))
1392  #[inline]
1393  #[must_use]
1394  pub fn ln(self) -> Self {
1395    const_f64_as_f64x4!(HALF, 0.5);
1396    const_f64_as_f64x4!(P0, 7.70838733755885391666E0);
1397    const_f64_as_f64x4!(P1, 1.79368678507819816313E1);
1398    const_f64_as_f64x4!(P2, 1.44989225341610930846E1);
1399    const_f64_as_f64x4!(P3, 4.70579119878881725854E0);
1400    const_f64_as_f64x4!(P4, 4.97494994976747001425E-1);
1401    const_f64_as_f64x4!(P5, 1.01875663804580931796E-4);
1402
1403    const_f64_as_f64x4!(Q0, 2.31251620126765340583E1);
1404    const_f64_as_f64x4!(Q1, 7.11544750618563894466E1);
1405    const_f64_as_f64x4!(Q2, 8.29875266912776603211E1);
1406    const_f64_as_f64x4!(Q3, 4.52279145837532221105E1);
1407    const_f64_as_f64x4!(Q4, 1.12873587189167450590E1);
1408    const_f64_as_f64x4!(LN2F_HI, 0.693359375);
1409    const_f64_as_f64x4!(LN2F_LO, -2.12194440e-4);
1410    const_f64_as_f64x4!(VM_SQRT2, 1.414213562373095048801);
1411    const_f64_as_f64x4!(VM_SMALLEST_NORMAL, 1.17549435E-38);
1412
1413    let x1 = self;
1414    let x = Self::fraction_2(x1);
1415    let e = Self::exponent(x1);
1416    let mask = x.simd_gt(VM_SQRT2 * HALF);
1417    let x = (!mask).blend(x + x, x);
1418    let fe = mask.blend(e + Self::ONE, e);
1419    let x = x - Self::ONE;
1420    let px = polynomial_5!(x, P0, P1, P2, P3, P4, P5);
1421    let x2 = x * x;
1422    let px = x2 * x * px;
1423    let qx = polynomial_5n!(x, Q0, Q1, Q2, Q3, Q4);
1424    let res = px / qx;
1425    let res = fe.mul_add(LN2F_LO, res);
1426    let res = res + x2.mul_neg_add(HALF, x);
1427    let res = fe.mul_add(LN2F_HI, res);
1428    let overflow = !self.is_finite();
1429    let underflow = x1.simd_lt(VM_SMALLEST_NORMAL);
1430    let mask = overflow | underflow;
1431    if !mask.any() {
1432      res
1433    } else {
1434      let is_zero = self.is_zero_or_subnormal();
1435      let res = underflow.blend(Self::nan_log(), res);
1436      let res = is_zero.blend(Self::infinity(), res);
1437      let res = overflow.blend(self, res);
1438      res
1439    }
1440  }
1441
1442  #[inline]
1443  #[must_use]
1444  pub fn log2(self) -> Self {
1445    Self::ln(self) * Self::LOG2_E
1446  }
1447  #[inline]
1448  #[must_use]
1449  pub fn log10(self) -> Self {
1450    Self::ln(self) * Self::LOG10_E
1451  }
1452
1453  #[inline]
1454  #[must_use]
1455  pub fn pow_f64x4(self, y: Self) -> Self {
1456    const_f64_as_f64x4!(ln2d_hi, 0.693145751953125);
1457    const_f64_as_f64x4!(ln2d_lo, 1.42860682030941723212E-6);
1458    const_f64_as_f64x4!(P0log, 2.0039553499201281259648E1);
1459    const_f64_as_f64x4!(P1log, 5.7112963590585538103336E1);
1460    const_f64_as_f64x4!(P2log, 6.0949667980987787057556E1);
1461    const_f64_as_f64x4!(P3log, 2.9911919328553073277375E1);
1462    const_f64_as_f64x4!(P4log, 6.5787325942061044846969E0);
1463    const_f64_as_f64x4!(P5log, 4.9854102823193375972212E-1);
1464    const_f64_as_f64x4!(P6log, 4.5270000862445199635215E-5);
1465    const_f64_as_f64x4!(Q0log, 6.0118660497603843919306E1);
1466    const_f64_as_f64x4!(Q1log, 2.1642788614495947685003E2);
1467    const_f64_as_f64x4!(Q2log, 3.0909872225312059774938E2);
1468    const_f64_as_f64x4!(Q3log, 2.2176239823732856465394E2);
1469    const_f64_as_f64x4!(Q4log, 8.3047565967967209469434E1);
1470    const_f64_as_f64x4!(Q5log, 1.5062909083469192043167E1);
1471
1472    // Taylor expansion constants
1473    const_f64_as_f64x4!(p2, 1.0 / 2.0); // coefficients for Taylor expansion of exp
1474    const_f64_as_f64x4!(p3, 1.0 / 6.0);
1475    const_f64_as_f64x4!(p4, 1.0 / 24.0);
1476    const_f64_as_f64x4!(p5, 1.0 / 120.0);
1477    const_f64_as_f64x4!(p6, 1.0 / 720.0);
1478    const_f64_as_f64x4!(p7, 1.0 / 5040.0);
1479    const_f64_as_f64x4!(p8, 1.0 / 40320.0);
1480    const_f64_as_f64x4!(p9, 1.0 / 362880.0);
1481    const_f64_as_f64x4!(p10, 1.0 / 3628800.0);
1482    const_f64_as_f64x4!(p11, 1.0 / 39916800.0);
1483    const_f64_as_f64x4!(p12, 1.0 / 479001600.0);
1484    const_f64_as_f64x4!(p13, 1.0 / 6227020800.0);
1485
1486    let x1 = self.abs();
1487    let x = x1.fraction_2();
1488    let mask = x.simd_gt(f64x4::SQRT_2 * f64x4::HALF);
1489    let x = (!mask).blend(x + x, x);
1490    let x = x - f64x4::ONE;
1491    let x2 = x * x;
1492    let px = polynomial_6!(x, P0log, P1log, P2log, P3log, P4log, P5log, P6log);
1493    let px = px * x * x2;
1494    let qx = polynomial_6n!(x, Q0log, Q1log, Q2log, Q3log, Q4log, Q5log);
1495    let lg1 = px / qx;
1496
1497    let ef = x1.exponent();
1498    let ef = mask.blend(ef + f64x4::ONE, ef);
1499    let e1 = (ef * y).round();
1500    let yr = ef.mul_sub(y, e1);
1501
1502    let lg = f64x4::HALF.mul_neg_add(x2, x) + lg1;
1503    let x2err = (f64x4::HALF * x).mul_sub(x, f64x4::HALF * x2);
1504    let lg_err = f64x4::HALF.mul_add(x2, lg - x) - lg1;
1505
1506    let e2 = (lg * y * f64x4::LOG2_E).round();
1507    let v = lg.mul_sub(y, e2 * ln2d_hi);
1508    let v = e2.mul_neg_add(ln2d_lo, v);
1509    let v = v - (lg_err + x2err).mul_sub(y, yr * f64x4::LN_2);
1510
1511    let x = v;
1512    let e3 = (x * f64x4::LOG2_E).round();
1513    let x = e3.mul_neg_add(f64x4::LN_2, x);
1514    let z =
1515      polynomial_13m!(x, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13)
1516        + f64x4::ONE;
1517    let ee = e1 + e2 + e3;
1518    let ei = cast::<_, i64x4>(ee.round_int());
1519    let ej = cast::<_, i64x4>(ei + (cast::<_, i64x4>(z) >> 52));
1520
1521    let overflow = cast::<_, f64x4>(!ej.simd_lt(i64x4::splat(0x07FF)))
1522      | ee.simd_gt(f64x4::splat(3000.0));
1523    let underflow = cast::<_, f64x4>(!ej.simd_gt(i64x4::splat(0x000)))
1524      | ee.simd_lt(f64x4::splat(-3000.0));
1525
1526    // Add exponent by integer addition
1527    let z = cast::<_, f64x4>(cast::<_, i64x4>(z) + (ei << 52));
1528
1529    // Check for overflow/underflow
1530    let z = if (overflow | underflow).any() {
1531      let z = underflow.blend(f64x4::ZERO, z);
1532      overflow.blend(Self::infinity(), z)
1533    } else {
1534      z
1535    };
1536
1537    // Check for self == 0
1538    let x_zero = self.is_zero_or_subnormal();
1539    let z = x_zero.blend(
1540      y.simd_lt(f64x4::ZERO).blend(
1541        Self::infinity(),
1542        y.simd_eq(f64x4::ZERO).blend(f64x4::ONE, f64x4::ZERO),
1543      ),
1544      z,
1545    );
1546
1547    let x_sign = self.sign_bit();
1548
1549    let z = if x_sign.any() {
1550      // Y into an integer
1551      let yi = y.simd_eq(y.round());
1552      // Is y odd?
1553      let y_odd = cast::<_, i64x4>(y.round_int() << 63).round_float();
1554      let z1 =
1555        yi.blend(z | y_odd, self.simd_eq(Self::ZERO).blend(z, Self::nan_pow()));
1556      x_sign.blend(z1, z)
1557    } else {
1558      z
1559    };
1560
1561    let x_finite = self.is_finite();
1562    let y_finite = y.is_finite();
1563    let e_finite = ee.is_finite();
1564
1565    if (x_finite & y_finite & (e_finite | x_zero)).all() {
1566      return z;
1567    }
1568
1569    (self.is_nan() | y.is_nan()).blend(self + y, z)
1570  }
1571  #[inline]
1572  pub fn powf(self, y: f64) -> Self {
1573    Self::pow_f64x4(self, f64x4::splat(y))
1574  }
1575
1576  #[inline]
1577  pub fn to_array(self) -> [f64; 4] {
1578    cast(self)
1579  }
1580
1581  #[inline]
1582  pub fn as_array(&self) -> &[f64; 4] {
1583    cast_ref(self)
1584  }
1585
1586  #[inline]
1587  pub fn as_mut_array(&mut self) -> &mut [f64; 4] {
1588    cast_mut(self)
1589  }
1590
1591  #[inline]
1592  pub fn from_i32x4(v: i32x4) -> Self {
1593    pick! {
1594      if #[cfg(target_feature="avx")] {
1595        Self { avx: convert_to_m256d_from_i32_m128i(v.sse) }
1596      } else {
1597        Self::new([
1598          v.as_array()[0] as f64,
1599          v.as_array()[1] as f64,
1600          v.as_array()[2] as f64,
1601          v.as_array()[3] as f64,
1602        ])
1603      }
1604    }
1605  }
1606}
1607
1608impl From<i32x4> for f64x4 {
1609  #[inline]
1610  fn from(v: i32x4) -> Self {
1611    Self::from_i32x4(v)
1612  }
1613}
1614
1615impl Not for f64x4 {
1616  type Output = Self;
1617  #[inline]
1618  fn not(self) -> Self {
1619    pick! {
1620      if #[cfg(target_feature="avx")] {
1621        Self { avx: self.avx.not()  }
1622      } else {
1623        Self {
1624          a : self.a.not(),
1625          b : self.b.not(),
1626        }
1627      }
1628    }
1629  }
1630}
wide/f64x4_.rs

wide/
f64x4_.rs