wide/
f64x8_.rs

1use super::*;
2
3pick! {
4  if #[cfg(target_feature="avx512f")] {
5    #[derive(Default, Clone, Copy, PartialEq)]
6    #[repr(C, align(64))]
7    pub struct f64x8 { pub(crate) avx512: m512d }
8  } else {
9    #[derive(Default, Clone, Copy, PartialEq)]
10    #[repr(C, align(64))]
11    pub struct f64x8 { pub(crate) a : f64x4, pub(crate) b : f64x4 }
12  }
13}
14
15macro_rules! const_f64_as_f64x8 {
16  ($i:ident, $f:expr) => {
17    #[allow(non_upper_case_globals)]
18    pub const $i: f64x8 = f64x8::new([$f; 8]);
19  };
20}
21
22unsafe impl Zeroable for f64x8 {}
23unsafe impl Pod for f64x8 {}
24
25impl AlignTo for f64x8 {
26  type Elem = f64;
27}
28
29impl Add for f64x8 {
30  type Output = Self;
31  #[inline]
32  fn add(self, rhs: Self) -> Self::Output {
33    pick! {
34      if #[cfg(target_feature="avx512f")] {
35        Self { avx512: add_m512d(self.avx512, rhs.avx512) }
36      } else {
37        Self {
38          a : self.a.add(rhs.a),
39          b : self.b.add(rhs.b),
40        }
41      }
42    }
43  }
44}
45
46impl Sub for f64x8 {
47  type Output = Self;
48  #[inline]
49  fn sub(self, rhs: Self) -> Self::Output {
50    pick! {
51      if #[cfg(target_feature="avx512f")] {
52        Self { avx512: sub_m512d(self.avx512, rhs.avx512) }
53      } else {
54        Self {
55          a : self.a.sub(rhs.a),
56          b : self.b.sub(rhs.b),
57        }
58      }
59    }
60  }
61}
62
63impl Mul for f64x8 {
64  type Output = Self;
65  #[inline]
66  fn mul(self, rhs: Self) -> Self::Output {
67    pick! {
68      if #[cfg(target_feature="avx512f")] {
69        Self { avx512: mul_m512d(self.avx512, rhs.avx512) }
70      } else {
71        Self { a: self.a.mul(rhs.a), b: self.b.mul(rhs.b) }
72      }
73    }
74  }
75}
76
77impl Div for f64x8 {
78  type Output = Self;
79  #[inline]
80  fn div(self, rhs: Self) -> Self::Output {
81    pick! {
82      if #[cfg(target_feature="avx512f")] {
83        Self { avx512: div_m512d(self.avx512, rhs.avx512) }
84      } else {
85        Self { a: self.a.div(rhs.a), b: self.b.div(rhs.b) }
86      }
87    }
88  }
89}
90
91impl Neg for f64x8 {
92  type Output = Self;
93  #[inline]
94  fn neg(self) -> Self::Output {
95    pick! {
96      if #[cfg(target_feature="avx512f")] {
97        Self { avx512: bitxor_m512d(self.avx512, Self::splat(-0.0).avx512) }
98      } else {
99        Self {
100          a : self.a.neg(),
101          b : self.b.neg(),
102        }
103      }
104    }
105  }
106}
107
108impl Add<f64> for f64x8 {
109  type Output = Self;
110  #[inline]
111  fn add(self, rhs: f64) -> Self::Output {
112    self.add(Self::splat(rhs))
113  }
114}
115
116impl Sub<f64> for f64x8 {
117  type Output = Self;
118  #[inline]
119  fn sub(self, rhs: f64) -> Self::Output {
120    self.sub(Self::splat(rhs))
121  }
122}
123
124impl Mul<f64> for f64x8 {
125  type Output = Self;
126  #[inline]
127  fn mul(self, rhs: f64) -> Self::Output {
128    self.mul(Self::splat(rhs))
129  }
130}
131
132impl Div<f64> for f64x8 {
133  type Output = Self;
134  #[inline]
135  fn div(self, rhs: f64) -> Self::Output {
136    self.div(Self::splat(rhs))
137  }
138}
139
140impl Add<f64x8> for f64 {
141  type Output = f64x8;
142  #[inline]
143  fn add(self, rhs: f64x8) -> Self::Output {
144    f64x8::splat(self).add(rhs)
145  }
146}
147
148impl Sub<f64x8> for f64 {
149  type Output = f64x8;
150  #[inline]
151  fn sub(self, rhs: f64x8) -> Self::Output {
152    f64x8::splat(self).sub(rhs)
153  }
154}
155
156impl Mul<f64x8> for f64 {
157  type Output = f64x8;
158  #[inline]
159  fn mul(self, rhs: f64x8) -> Self::Output {
160    f64x8::splat(self).mul(rhs)
161  }
162}
163
164impl Div<f64x8> for f64 {
165  type Output = f64x8;
166  #[inline]
167  fn div(self, rhs: f64x8) -> Self::Output {
168    f64x8::splat(self).div(rhs)
169  }
170}
171
172impl BitAnd for f64x8 {
173  type Output = Self;
174  #[inline]
175  fn bitand(self, rhs: Self) -> Self::Output {
176    pick! {
177      if #[cfg(target_feature="avx512f")] {
178        Self { avx512: bitand_m512d(self.avx512, rhs.avx512) }
179      } else {
180        Self {
181          a : self.a.bitand(rhs.a),
182          b : self.b.bitand(rhs.b),
183        }
184      }
185    }
186  }
187}
188
189impl BitOr for f64x8 {
190  type Output = Self;
191  #[inline]
192  fn bitor(self, rhs: Self) -> Self::Output {
193    pick! {
194    if #[cfg(target_feature="avx512f")] {
195        Self { avx512: bitor_m512d(self.avx512, rhs.avx512) }
196      } else {
197        Self {
198          a : self.a.bitor(rhs.a),
199          b : self.b.bitor(rhs.b),
200        }
201      }
202    }
203  }
204}
205
206impl BitXor for f64x8 {
207  type Output = Self;
208  #[inline]
209  fn bitxor(self, rhs: Self) -> Self::Output {
210    pick! {
211      if #[cfg(target_feature="avx512f")] {
212        Self { avx512: bitxor_m512d(self.avx512, rhs.avx512) }
213      } else {
214        Self {
215          a : self.a.bitxor(rhs.a),
216          b : self.b.bitxor(rhs.b),
217        }
218      }
219    }
220  }
221}
222
223impl CmpEq for f64x8 {
224  type Output = Self;
225  #[inline]
226  fn simd_eq(self, rhs: Self) -> Self::Output {
227    pick! {
228      if #[cfg(target_feature="avx512f")] {
229        Self { avx512: cmp_op_mask_m512d::<{cmp_op!(EqualOrdered)}>(self.avx512, rhs.avx512) }
230      } else {
231        Self {
232          a : self.a.simd_eq(rhs.a),
233          b : self.b.simd_eq(rhs.b),
234        }
235      }
236    }
237  }
238}
239
240impl CmpGt for f64x8 {
241  type Output = Self;
242  #[inline]
243  fn simd_gt(self, rhs: Self) -> Self::Output {
244    pick! {
245      if #[cfg(target_feature="avx512f")] {
246        Self { avx512: cmp_op_mask_m512d::<{cmp_op!(GreaterThanOrdered)}>(self.avx512, rhs.avx512) }
247      } else {
248        Self {
249          a : self.a.simd_gt(rhs.a),
250          b : self.b.simd_gt(rhs.b),
251        }
252      }
253    }
254  }
255}
256
257impl CmpGe for f64x8 {
258  type Output = Self;
259  #[inline]
260  fn simd_ge(self, rhs: Self) -> Self::Output {
261    pick! {
262      if #[cfg(target_feature="avx512f")] {
263        Self { avx512: cmp_op_mask_m512d::<{cmp_op!(GreaterEqualOrdered)}>(self.avx512, rhs.avx512) }
264      } else {
265        Self {
266          a : self.a.simd_ge(rhs.a),
267          b : self.b.simd_ge(rhs.b),
268        }
269      }
270    }
271  }
272}
273
274impl CmpLt for f64x8 {
275  type Output = Self;
276  #[inline]
277  fn simd_lt(self, rhs: Self) -> Self::Output {
278    pick! {
279      if #[cfg(target_feature="avx512f")] {
280        Self { avx512: cmp_op_mask_m512d::<{cmp_op!(LessThanOrdered)}>(self.avx512, rhs.avx512) }
281      } else {
282        Self {
283          a : self.a.simd_lt(rhs.a),
284          b : self.b.simd_lt(rhs.b),
285        }
286      }
287    }
288  }
289}
290
291impl CmpLe for f64x8 {
292  type Output = Self;
293  #[inline]
294  fn simd_le(self, rhs: Self) -> Self::Output {
295    pick! {
296      if #[cfg(target_feature="avx512f")] {
297        Self { avx512: cmp_op_mask_m512d::<{cmp_op!(LessEqualOrdered)}>(self.avx512, rhs.avx512) }
298      } else {
299        Self {
300          a : self.a.simd_le(rhs.a),
301          b : self.b.simd_le(rhs.b),
302        }
303      }
304    }
305  }
306}
307
308impl CmpNe for f64x8 {
309  type Output = Self;
310  #[inline]
311  fn simd_ne(self, rhs: Self) -> Self::Output {
312    pick! {
313      if #[cfg(target_feature="avx512f")] {
314        Self { avx512: cmp_op_mask_m512d::<{cmp_op!(NotEqualOrdered)}>(self.avx512, rhs.avx512) }
315      } else {
316        Self {
317          a : self.a.simd_ne(rhs.a),
318          b : self.b.simd_ne(rhs.b),
319        }
320      }
321    }
322  }
323}
324
325impl f64x8 {
326  const_f64_as_f64x8!(ONE, 1.0);
327  const_f64_as_f64x8!(ZERO, 0.0);
328  const_f64_as_f64x8!(HALF, 0.5);
329  const_f64_as_f64x8!(E, core::f64::consts::E);
330  const_f64_as_f64x8!(FRAC_1_PI, core::f64::consts::FRAC_1_PI);
331  const_f64_as_f64x8!(FRAC_2_PI, core::f64::consts::FRAC_2_PI);
332  const_f64_as_f64x8!(FRAC_2_SQRT_PI, core::f64::consts::FRAC_2_SQRT_PI);
333  const_f64_as_f64x8!(FRAC_1_SQRT_2, core::f64::consts::FRAC_1_SQRT_2);
334  const_f64_as_f64x8!(FRAC_PI_2, core::f64::consts::FRAC_PI_2);
335  const_f64_as_f64x8!(FRAC_PI_3, core::f64::consts::FRAC_PI_3);
336  const_f64_as_f64x8!(FRAC_PI_4, core::f64::consts::FRAC_PI_4);
337  const_f64_as_f64x8!(FRAC_PI_6, core::f64::consts::FRAC_PI_6);
338  const_f64_as_f64x8!(FRAC_PI_8, core::f64::consts::FRAC_PI_8);
339  const_f64_as_f64x8!(LN_2, core::f64::consts::LN_2);
340  const_f64_as_f64x8!(LN_10, core::f64::consts::LN_10);
341  const_f64_as_f64x8!(LOG2_E, core::f64::consts::LOG2_E);
342  const_f64_as_f64x8!(LOG10_E, core::f64::consts::LOG10_E);
343  const_f64_as_f64x8!(LOG10_2, core::f64::consts::LOG10_2);
344  const_f64_as_f64x8!(LOG2_10, core::f64::consts::LOG2_10);
345  const_f64_as_f64x8!(PI, core::f64::consts::PI);
346  const_f64_as_f64x8!(SQRT_2, core::f64::consts::SQRT_2);
347  const_f64_as_f64x8!(TAU, core::f64::consts::TAU);
348
349  #[inline]
350  #[must_use]
351  pub const fn new(array: [f64; 8]) -> Self {
352    unsafe { core::mem::transmute(array) }
353  }
354
355  #[inline]
356  #[must_use]
357  pub fn blend(self, t: Self, f: Self) -> Self {
358    pick! {
359      if #[cfg(target_feature="avx512f")] {
360        Self { avx512: blend_varying_m512d(f.avx512, t.avx512, movepi64_mask_m512d(self.avx512)) }
361      } else {
362        Self {
363          a : self.a.blend(t.a, f.a),
364          b : self.b.blend(t.b, f.b),
365        }
366      }
367    }
368  }
369
370  #[inline]
371  #[must_use]
372  pub fn abs(self) -> Self {
373    pick! {
374      if #[cfg(target_feature="avx512f")] {
375        let non_sign_bits = f64x8::from(f64::from_bits(i64::MAX as u64));
376        self & non_sign_bits
377      } else {
378        Self {
379          a: self.a.abs(),
380          b: self.b.abs(),
381        }
382      }
383    }
384  }
385
386  #[inline]
387  #[must_use]
388  pub fn floor(self) -> Self {
389    pick! {
390      if #[cfg(target_feature="avx512f")] {
391        Self { avx512: round_m512d::<{round_op!(NegInf)}>(self.avx512) }
392      } else {
393        Self {
394          a : self.a.floor(),
395          b : self.b.floor(),
396        }
397      }
398    }
399  }
400  #[inline]
401  #[must_use]
402  pub fn ceil(self) -> Self {
403    pick! {
404      if #[cfg(target_feature="avx512f")] {
405        Self { avx512: round_m512d::<{round_op!(PosInf)}>(self.avx512) }
406      } else {
407        Self {
408          a : self.a.ceil(),
409          b : self.b.ceil(),
410        }
411      }
412    }
413  }
414
415  #[inline]
416  #[must_use]
417  pub fn fast_max(self, rhs: Self) -> Self {
418    pick! {
419      if #[cfg(target_feature="avx512f")] {
420        Self { avx512: max_m512d(self.avx512, rhs.avx512) }
421      } else {
422        Self {
423          a : self.a.fast_max(rhs.a),
424          b : self.b.fast_max(rhs.b),
425        }
426      }
427    }
428  }
429
430  #[inline]
431  #[must_use]
432  pub fn max(self, rhs: Self) -> Self {
433    pick! {
434      if #[cfg(target_feature="avx512f")] {
435        rhs.is_nan().blend(self, Self { avx512: max_m512d(self.avx512, rhs.avx512) })
436      } else {
437        Self {
438          a: self.a.max(rhs.a),
439          b: self.b.max(rhs.b),
440        }
441      }
442    }
443  }
444
445  #[inline]
446  #[must_use]
447  pub fn fast_min(self, rhs: Self) -> Self {
448    pick! {
449      if #[cfg(target_feature="avx512f")] {
450        Self { avx512: min_m512d(self.avx512, rhs.avx512) }
451      } else {
452        Self {
453          a : self.a.fast_min(rhs.a),
454          b : self.b.fast_min(rhs.b),
455        }
456      }
457    }
458  }
459
460  #[inline]
461  #[must_use]
462  pub fn min(self, rhs: Self) -> Self {
463    pick! {
464      if #[cfg(target_feature="avx512f")] {
465        rhs.is_nan().blend(self, Self { avx512: min_m512d(self.avx512, rhs.avx512) })
466      } else {
467        Self {
468          a: self.a.min(rhs.a),
469          b: self.b.min(rhs.b),
470        }
471      }
472    }
473  }
474
475  #[inline]
476  #[must_use]
477  pub fn is_nan(self) -> Self {
478    pick! {
479      if #[cfg(target_feature="avx512f")] {
480        Self { avx512: cmp_op_mask_m512d::<{cmp_op!(Unordered)}>(self.avx512, self.avx512) }
481      } else {
482        Self {
483          a: self.a.is_nan(),
484          b: self.b.is_nan(),
485        }
486      }
487    }
488  }
489
490  #[inline]
491  #[must_use]
492  pub fn is_finite(self) -> Self {
493    let shifted_exp_mask = u64x8::splat(0xFFE0000000000000);
494    let u: u64x8 = cast(self);
495    let shift_u = u << 1_u64;
496    let out = !(shift_u & shifted_exp_mask).simd_eq(shifted_exp_mask);
497    cast(out)
498  }
499
500  #[inline]
501  #[must_use]
502  pub fn is_inf(self) -> Self {
503    let shifted_inf = u64x8::from(0xFFE0000000000000);
504    let u: u64x8 = cast(self);
505    let shift_u = u << 1_u64;
506    let out = (shift_u).simd_eq(shifted_inf);
507    cast(out)
508  }
509
510  #[inline]
511  #[must_use]
512  pub fn round(self) -> Self {
513    pick! {
514      if #[cfg(target_feature="avx512f")] {
515        Self { avx512: round_m512d::<{round_op!(Nearest)}>(self.avx512) }
516      } else {
517        Self {
518          a: self.a.round(),
519          b: self.b.round(),
520        }
521      }
522    }
523  }
524
525  #[inline]
526  #[must_use]
527  pub fn round_int(self) -> i64x8 {
528    let rounded: [f64; 8] = cast(self.round());
529    cast([
530      rounded[0] as i64,
531      rounded[1] as i64,
532      rounded[2] as i64,
533      rounded[3] as i64,
534      rounded[4] as i64,
535      rounded[5] as i64,
536      rounded[6] as i64,
537      rounded[7] as i64,
538    ])
539  }
540
541  /// Performs a multiply-add operation: `self * m + a`
542  ///
543  /// When hardware FMA support is available, this computes the result with a
544  /// single rounding operation. Without FMA support, it falls back to separate
545  /// multiply and add operations with two roundings.
546  ///
547  /// # Platform-specific behavior
548  /// - On `x86`/`x86_64` with AVX-512F+FMA: Uses 512-bit `vfmadd` (single
549  ///   rounding, best accuracy)
550  /// - On `x86`/`x86_64` with AVX-512F only: Uses `(self * m) + a` (two
551  ///   roundings)
552  /// - Other platforms: Delegates to [`f64x4`] (inherits its FMA behavior)
553  ///
554  /// # Examples
555  /// ```
556  /// # use wide::f64x8;
557  /// let a = f64x8::from([1.0; 8]);
558  /// let b = f64x8::from([2.0; 8]);
559  /// let c = f64x8::from([10.0; 8]);
560  ///
561  /// let result = a.mul_add(b, c);
562  ///
563  /// let expected = f64x8::from([12.0; 8]);
564  /// assert_eq!(result, expected);
565  /// ```
566  #[inline]
567  #[must_use]
568  pub fn mul_add(self, m: Self, a: Self) -> Self {
569    pick! {
570      if #[cfg(all(target_feature="avx512f",target_feature="fma"))] {
571        Self { avx512: fused_mul_add_m512d(self.avx512, m.avx512, a.avx512) }
572      } else if #[cfg(target_feature="avx512f")] {
573        // still want to use 512 bit ops
574        (self * m) + a
575      } else {
576        Self {
577          a : self.a.mul_add(m.a, a.a),
578          b : self.b.mul_add(m.b, a.b),
579        }
580      }
581    }
582  }
583
584  /// Performs a multiply-subtract operation: `self * m - s`
585  ///
586  /// When hardware FMA support is available, this computes the result with a
587  /// single rounding operation. Without FMA support, it falls back to separate
588  /// multiply and subtract operations with two roundings.
589  ///
590  /// # Platform-specific behavior
591  /// - On `x86`/`x86_64` with AVX-512F+FMA: Uses 512-bit `vfmsub` (single
592  ///   rounding, best accuracy)
593  /// - On `x86`/`x86_64` with AVX-512F only: Uses `(self * m) - s` (two
594  ///   roundings)
595  /// - Other platforms: Delegates to [`f64x4`] (inherits its FMA behavior)
596  ///
597  /// # Examples
598  /// ```
599  /// # use wide::f64x8;
600  /// let a = f64x8::from([10.0; 8]);
601  /// let b = f64x8::from([3.0; 8]);
602  /// let c = f64x8::from([5.0; 8]);
603  ///
604  /// let result = a.mul_sub(b, c);
605  ///
606  /// let expected = f64x8::from([25.0; 8]);
607  /// assert_eq!(result, expected);
608  /// ```
609  #[inline]
610  #[must_use]
611  pub fn mul_sub(self, m: Self, s: Self) -> Self {
612    pick! {
613      if #[cfg(all(target_feature="avx512f",target_feature="fma"))] {
614        Self { avx512: fused_mul_sub_m512d(self.avx512, m.avx512, s.avx512) }
615      } else if #[cfg(target_feature="avx512f")] {
616        // still want to use 512 bit ops
617        (self * m) - s
618      } else {
619        Self {
620          a : self.a.mul_sub(m.a, s.a),
621          b : self.b.mul_sub(m.b, s.b),
622        }
623      }
624    }
625  }
626
627  /// Performs a negative multiply-add operation: `a - (self * m)`
628  ///
629  /// When hardware FMA support is available, this computes the result with a
630  /// single rounding operation. Without FMA support, it falls back to separate
631  /// operations with two roundings.
632  ///
633  /// # Platform-specific behavior
634  /// - On `x86`/`x86_64` with AVX-512F+FMA: Uses 512-bit `vfnmadd` (single
635  ///   rounding, best accuracy)
636  /// - On `x86`/`x86_64` with AVX-512F only: Uses `a - (self * m)` (two
637  ///   roundings)
638  /// - Other platforms: Delegates to [`f64x4`] (inherits its FMA behavior)
639  ///
640  /// # Examples
641  /// ```
642  /// # use wide::f64x8;
643  /// let a = f64x8::from([4.0; 8]);
644  /// let b = f64x8::from([2.0; 8]);
645  /// let c = f64x8::from([10.0; 8]);
646  ///
647  /// let result = a.mul_neg_add(b, c);
648  ///
649  /// let expected = f64x8::from([2.0; 8]);
650  /// assert_eq!(result, expected);
651  /// ```
652  #[inline]
653  #[must_use]
654  pub fn mul_neg_add(self, m: Self, a: Self) -> Self {
655    pick! {
656      if #[cfg(all(target_feature="avx512f",target_feature="fma"))] {
657        Self { avx512: fused_mul_neg_add_m512d(self.avx512, m.avx512, a.avx512) }
658      } else if #[cfg(target_feature="avx512f")] {
659        // still want to use 512 bit ops
660        a - (self * m)
661      } else {
662        Self {
663          a : self.a.mul_neg_add(m.a, a.a),
664          b : self.b.mul_neg_add(m.b, a.b),
665        }
666      }
667    }
668  }
669
670  /// Performs a negative multiply-subtract operation: `-(self * m) - s`
671  ///
672  /// When hardware FMA support is available, this computes the result with a
673  /// single rounding operation. Without FMA support, it falls back to separate
674  /// operations with two roundings.
675  ///
676  /// # Platform-specific behavior
677  /// - On `x86`/`x86_64` with AVX-512F+FMA: Uses 512-bit `vfnmsub` (single
678  ///   rounding, best accuracy)
679  /// - On `x86`/`x86_64` with AVX-512F only: Uses `-(self * m) - s` (two
680  ///   roundings)
681  /// - Other platforms: Delegates to [`f64x4`] (inherits its FMA behavior)
682  ///
683  /// # Examples
684  /// ```
685  /// # use wide::f64x8;
686  /// let a = f64x8::from([4.0; 8]);
687  /// let b = f64x8::from([2.0; 8]);
688  /// let c = f64x8::from([1.0; 8]);
689  ///
690  /// let result = a.mul_neg_sub(b, c);
691  ///
692  /// let expected = f64x8::from([-9.0; 8]);
693  /// assert_eq!(result, expected);
694  /// ```
695  #[inline]
696  #[must_use]
697  pub fn mul_neg_sub(self, m: Self, s: Self) -> Self {
698    pick! {
699       if #[cfg(all(target_feature="avx512f",target_feature="fma"))] {
700         Self { avx512: fused_mul_neg_sub_m512d(self.avx512, m.avx512, s.avx512) }
701        } else if #[cfg(target_feature="avx512f")] {
702          // still want to use 512 bit ops
703          -(self * m) - s
704        } else {
705         Self {
706           a : self.a.mul_neg_sub(m.a, s.a),
707           b : self.b.mul_neg_sub(m.b, s.b),
708         }
709       }
710    }
711  }
712
713  #[inline]
714  #[must_use]
715  pub fn flip_signs(self, signs: Self) -> Self {
716    self ^ (signs & Self::from(-0.0))
717  }
718
719  #[inline]
720  #[must_use]
721  pub fn copysign(self, sign: Self) -> Self {
722    let magnitude_mask = Self::from(f64::from_bits(u64::MAX >> 1));
723    (self & magnitude_mask) | (sign & Self::from(-0.0))
724  }
725
726  #[inline]
727  pub fn asin_acos(self) -> (Self, Self) {
728    // Based on the Agner Fog "vector class library":
729    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
730    const_f64_as_f64x8!(R4asin, 2.967721961301243206100E-3);
731    const_f64_as_f64x8!(R3asin, -5.634242780008963776856E-1);
732    const_f64_as_f64x8!(R2asin, 6.968710824104713396794E0);
733    const_f64_as_f64x8!(R1asin, -2.556901049652824852289E1);
734    const_f64_as_f64x8!(R0asin, 2.853665548261061424989E1);
735
736    const_f64_as_f64x8!(S3asin, -2.194779531642920639778E1);
737    const_f64_as_f64x8!(S2asin, 1.470656354026814941758E2);
738    const_f64_as_f64x8!(S1asin, -3.838770957603691357202E2);
739    const_f64_as_f64x8!(S0asin, 3.424398657913078477438E2);
740
741    const_f64_as_f64x8!(P5asin, 4.253011369004428248960E-3);
742    const_f64_as_f64x8!(P4asin, -6.019598008014123785661E-1);
743    const_f64_as_f64x8!(P3asin, 5.444622390564711410273E0);
744    const_f64_as_f64x8!(P2asin, -1.626247967210700244449E1);
745    const_f64_as_f64x8!(P1asin, 1.956261983317594739197E1);
746    const_f64_as_f64x8!(P0asin, -8.198089802484824371615E0);
747
748    const_f64_as_f64x8!(Q4asin, -1.474091372988853791896E1);
749    const_f64_as_f64x8!(Q3asin, 7.049610280856842141659E1);
750    const_f64_as_f64x8!(Q2asin, -1.471791292232726029859E2);
751    const_f64_as_f64x8!(Q1asin, 1.395105614657485689735E2);
752    const_f64_as_f64x8!(Q0asin, -4.918853881490881290097E1);
753
754    let xa = self.abs();
755
756    let big = xa.simd_ge(f64x8::splat(0.625));
757
758    let x1 = big.blend(f64x8::splat(1.0) - xa, xa * xa);
759
760    let x2 = x1 * x1;
761    let x3 = x2 * x1;
762    let x4 = x2 * x2;
763    let x5 = x4 * x1;
764
765    let do_big = big.any();
766    let do_small = !big.all();
767
768    let mut rx = f64x8::default();
769    let mut sx = f64x8::default();
770    let mut px = f64x8::default();
771    let mut qx = f64x8::default();
772
773    if do_big {
774      rx = x3.mul_add(R3asin, x2 * R2asin)
775        + x4.mul_add(R4asin, x1.mul_add(R1asin, R0asin));
776      sx =
777        x3.mul_add(S3asin, x4) + x2.mul_add(S2asin, x1.mul_add(S1asin, S0asin));
778    }
779
780    if do_small {
781      px = x3.mul_add(P3asin, P0asin)
782        + x4.mul_add(P4asin, x1 * P1asin)
783        + x5.mul_add(P5asin, x2 * P2asin);
784      qx = x4.mul_add(Q4asin, x5)
785        + x3.mul_add(Q3asin, x1 * Q1asin)
786        + x2.mul_add(Q2asin, Q0asin);
787    };
788
789    let vx = big.blend(rx, px);
790    let wx = big.blend(sx, qx);
791
792    let y1 = vx / wx * x1;
793
794    let mut z1 = f64x8::default();
795    let mut z2 = f64x8::default();
796    if do_big {
797      let xb = (x1 + x1).sqrt();
798      z1 = xb.mul_add(y1, xb);
799    }
800
801    if do_small {
802      z2 = xa.mul_add(y1, xa);
803    }
804
805    // asin
806    let z3 = f64x8::FRAC_PI_2 - z1;
807    let asin = big.blend(z3, z2);
808    let asin = asin.flip_signs(self);
809
810    // acos
811    let z3 = self.simd_lt(f64x8::ZERO).blend(f64x8::PI - z1, z1);
812    let z4 = f64x8::FRAC_PI_2 - z2.flip_signs(self);
813    let acos = big.blend(z3, z4);
814
815    (asin, acos)
816  }
817
818  #[inline]
819  pub fn acos(self) -> Self {
820    // Based on the Agner Fog "vector class library":
821    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
822    const_f64_as_f64x8!(R4asin, 2.967721961301243206100E-3);
823    const_f64_as_f64x8!(R3asin, -5.634242780008963776856E-1);
824    const_f64_as_f64x8!(R2asin, 6.968710824104713396794E0);
825    const_f64_as_f64x8!(R1asin, -2.556901049652824852289E1);
826    const_f64_as_f64x8!(R0asin, 2.853665548261061424989E1);
827
828    const_f64_as_f64x8!(S3asin, -2.194779531642920639778E1);
829    const_f64_as_f64x8!(S2asin, 1.470656354026814941758E2);
830    const_f64_as_f64x8!(S1asin, -3.838770957603691357202E2);
831    const_f64_as_f64x8!(S0asin, 3.424398657913078477438E2);
832
833    const_f64_as_f64x8!(P5asin, 4.253011369004428248960E-3);
834    const_f64_as_f64x8!(P4asin, -6.019598008014123785661E-1);
835    const_f64_as_f64x8!(P3asin, 5.444622390564711410273E0);
836    const_f64_as_f64x8!(P2asin, -1.626247967210700244449E1);
837    const_f64_as_f64x8!(P1asin, 1.956261983317594739197E1);
838    const_f64_as_f64x8!(P0asin, -8.198089802484824371615E0);
839
840    const_f64_as_f64x8!(Q4asin, -1.474091372988853791896E1);
841    const_f64_as_f64x8!(Q3asin, 7.049610280856842141659E1);
842    const_f64_as_f64x8!(Q2asin, -1.471791292232726029859E2);
843    const_f64_as_f64x8!(Q1asin, 1.395105614657485689735E2);
844    const_f64_as_f64x8!(Q0asin, -4.918853881490881290097E1);
845
846    let xa = self.abs();
847
848    let big = xa.simd_ge(f64x8::splat(0.625));
849
850    let x1 = big.blend(f64x8::splat(1.0) - xa, xa * xa);
851
852    let x2 = x1 * x1;
853    let x3 = x2 * x1;
854    let x4 = x2 * x2;
855    let x5 = x4 * x1;
856
857    let do_big = big.any();
858    let do_small = !big.all();
859
860    let mut rx = f64x8::default();
861    let mut sx = f64x8::default();
862    let mut px = f64x8::default();
863    let mut qx = f64x8::default();
864
865    if do_big {
866      rx = x3.mul_add(R3asin, x2 * R2asin)
867        + x4.mul_add(R4asin, x1.mul_add(R1asin, R0asin));
868      sx =
869        x3.mul_add(S3asin, x4) + x2.mul_add(S2asin, x1.mul_add(S1asin, S0asin));
870    }
871    if do_small {
872      px = x3.mul_add(P3asin, P0asin)
873        + x4.mul_add(P4asin, x1 * P1asin)
874        + x5.mul_add(P5asin, x2 * P2asin);
875      qx = x4.mul_add(Q4asin, x5)
876        + x3.mul_add(Q3asin, x1 * Q1asin)
877        + x2.mul_add(Q2asin, Q0asin);
878    };
879
880    let vx = big.blend(rx, px);
881    let wx = big.blend(sx, qx);
882
883    let y1 = vx / wx * x1;
884
885    let mut z1 = f64x8::default();
886    let mut z2 = f64x8::default();
887    if do_big {
888      let xb = (x1 + x1).sqrt();
889      z1 = xb.mul_add(y1, xb);
890    }
891
892    if do_small {
893      z2 = xa.mul_add(y1, xa);
894    }
895
896    // acos
897    let z3 = self.simd_lt(f64x8::ZERO).blend(f64x8::PI - z1, z1);
898    let z4 = f64x8::FRAC_PI_2 - z2.flip_signs(self);
899    let acos = big.blend(z3, z4);
900
901    acos
902  }
903  #[inline]
904  #[must_use]
905  pub fn asin(self) -> Self {
906    // Based on the Agner Fog "vector class library":
907    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
908    const_f64_as_f64x8!(R4asin, 2.967721961301243206100E-3);
909    const_f64_as_f64x8!(R3asin, -5.634242780008963776856E-1);
910    const_f64_as_f64x8!(R2asin, 6.968710824104713396794E0);
911    const_f64_as_f64x8!(R1asin, -2.556901049652824852289E1);
912    const_f64_as_f64x8!(R0asin, 2.853665548261061424989E1);
913
914    const_f64_as_f64x8!(S3asin, -2.194779531642920639778E1);
915    const_f64_as_f64x8!(S2asin, 1.470656354026814941758E2);
916    const_f64_as_f64x8!(S1asin, -3.838770957603691357202E2);
917    const_f64_as_f64x8!(S0asin, 3.424398657913078477438E2);
918
919    const_f64_as_f64x8!(P5asin, 4.253011369004428248960E-3);
920    const_f64_as_f64x8!(P4asin, -6.019598008014123785661E-1);
921    const_f64_as_f64x8!(P3asin, 5.444622390564711410273E0);
922    const_f64_as_f64x8!(P2asin, -1.626247967210700244449E1);
923    const_f64_as_f64x8!(P1asin, 1.956261983317594739197E1);
924    const_f64_as_f64x8!(P0asin, -8.198089802484824371615E0);
925
926    const_f64_as_f64x8!(Q4asin, -1.474091372988853791896E1);
927    const_f64_as_f64x8!(Q3asin, 7.049610280856842141659E1);
928    const_f64_as_f64x8!(Q2asin, -1.471791292232726029859E2);
929    const_f64_as_f64x8!(Q1asin, 1.395105614657485689735E2);
930    const_f64_as_f64x8!(Q0asin, -4.918853881490881290097E1);
931
932    let xa = self.abs();
933
934    let big = xa.simd_ge(f64x8::splat(0.625));
935
936    let x1 = big.blend(f64x8::splat(1.0) - xa, xa * xa);
937
938    let x2 = x1 * x1;
939    let x3 = x2 * x1;
940    let x4 = x2 * x2;
941    let x5 = x4 * x1;
942
943    let do_big = big.any();
944    let do_small = !big.all();
945
946    let mut rx = f64x8::default();
947    let mut sx = f64x8::default();
948    let mut px = f64x8::default();
949    let mut qx = f64x8::default();
950
951    if do_big {
952      rx = x3.mul_add(R3asin, x2 * R2asin)
953        + x4.mul_add(R4asin, x1.mul_add(R1asin, R0asin));
954      sx =
955        x3.mul_add(S3asin, x4) + x2.mul_add(S2asin, x1.mul_add(S1asin, S0asin));
956    }
957    if do_small {
958      px = x3.mul_add(P3asin, P0asin)
959        + x4.mul_add(P4asin, x1 * P1asin)
960        + x5.mul_add(P5asin, x2 * P2asin);
961      qx = x4.mul_add(Q4asin, x5)
962        + x3.mul_add(Q3asin, x1 * Q1asin)
963        + x2.mul_add(Q2asin, Q0asin);
964    };
965
966    let vx = big.blend(rx, px);
967    let wx = big.blend(sx, qx);
968
969    let y1 = vx / wx * x1;
970
971    let mut z1 = f64x8::default();
972    let mut z2 = f64x8::default();
973    if do_big {
974      let xb = (x1 + x1).sqrt();
975      z1 = xb.mul_add(y1, xb);
976    }
977
978    if do_small {
979      z2 = xa.mul_add(y1, xa);
980    }
981
982    // asin
983    let z3 = f64x8::FRAC_PI_2 - z1;
984    let asin = big.blend(z3, z2);
985    let asin = asin.flip_signs(self);
986
987    asin
988  }
989
990  #[inline]
991  pub fn atan(self) -> Self {
992    // Based on the Agner Fog "vector class library":
993    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
994    const_f64_as_f64x8!(MORE_BITS, 6.123233995736765886130E-17);
995    const_f64_as_f64x8!(MORE_BITS_O2, 6.123233995736765886130E-17 * 0.5);
996    const_f64_as_f64x8!(T3PO8, core::f64::consts::SQRT_2 + 1.0);
997
998    const_f64_as_f64x8!(P4atan, -8.750608600031904122785E-1);
999    const_f64_as_f64x8!(P3atan, -1.615753718733365076637E1);
1000    const_f64_as_f64x8!(P2atan, -7.500855792314704667340E1);
1001    const_f64_as_f64x8!(P1atan, -1.228866684490136173410E2);
1002    const_f64_as_f64x8!(P0atan, -6.485021904942025371773E1);
1003
1004    const_f64_as_f64x8!(Q4atan, 2.485846490142306297962E1);
1005    const_f64_as_f64x8!(Q3atan, 1.650270098316988542046E2);
1006    const_f64_as_f64x8!(Q2atan, 4.328810604912902668951E2);
1007    const_f64_as_f64x8!(Q1atan, 4.853903996359136964868E2);
1008    const_f64_as_f64x8!(Q0atan, 1.945506571482613964425E2);
1009
1010    let t = self.abs();
1011
1012    // small:  t < 0.66
1013    // medium: t <= t <= 2.4142 (1+sqrt(2))
1014    // big:    t > 2.4142
1015    let notbig = t.simd_le(T3PO8);
1016    let notsmal = t.simd_ge(Self::splat(0.66));
1017
1018    let mut s = notbig.blend(Self::FRAC_PI_4, Self::FRAC_PI_2);
1019    s = notsmal & s;
1020    let mut fac = notbig.blend(MORE_BITS_O2, MORE_BITS);
1021    fac = notsmal & fac;
1022
1023    // small:  z = t / 1.0;
1024    // medium: z = (t-1.0) / (t+1.0);
1025    // big:    z = -1.0 / t;
1026    let mut a = notbig & t;
1027    a = notsmal.blend(a - Self::ONE, a);
1028    let mut b = notbig & Self::ONE;
1029    b = notsmal.blend(b + t, b);
1030    let z = a / b;
1031
1032    let zz = z * z;
1033
1034    let px = polynomial_4!(zz, P0atan, P1atan, P2atan, P3atan, P4atan);
1035    let qx = polynomial_5n!(zz, Q0atan, Q1atan, Q2atan, Q3atan, Q4atan);
1036
1037    let mut re = (px / qx).mul_add(z * zz, z);
1038    re += s + fac;
1039
1040    // get sign bit
1041    re = (self.sign_bit()).blend(-re, re);
1042
1043    re
1044  }
1045
1046  #[inline]
1047  pub fn atan2(self, x: Self) -> Self {
1048    // Based on the Agner Fog "vector class library":
1049    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
1050    const_f64_as_f64x8!(MORE_BITS, 6.123233995736765886130E-17);
1051    const_f64_as_f64x8!(MORE_BITS_O2, 6.123233995736765886130E-17 * 0.5);
1052    const_f64_as_f64x8!(T3PO8, core::f64::consts::SQRT_2 + 1.0);
1053
1054    const_f64_as_f64x8!(P4atan, -8.750608600031904122785E-1);
1055    const_f64_as_f64x8!(P3atan, -1.615753718733365076637E1);
1056    const_f64_as_f64x8!(P2atan, -7.500855792314704667340E1);
1057    const_f64_as_f64x8!(P1atan, -1.228866684490136173410E2);
1058    const_f64_as_f64x8!(P0atan, -6.485021904942025371773E1);
1059
1060    const_f64_as_f64x8!(Q4atan, 2.485846490142306297962E1);
1061    const_f64_as_f64x8!(Q3atan, 1.650270098316988542046E2);
1062    const_f64_as_f64x8!(Q2atan, 4.328810604912902668951E2);
1063    const_f64_as_f64x8!(Q1atan, 4.853903996359136964868E2);
1064    const_f64_as_f64x8!(Q0atan, 1.945506571482613964425E2);
1065
1066    let y = self;
1067
1068    // move in first octant
1069    let x1 = x.abs();
1070    let y1 = y.abs();
1071    let swapxy = y1.simd_gt(x1);
1072    // swap x and y if y1 > x1
1073    let mut x2 = swapxy.blend(y1, x1);
1074    let mut y2 = swapxy.blend(x1, y1);
1075
1076    // check for special case: x and y are both +/- INF
1077    let both_infinite = x.is_inf() & y.is_inf();
1078    if both_infinite.any() {
1079      let minus_one = -Self::ONE;
1080      x2 = both_infinite.blend(x2 & minus_one, x2);
1081      y2 = both_infinite.blend(y2 & minus_one, y2);
1082    }
1083
1084    // x = y = 0 gives NAN here
1085    let t = y2 / x2;
1086
1087    // small:  t < 0.66
1088    // medium: t <= t <= 2.4142 (1+sqrt(2))
1089    // big:    t > 2.4142
1090    let notbig = t.simd_le(T3PO8);
1091    let notsmal = t.simd_ge(Self::splat(0.66));
1092
1093    let mut s = notbig.blend(Self::FRAC_PI_4, Self::FRAC_PI_2);
1094    s = notsmal & s;
1095    let mut fac = notbig.blend(MORE_BITS_O2, MORE_BITS);
1096    fac = notsmal & fac;
1097
1098    // small:  z = t / 1.0;
1099    // medium: z = (t-1.0) / (t+1.0);
1100    // big:    z = -1.0 / t;
1101    let mut a = notbig & t;
1102    a = notsmal.blend(a - Self::ONE, a);
1103    let mut b = notbig & Self::ONE;
1104    b = notsmal.blend(b + t, b);
1105    let z = a / b;
1106
1107    let zz = z * z;
1108
1109    let px = polynomial_4!(zz, P0atan, P1atan, P2atan, P3atan, P4atan);
1110    let qx = polynomial_5n!(zz, Q0atan, Q1atan, Q2atan, Q3atan, Q4atan);
1111
1112    let mut re = (px / qx).mul_add(z * zz, z);
1113    re += s + fac;
1114
1115    // move back in place
1116    re = swapxy.blend(Self::FRAC_PI_2 - re, re);
1117    re = ((x | y).simd_eq(Self::ZERO)).blend(Self::ZERO, re);
1118    re = (x.sign_bit()).blend(Self::PI - re, re);
1119
1120    // get sign bit
1121    re = (y.sign_bit()).blend(-re, re);
1122
1123    re
1124  }
1125
1126  #[inline]
1127  #[must_use]
1128  pub fn sin_cos(self) -> (Self, Self) {
1129    // Based on the Agner Fog "vector class library":
1130    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
1131
1132    const_f64_as_f64x8!(P0sin, -1.66666666666666307295E-1);
1133    const_f64_as_f64x8!(P1sin, 8.33333333332211858878E-3);
1134    const_f64_as_f64x8!(P2sin, -1.98412698295895385996E-4);
1135    const_f64_as_f64x8!(P3sin, 2.75573136213857245213E-6);
1136    const_f64_as_f64x8!(P4sin, -2.50507477628578072866E-8);
1137    const_f64_as_f64x8!(P5sin, 1.58962301576546568060E-10);
1138
1139    const_f64_as_f64x8!(P0cos, 4.16666666666665929218E-2);
1140    const_f64_as_f64x8!(P1cos, -1.38888888888730564116E-3);
1141    const_f64_as_f64x8!(P2cos, 2.48015872888517045348E-5);
1142    const_f64_as_f64x8!(P3cos, -2.75573141792967388112E-7);
1143    const_f64_as_f64x8!(P4cos, 2.08757008419747316778E-9);
1144    const_f64_as_f64x8!(P5cos, -1.13585365213876817300E-11);
1145
1146    const_f64_as_f64x8!(DP1, 7.853981554508209228515625E-1 * 2.);
1147    const_f64_as_f64x8!(DP2, 7.94662735614792836714E-9 * 2.);
1148    const_f64_as_f64x8!(DP3, 3.06161699786838294307E-17 * 2.);
1149
1150    const_f64_as_f64x8!(TWO_OVER_PI, 2.0 / core::f64::consts::PI);
1151
1152    let xa = self.abs();
1153
1154    let y = (xa * TWO_OVER_PI).round();
1155    let q = y.round_int();
1156
1157    let x = y.mul_neg_add(DP3, y.mul_neg_add(DP2, y.mul_neg_add(DP1, xa)));
1158
1159    let x2 = x * x;
1160    let mut s = polynomial_5!(x2, P0sin, P1sin, P2sin, P3sin, P4sin, P5sin);
1161    let mut c = polynomial_5!(x2, P0cos, P1cos, P2cos, P3cos, P4cos, P5cos);
1162    s = (x * x2).mul_add(s, x);
1163    c =
1164      (x2 * x2).mul_add(c, x2.mul_neg_add(f64x8::from(0.5), f64x8::from(1.0)));
1165
1166    let swap = !((q & i64x8::from(1)).simd_eq(i64x8::from(0)));
1167
1168    let mut overflow: f64x8 = cast(q.simd_gt(i64x8::from(0x80000000000000)));
1169    overflow &= xa.is_finite();
1170    s = overflow.blend(f64x8::from(0.0), s);
1171    c = overflow.blend(f64x8::from(1.0), c);
1172
1173    // calc sin
1174    let mut sin1 = cast::<_, f64x8>(swap).blend(c, s);
1175    let sign_sin: i64x8 = (q << 62) ^ cast::<_, i64x8>(self);
1176    sin1 = sin1.flip_signs(cast(sign_sin));
1177
1178    // calc cos
1179    let mut cos1 = cast::<_, f64x8>(swap).blend(s, c);
1180    let sign_cos: i64x8 = ((q + i64x8::from(1)) & i64x8::from(2)) << 62;
1181    cos1 ^= cast::<_, f64x8>(sign_cos);
1182
1183    (sin1, cos1)
1184  }
1185  #[inline]
1186  #[must_use]
1187  pub fn sin(self) -> Self {
1188    let (s, _) = self.sin_cos();
1189    s
1190  }
1191  #[inline]
1192  #[must_use]
1193  pub fn cos(self) -> Self {
1194    let (_, c) = self.sin_cos();
1195    c
1196  }
1197  #[inline]
1198  #[must_use]
1199  pub fn tan(self) -> Self {
1200    let (s, c) = self.sin_cos();
1201    s / c
1202  }
1203  #[inline]
1204  #[must_use]
1205  pub fn to_degrees(self) -> Self {
1206    const_f64_as_f64x8!(RAD_TO_DEG_RATIO, 180.0_f64 / core::f64::consts::PI);
1207    self * RAD_TO_DEG_RATIO
1208  }
1209  #[inline]
1210  #[must_use]
1211  pub fn to_radians(self) -> Self {
1212    const_f64_as_f64x8!(DEG_TO_RAD_RATIO, core::f64::consts::PI / 180.0_f64);
1213    self * DEG_TO_RAD_RATIO
1214  }
1215  #[inline]
1216  #[must_use]
1217  pub fn sqrt(self) -> Self {
1218    pick! {
1219      if #[cfg(target_feature="avx512f")] {
1220        Self { avx512: sqrt_m512d(self.avx512) }
1221      } else {
1222        Self {
1223          a : self.a.sqrt(),
1224          b : self.b.sqrt(),
1225        }
1226      }
1227    }
1228  }
1229  #[inline]
1230  #[must_use]
1231  #[doc(alias("movemask", "move_mask"))]
1232  pub fn to_bitmask(self) -> u32 {
1233    pick! {
1234      if #[cfg(target_feature="avx512f")] {
1235        movepi64_mask_m512d(self.avx512) as u32
1236      } else {
1237        (self.b.to_bitmask() << 4) | self.a.to_bitmask()
1238      }
1239    }
1240  }
1241  #[inline]
1242  #[must_use]
1243  pub fn any(self) -> bool {
1244    pick! {
1245      if #[cfg(target_feature="avx512f")] {
1246        movepi64_mask_m512d(self.avx512) != 0
1247      } else {
1248        self.a.any() || self.b.any()
1249      }
1250    }
1251  }
1252  #[inline]
1253  #[must_use]
1254  pub fn all(self) -> bool {
1255    pick! {
1256      if #[cfg(target_feature="avx512f")] {
1257        movepi64_mask_m512d(self.avx512) == 0b11111111
1258      } else {
1259        self.a.all() && self.b.all()
1260      }
1261    }
1262  }
1263
1264  #[inline]
1265  #[must_use]
1266  pub fn none(self) -> bool {
1267    !self.any()
1268  }
1269
1270  #[inline]
1271  fn vm_pow2n(self) -> Self {
1272    const_f64_as_f64x8!(pow2_52, 4503599627370496.0);
1273    const_f64_as_f64x8!(bias, 1023.0);
1274    let a = self + (bias + pow2_52);
1275    let c = cast::<_, i64x8>(a) << 52;
1276    cast::<_, f64x8>(c)
1277  }
1278
1279  /// Calculate the exponent of a packed `f64x8`
1280  #[inline]
1281  #[must_use]
1282  pub fn exp(self) -> Self {
1283    const_f64_as_f64x8!(P2, 1.0 / 2.0);
1284    const_f64_as_f64x8!(P3, 1.0 / 6.0);
1285    const_f64_as_f64x8!(P4, 1. / 24.);
1286    const_f64_as_f64x8!(P5, 1. / 120.);
1287    const_f64_as_f64x8!(P6, 1. / 720.);
1288    const_f64_as_f64x8!(P7, 1. / 5040.);
1289    const_f64_as_f64x8!(P8, 1. / 40320.);
1290    const_f64_as_f64x8!(P9, 1. / 362880.);
1291    const_f64_as_f64x8!(P10, 1. / 3628800.);
1292    const_f64_as_f64x8!(P11, 1. / 39916800.);
1293    const_f64_as_f64x8!(P12, 1. / 479001600.);
1294    const_f64_as_f64x8!(P13, 1. / 6227020800.);
1295    const_f64_as_f64x8!(LN2D_HI, 0.693145751953125);
1296    const_f64_as_f64x8!(LN2D_LO, 1.42860682030941723212E-6);
1297    let max_x = f64x8::from(708.39);
1298    let r = (self * Self::LOG2_E).round();
1299    let x = r.mul_neg_add(LN2D_HI, self);
1300    let x = r.mul_neg_add(LN2D_LO, x);
1301    let z =
1302      polynomial_13!(x, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12, P13);
1303    let n2 = Self::vm_pow2n(r);
1304    let z = (z + Self::ONE) * n2;
1305    // check for overflow
1306    let in_range = self.abs().simd_lt(max_x);
1307    let in_range = in_range & self.is_finite();
1308    in_range.blend(z, Self::ZERO)
1309  }
1310
1311  #[inline]
1312  fn exponent(self) -> f64x8 {
1313    const_f64_as_f64x8!(pow2_52, 4503599627370496.0);
1314    const_f64_as_f64x8!(bias, 1023.0);
1315    let a = cast::<_, u64x8>(self);
1316    let b = a >> 52;
1317    let c = b | cast::<_, u64x8>(pow2_52);
1318    let d = cast::<_, f64x8>(c);
1319    let e = d - (pow2_52 + bias);
1320    e
1321  }
1322
1323  #[inline]
1324  fn fraction_2(self) -> Self {
1325    let t1 = cast::<_, u64x8>(self);
1326    let t2 = cast::<_, u64x8>(
1327      (t1 & u64x8::from(0x000FFFFFFFFFFFFF)) | u64x8::from(0x3FE0000000000000),
1328    );
1329    cast::<_, f64x8>(t2)
1330  }
1331  #[inline]
1332  fn is_zero_or_subnormal(self) -> Self {
1333    let t = cast::<_, i64x8>(self);
1334    let t = t & i64x8::splat(0x7FF0000000000000);
1335    i64x8::round_float(t.simd_eq(i64x8::splat(0)))
1336  }
1337  #[inline]
1338  fn infinity() -> Self {
1339    cast::<_, f64x8>(i64x8::splat(0x7FF0000000000000))
1340  }
1341  #[inline]
1342  fn nan_log() -> Self {
1343    cast::<_, f64x8>(i64x8::splat(0x7FF8000000000000 | 0x101 << 29))
1344  }
1345  #[inline]
1346  fn nan_pow() -> Self {
1347    cast::<_, f64x8>(i64x8::splat(0x7FF8000000000000 | 0x101 << 29))
1348  }
1349  #[inline]
1350  fn sign_bit(self) -> Self {
1351    let t1 = cast::<_, i64x8>(self);
1352    let t2 = t1 >> 63;
1353    !cast::<_, f64x8>(t2).simd_eq(f64x8::ZERO)
1354  }
1355
1356  #[inline]
1357  pub fn reduce_add(self) -> f64 {
1358    pick! {
1359      if #[cfg(target_feature="avx512f")] {
1360        // From https://stackoverflow.com/questions/49941645/get-sum-of-values-stored-in-m256d-with-sse-avx
1361        let lo = cast_to_m256d_from_m512d(self.avx512);
1362        let hi = extract_m256d_from_m512d::<1>(self.avx512);
1363        let v  = add_m256d(lo, hi);                // [a0+a4, a1+a5, a2+a6, a3+a7]
1364        let t  = add_horizontal_m256d(v, v);       // [s01, s23, s01, s23]
1365        let lo = cast_to_m128d_from_m256d(t);      // s01
1366        let hi = extract_m128d_from_m256d::<1>(t); // s23
1367        let s  = add_m128d(lo, hi);                // [sum, ...]
1368        get_f64_from_m128d_s(s)
1369      } else {
1370        self.a.reduce_add() + self.b.reduce_add()
1371      }
1372    }
1373  }
1374
1375  /// Natural log (ln(x))
1376  #[inline]
1377  #[must_use]
1378  pub fn ln(self) -> Self {
1379    const_f64_as_f64x8!(HALF, 0.5);
1380    const_f64_as_f64x8!(P0, 7.70838733755885391666E0);
1381    const_f64_as_f64x8!(P1, 1.79368678507819816313E1);
1382    const_f64_as_f64x8!(P2, 1.44989225341610930846E1);
1383    const_f64_as_f64x8!(P3, 4.70579119878881725854E0);
1384    const_f64_as_f64x8!(P4, 4.97494994976747001425E-1);
1385    const_f64_as_f64x8!(P5, 1.01875663804580931796E-4);
1386
1387    const_f64_as_f64x8!(Q0, 2.31251620126765340583E1);
1388    const_f64_as_f64x8!(Q1, 7.11544750618563894466E1);
1389    const_f64_as_f64x8!(Q2, 8.29875266912776603211E1);
1390    const_f64_as_f64x8!(Q3, 4.52279145837532221105E1);
1391    const_f64_as_f64x8!(Q4, 1.12873587189167450590E1);
1392    const_f64_as_f64x8!(LN2F_HI, 0.693359375);
1393    const_f64_as_f64x8!(LN2F_LO, -2.12194440e-4);
1394    const_f64_as_f64x8!(VM_SQRT2, 1.414213562373095048801);
1395    const_f64_as_f64x8!(VM_SMALLEST_NORMAL, 1.17549435E-38);
1396
1397    let x1 = self;
1398    let x = Self::fraction_2(x1);
1399    let e = Self::exponent(x1);
1400    let mask = x.simd_gt(VM_SQRT2 * HALF);
1401    let x = (!mask).blend(x + x, x);
1402    let fe = mask.blend(e + Self::ONE, e);
1403    let x = x - Self::ONE;
1404    let px = polynomial_5!(x, P0, P1, P2, P3, P4, P5);
1405    let x2 = x * x;
1406    let px = x2 * x * px;
1407    let qx = polynomial_5n!(x, Q0, Q1, Q2, Q3, Q4);
1408    let res = px / qx;
1409    let res = fe.mul_add(LN2F_LO, res);
1410    let res = res + x2.mul_neg_add(HALF, x);
1411    let res = fe.mul_add(LN2F_HI, res);
1412    let overflow = !self.is_finite();
1413    let underflow = x1.simd_lt(VM_SMALLEST_NORMAL);
1414    let mask = overflow | underflow;
1415    if !mask.any() {
1416      res
1417    } else {
1418      let is_zero = self.is_zero_or_subnormal();
1419      let res = underflow.blend(Self::nan_log(), res);
1420      let res = is_zero.blend(Self::infinity(), res);
1421      let res = overflow.blend(self, res);
1422      res
1423    }
1424  }
1425
1426  #[inline]
1427  #[must_use]
1428  pub fn log2(self) -> Self {
1429    Self::ln(self) * Self::LOG2_E
1430  }
1431  #[inline]
1432  #[must_use]
1433  pub fn log10(self) -> Self {
1434    Self::ln(self) * Self::LOG10_E
1435  }
1436
1437  #[inline]
1438  #[must_use]
1439  pub fn pow_f64x8(self, y: Self) -> Self {
1440    const_f64_as_f64x8!(ln2d_hi, 0.693145751953125);
1441    const_f64_as_f64x8!(ln2d_lo, 1.42860682030941723212E-6);
1442    const_f64_as_f64x8!(P0log, 2.0039553499201281259648E1);
1443    const_f64_as_f64x8!(P1log, 5.7112963590585538103336E1);
1444    const_f64_as_f64x8!(P2log, 6.0949667980987787057556E1);
1445    const_f64_as_f64x8!(P3log, 2.9911919328553073277375E1);
1446    const_f64_as_f64x8!(P4log, 6.5787325942061044846969E0);
1447    const_f64_as_f64x8!(P5log, 4.9854102823193375972212E-1);
1448    const_f64_as_f64x8!(P6log, 4.5270000862445199635215E-5);
1449    const_f64_as_f64x8!(Q0log, 6.0118660497603843919306E1);
1450    const_f64_as_f64x8!(Q1log, 2.1642788614495947685003E2);
1451    const_f64_as_f64x8!(Q2log, 3.0909872225312059774938E2);
1452    const_f64_as_f64x8!(Q3log, 2.2176239823732856465394E2);
1453    const_f64_as_f64x8!(Q4log, 8.3047565967967209469434E1);
1454    const_f64_as_f64x8!(Q5log, 1.5062909083469192043167E1);
1455
1456    // Taylor expansion constants
1457    const_f64_as_f64x8!(p2, 1.0 / 2.0); // coefficients for Taylor expansion of exp
1458    const_f64_as_f64x8!(p3, 1.0 / 6.0);
1459    const_f64_as_f64x8!(p4, 1.0 / 24.0);
1460    const_f64_as_f64x8!(p5, 1.0 / 120.0);
1461    const_f64_as_f64x8!(p6, 1.0 / 720.0);
1462    const_f64_as_f64x8!(p7, 1.0 / 5040.0);
1463    const_f64_as_f64x8!(p8, 1.0 / 40320.0);
1464    const_f64_as_f64x8!(p9, 1.0 / 362880.0);
1465    const_f64_as_f64x8!(p10, 1.0 / 3628800.0);
1466    const_f64_as_f64x8!(p11, 1.0 / 39916800.0);
1467    const_f64_as_f64x8!(p12, 1.0 / 479001600.0);
1468    const_f64_as_f64x8!(p13, 1.0 / 6227020800.0);
1469
1470    let x1 = self.abs();
1471    let x = x1.fraction_2();
1472    let mask = x.simd_gt(f64x8::SQRT_2 * f64x8::HALF);
1473    let x = (!mask).blend(x + x, x);
1474    let x = x - f64x8::ONE;
1475    let x2 = x * x;
1476    let px = polynomial_6!(x, P0log, P1log, P2log, P3log, P4log, P5log, P6log);
1477    let px = px * x * x2;
1478    let qx = polynomial_6n!(x, Q0log, Q1log, Q2log, Q3log, Q4log, Q5log);
1479    let lg1 = px / qx;
1480
1481    let ef = x1.exponent();
1482    let ef = mask.blend(ef + f64x8::ONE, ef);
1483    let e1 = (ef * y).round();
1484    let yr = ef.mul_sub(y, e1);
1485
1486    let lg = f64x8::HALF.mul_neg_add(x2, x) + lg1;
1487    let x2err = (f64x8::HALF * x).mul_sub(x, f64x8::HALF * x2);
1488    let lg_err = f64x8::HALF.mul_add(x2, lg - x) - lg1;
1489
1490    let e2 = (lg * y * f64x8::LOG2_E).round();
1491    let v = lg.mul_sub(y, e2 * ln2d_hi);
1492    let v = e2.mul_neg_add(ln2d_lo, v);
1493    let v = v - (lg_err + x2err).mul_sub(y, yr * f64x8::LN_2);
1494
1495    let x = v;
1496    let e3 = (x * f64x8::LOG2_E).round();
1497    let x = e3.mul_neg_add(f64x8::LN_2, x);
1498    let z =
1499      polynomial_13m!(x, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13)
1500        + f64x8::ONE;
1501    let ee = e1 + e2 + e3;
1502    let ei = cast::<_, i64x8>(ee.round_int());
1503    let ej = cast::<_, i64x8>(ei + (cast::<_, i64x8>(z) >> 52));
1504
1505    let overflow = cast::<_, f64x8>(!ej.simd_lt(i64x8::splat(0x07FF)))
1506      | ee.simd_gt(f64x8::splat(3000.0));
1507    let underflow = cast::<_, f64x8>(!ej.simd_gt(i64x8::splat(0x000)))
1508      | ee.simd_lt(f64x8::splat(-3000.0));
1509
1510    // Add exponent by integer addition
1511    let z = cast::<_, f64x8>(cast::<_, i64x8>(z) + (ei << 52));
1512
1513    // Check for overflow/underflow
1514    let z = if (overflow | underflow).any() {
1515      let z = underflow.blend(f64x8::ZERO, z);
1516      overflow.blend(Self::infinity(), z)
1517    } else {
1518      z
1519    };
1520
1521    // Check for self == 0
1522    let x_zero = self.is_zero_or_subnormal();
1523    let z = x_zero.blend(
1524      y.simd_lt(f64x8::ZERO).blend(
1525        Self::infinity(),
1526        y.simd_eq(f64x8::ZERO).blend(f64x8::ONE, f64x8::ZERO),
1527      ),
1528      z,
1529    );
1530
1531    let x_sign = self.sign_bit();
1532
1533    let z = if x_sign.any() {
1534      // Y into an integer
1535      let yi = y.simd_eq(y.round());
1536      // Is y odd?
1537      let y_odd = cast::<_, i64x8>(y.round_int() << 63).round_float();
1538      let z1 =
1539        yi.blend(z | y_odd, self.simd_eq(Self::ZERO).blend(z, Self::nan_pow()));
1540      x_sign.blend(z1, z)
1541    } else {
1542      z
1543    };
1544
1545    let x_finite = self.is_finite();
1546    let y_finite = y.is_finite();
1547    let e_finite = ee.is_finite();
1548
1549    if (x_finite & y_finite & (e_finite | x_zero)).all() {
1550      return z;
1551    }
1552
1553    (self.is_nan() | y.is_nan()).blend(self + y, z)
1554  }
1555
1556  #[inline]
1557  pub fn powf(self, y: f64) -> Self {
1558    Self::pow_f64x8(self, f64x8::splat(y))
1559  }
1560
1561  #[inline]
1562  #[must_use]
1563  pub fn to_array(self) -> [f64; 8] {
1564    cast(self)
1565  }
1566
1567  #[inline]
1568  #[must_use]
1569  pub fn as_array(&self) -> &[f64; 8] {
1570    cast_ref(self)
1571  }
1572
1573  #[inline]
1574  #[must_use]
1575  pub fn as_mut_array(&mut self) -> &mut [f64; 8] {
1576    cast_mut(self)
1577  }
1578
1579  #[inline]
1580  pub fn from_i32x8(v: i32x8) -> Self {
1581    pick! {
1582      if #[cfg(target_feature="avx512f")] {
1583        Self { avx512: convert_to_m512d_from_i32_m256i(v.avx2) }
1584      } else {
1585        Self::new([
1586          v.as_array()[0] as f64,
1587          v.as_array()[1] as f64,
1588          v.as_array()[2] as f64,
1589          v.as_array()[3] as f64,
1590          v.as_array()[4] as f64,
1591          v.as_array()[5] as f64,
1592          v.as_array()[6] as f64,
1593          v.as_array()[7] as f64,
1594        ])
1595      }
1596    }
1597  }
1598}
1599
1600impl From<i32x8> for f64x8 {
1601  #[inline]
1602  fn from(v: i32x8) -> Self {
1603    Self::from_i32x8(v)
1604  }
1605}
1606
1607impl Not for f64x8 {
1608  type Output = Self;
1609  #[inline]
1610  fn not(self) -> Self::Output {
1611    pick! {
1612      if #[cfg(target_feature="avx512f")] {
1613        Self { avx512: bitxor_m512d(self.avx512, set_splat_m512d(f64::from_bits(u64::MAX))) }
1614      } else {
1615        Self {
1616          a : self.a.not(),
1617          b : self.b.not(),
1618        }
1619      }
1620    }
1621  }
1622}
wide/f64x8_.rs

wide/
f64x8_.rs