wide/
f32x16_.rs

1use super::*;
2
3pick! {
4  if #[cfg(target_feature="avx512f")] {
5    #[derive(Default, Clone, Copy, PartialEq)]
6    #[repr(C, align(64))]
7    pub struct f32x16 { pub(crate) avx512: m512 }
8  } else {
9    #[derive(Default, Clone, Copy, PartialEq)]
10    #[repr(C, align(64))]
11    pub struct f32x16 { pub(crate) a : f32x8, pub(crate) b : f32x8 }
12  }
13}
14
15macro_rules! const_f32_as_f32x16 {
16  ($i:ident, $f:expr) => {
17    #[allow(non_upper_case_globals)]
18    pub const $i: f32x16 = f32x16::new([$f; 16]);
19  };
20}
21
22impl f32x16 {
23  const_f32_as_f32x16!(ONE, 1.0);
24  const_f32_as_f32x16!(HALF, 0.5);
25  const_f32_as_f32x16!(ZERO, 0.0);
26  const_f32_as_f32x16!(E, core::f32::consts::E);
27  const_f32_as_f32x16!(FRAC_1_PI, core::f32::consts::FRAC_1_PI);
28  const_f32_as_f32x16!(FRAC_2_PI, core::f32::consts::FRAC_2_PI);
29  const_f32_as_f32x16!(FRAC_2_SQRT_PI, core::f32::consts::FRAC_2_SQRT_PI);
30  const_f32_as_f32x16!(FRAC_1_SQRT_2, core::f32::consts::FRAC_1_SQRT_2);
31  const_f32_as_f32x16!(FRAC_PI_2, core::f32::consts::FRAC_PI_2);
32  const_f32_as_f32x16!(FRAC_PI_3, core::f32::consts::FRAC_PI_3);
33  const_f32_as_f32x16!(FRAC_PI_4, core::f32::consts::FRAC_PI_4);
34  const_f32_as_f32x16!(FRAC_PI_6, core::f32::consts::FRAC_PI_6);
35  const_f32_as_f32x16!(FRAC_PI_8, core::f32::consts::FRAC_PI_8);
36  const_f32_as_f32x16!(LN_2, core::f32::consts::LN_2);
37  const_f32_as_f32x16!(LN_10, core::f32::consts::LN_10);
38  const_f32_as_f32x16!(LOG2_E, core::f32::consts::LOG2_E);
39  const_f32_as_f32x16!(LOG10_E, core::f32::consts::LOG10_E);
40  const_f32_as_f32x16!(LOG10_2, core::f32::consts::LOG10_2);
41  const_f32_as_f32x16!(LOG2_10, core::f32::consts::LOG2_10);
42  const_f32_as_f32x16!(PI, core::f32::consts::PI);
43  const_f32_as_f32x16!(SQRT_2, core::f32::consts::SQRT_2);
44  const_f32_as_f32x16!(TAU, core::f32::consts::TAU);
45}
46
47unsafe impl Zeroable for f32x16 {}
48unsafe impl Pod for f32x16 {}
49
50impl AlignTo for f32x16 {
51  type Elem = f32;
52}
53
54impl Add for f32x16 {
55  type Output = Self;
56  #[inline]
57  fn add(self, rhs: Self) -> Self::Output {
58    pick! {
59      if #[cfg(target_feature="avx512f")] {
60        Self { avx512: add_m512(self.avx512, rhs.avx512) }
61      } else {
62        Self {
63          a : self.a.add(rhs.a),
64          b : self.b.add(rhs.b),
65        }
66      }
67    }
68  }
69}
70
71impl Sub for f32x16 {
72  type Output = Self;
73  #[inline]
74  fn sub(self, rhs: Self) -> Self::Output {
75    pick! {
76      if #[cfg(target_feature="avx512f")] {
77        Self { avx512: sub_m512(self.avx512, rhs.avx512) }
78      } else {
79        Self {
80          a : self.a.sub(rhs.a),
81          b : self.b.sub(rhs.b),
82        }
83      }
84    }
85  }
86}
87
88impl Mul for f32x16 {
89  type Output = Self;
90  #[inline]
91  fn mul(self, rhs: Self) -> Self::Output {
92    pick! {
93      if #[cfg(target_feature="avx512f")] {
94        Self { avx512: mul_m512(self.avx512, rhs.avx512) }
95      } else {
96        Self { a: self.a.mul(rhs.a), b: self.b.mul(rhs.b) }
97      }
98    }
99  }
100}
101
102impl Div for f32x16 {
103  type Output = Self;
104  #[inline]
105  fn div(self, rhs: Self) -> Self::Output {
106    pick! {
107      if #[cfg(target_feature="avx512f")] {
108        Self { avx512: div_m512(self.avx512, rhs.avx512) }
109      } else {
110        Self { a: self.a.div(rhs.a), b: self.b.div(rhs.b) }
111      }
112    }
113  }
114}
115
116impl Neg for f32x16 {
117  type Output = Self;
118  #[inline]
119  fn neg(self) -> Self::Output {
120    pick! {
121      if #[cfg(target_feature="avx512f")] {
122        Self { avx512: bitxor_m512(self.avx512, Self::splat(-0.0).avx512) }
123      } else {
124        Self {
125          a : self.a.neg(),
126          b : self.b.neg(),
127        }
128      }
129    }
130  }
131}
132
133impl Add<f32> for f32x16 {
134  type Output = Self;
135  #[inline]
136  fn add(self, rhs: f32) -> Self::Output {
137    self.add(Self::splat(rhs))
138  }
139}
140
141impl Sub<f32> for f32x16 {
142  type Output = Self;
143  #[inline]
144  fn sub(self, rhs: f32) -> Self::Output {
145    self.sub(Self::splat(rhs))
146  }
147}
148
149impl Mul<f32> for f32x16 {
150  type Output = Self;
151  #[inline]
152  fn mul(self, rhs: f32) -> Self::Output {
153    self.mul(Self::splat(rhs))
154  }
155}
156
157impl Div<f32> for f32x16 {
158  type Output = Self;
159  #[inline]
160  fn div(self, rhs: f32) -> Self::Output {
161    self.div(Self::splat(rhs))
162  }
163}
164
165impl Add<f32x16> for f32 {
166  type Output = f32x16;
167  #[inline]
168  fn add(self, rhs: f32x16) -> Self::Output {
169    f32x16::splat(self).add(rhs)
170  }
171}
172
173impl Sub<f32x16> for f32 {
174  type Output = f32x16;
175  #[inline]
176  fn sub(self, rhs: f32x16) -> Self::Output {
177    f32x16::splat(self).sub(rhs)
178  }
179}
180
181impl Mul<f32x16> for f32 {
182  type Output = f32x16;
183  #[inline]
184  fn mul(self, rhs: f32x16) -> Self::Output {
185    f32x16::splat(self).mul(rhs)
186  }
187}
188
189impl Div<f32x16> for f32 {
190  type Output = f32x16;
191  #[inline]
192  fn div(self, rhs: f32x16) -> Self::Output {
193    f32x16::splat(self).div(rhs)
194  }
195}
196
197impl BitAnd for f32x16 {
198  type Output = Self;
199  #[inline]
200  fn bitand(self, rhs: Self) -> Self::Output {
201    pick! {
202      if #[cfg(target_feature="avx512f")] {
203        Self { avx512: bitand_m512(self.avx512, rhs.avx512) }
204      } else {
205        Self {
206          a : self.a.bitand(rhs.a),
207          b : self.b.bitand(rhs.b),
208        }
209      }
210    }
211  }
212}
213
214impl BitOr for f32x16 {
215  type Output = Self;
216  #[inline]
217  fn bitor(self, rhs: Self) -> Self::Output {
218    pick! {
219    if #[cfg(target_feature="avx512f")] {
220        Self { avx512: bitor_m512(self.avx512, rhs.avx512) }
221      } else {
222        Self {
223          a : self.a.bitor(rhs.a),
224          b : self.b.bitor(rhs.b),
225        }
226      }
227    }
228  }
229}
230
231impl BitXor for f32x16 {
232  type Output = Self;
233  #[inline]
234  fn bitxor(self, rhs: Self) -> Self::Output {
235    pick! {
236      if #[cfg(target_feature="avx512f")] {
237        Self { avx512: bitxor_m512(self.avx512, rhs.avx512) }
238      } else {
239        Self {
240          a : self.a.bitxor(rhs.a),
241          b : self.b.bitxor(rhs.b),
242        }
243      }
244    }
245  }
246}
247
248impl CmpEq for f32x16 {
249  type Output = Self;
250  #[inline]
251  fn simd_eq(self, rhs: Self) -> Self::Output {
252    pick! {
253      if #[cfg(target_feature="avx512f")] {
254        Self { avx512: cmp_op_mask_m512::<{cmp_op!(EqualOrdered)}>(self.avx512, rhs.avx512) }
255      } else {
256        Self {
257          a : self.a.simd_eq(rhs.a),
258          b : self.b.simd_eq(rhs.b),
259        }
260      }
261    }
262  }
263}
264
265impl CmpGt for f32x16 {
266  type Output = Self;
267  #[inline]
268  fn simd_gt(self, rhs: Self) -> Self::Output {
269    pick! {
270      if #[cfg(target_feature="avx512f")] {
271        Self { avx512: cmp_op_mask_m512::<{cmp_op!(GreaterThanOrdered)}>(self.avx512, rhs.avx512) }
272      } else {
273        Self {
274          a : self.a.simd_gt(rhs.a),
275          b : self.b.simd_gt(rhs.b),
276        }
277      }
278    }
279  }
280}
281
282impl CmpGe for f32x16 {
283  type Output = Self;
284  #[inline]
285  fn simd_ge(self, rhs: Self) -> Self::Output {
286    pick! {
287      if #[cfg(target_feature="avx512f")] {
288        Self { avx512: cmp_op_mask_m512::<{cmp_op!(GreaterEqualOrdered)}>(self.avx512, rhs.avx512) }
289      } else {
290        Self {
291          a : self.a.simd_ge(rhs.a),
292          b : self.b.simd_ge(rhs.b),
293        }
294      }
295    }
296  }
297}
298
299impl CmpLt for f32x16 {
300  type Output = Self;
301  #[inline]
302  fn simd_lt(self, rhs: Self) -> Self::Output {
303    pick! {
304      if #[cfg(target_feature="avx512f")] {
305        Self { avx512: cmp_op_mask_m512::<{cmp_op!(LessThanOrdered)}>(self.avx512, rhs.avx512) }
306      } else {
307        Self {
308          a : self.a.simd_lt(rhs.a),
309          b : self.b.simd_lt(rhs.b),
310        }
311      }
312    }
313  }
314}
315
316impl CmpLe for f32x16 {
317  type Output = Self;
318  #[inline]
319  fn simd_le(self, rhs: Self) -> Self::Output {
320    pick! {
321      if #[cfg(target_feature="avx512f")] {
322        Self { avx512: cmp_op_mask_m512::<{cmp_op!(LessEqualOrdered)}>(self.avx512, rhs.avx512) }
323      } else {
324        Self {
325          a : self.a.simd_le(rhs.a),
326          b : self.b.simd_le(rhs.b),
327        }
328      }
329    }
330  }
331}
332
333impl CmpNe for f32x16 {
334  type Output = Self;
335  #[inline]
336  fn simd_ne(self, rhs: Self) -> Self::Output {
337    pick! {
338      if #[cfg(target_feature="avx512f")] {
339        Self { avx512: cmp_op_mask_m512::<{cmp_op!(NotEqualOrdered)}>(self.avx512, rhs.avx512) }
340      } else {
341        Self {
342          a : self.a.simd_ne(rhs.a),
343          b : self.b.simd_ne(rhs.b),
344        }
345      }
346    }
347  }
348}
349
350impl f32x16 {
351  #[inline]
352  #[must_use]
353  pub const fn new(array: [f32; 16]) -> Self {
354    unsafe { core::mem::transmute(array) }
355  }
356
357  #[inline]
358  #[must_use]
359  pub fn blend(self, t: Self, f: Self) -> Self {
360    pick! {
361      if #[cfg(target_feature="avx512f")] {
362        Self { avx512: blend_varying_m512(f.avx512, t.avx512, movepi32_mask_m512(self.avx512)) }
363      } else {
364        Self {
365          a : self.a.blend(t.a, f.a),
366          b : self.b.blend(t.b, f.b),
367        }
368      }
369    }
370  }
371
372  #[inline]
373  #[must_use]
374  pub fn abs(self) -> Self {
375    pick! {
376      if #[cfg(target_feature="avx512f")] {
377        let non_sign_bits = f32x16::from(f32::from_bits(i32::MAX as u32));
378        self & non_sign_bits
379      } else {
380        Self {
381          a : self.a.abs(),
382          b : self.b.abs(),
383        }
384      }
385    }
386  }
387
388  #[inline]
389  #[must_use]
390  pub fn floor(self) -> Self {
391    pick! {
392      if #[cfg(target_feature="avx512f")] {
393        Self { avx512: round_m512::<{round_op!(NegInf)}>(self.avx512) }
394      } else {
395        Self {
396          a : self.a.floor(),
397          b : self.b.floor(),
398        }
399      }
400    }
401  }
402
403  #[inline]
404  #[must_use]
405  pub fn ceil(self) -> Self {
406    pick! {
407      if #[cfg(target_feature="avx512f")] {
408        Self { avx512: round_m512::<{round_op!(PosInf)}>(self.avx512) }
409      } else {
410        Self {
411          a : self.a.ceil(),
412          b : self.b.ceil(),
413        }
414      }
415    }
416  }
417
418  /// Calculates the lanewise maximum of both vectors. This is a faster
419  /// implementation than `max`, but it doesn't specify any behavior if NaNs are
420  /// involved.
421  #[inline]
422  #[must_use]
423  pub fn fast_max(self, rhs: Self) -> Self {
424    pick! {
425      if #[cfg(target_feature="avx512f")] {
426        Self { avx512: max_m512(self.avx512, rhs.avx512) }
427      } else {
428        Self {
429          a: self.a.fast_max(rhs.a),
430          b: self.b.fast_max(rhs.b),
431        }
432      }
433    }
434  }
435
436  #[inline]
437  #[must_use]
438  pub fn max(self, rhs: Self) -> Self {
439    pick! {
440      if #[cfg(target_feature="avx512f")] {
441        // max_m512 seems to do rhs < self ? self : rhs. So if there's any NaN
442        // involved, it chooses rhs, so we need to specifically check rhs for
443        // NaN.
444        rhs.is_nan().blend(self, Self { avx512: max_m512(self.avx512, rhs.avx512) })
445      } else {
446        Self {
447          a: self.a.max(rhs.a),
448          b: self.b.max(rhs.b),
449        }
450      }
451    }
452  }
453
454  /// Calculates the lanewise minimum of both vectors. This is a faster
455  /// implementation than `min`, but it doesn't specify any behavior if NaNs are
456  /// involved.
457  #[inline]
458  #[must_use]
459  pub fn fast_min(self, rhs: Self) -> Self {
460    pick! {
461      if #[cfg(target_feature="avx512f")] {
462        Self { avx512: min_m512(self.avx512, rhs.avx512) }
463      } else {
464        Self {
465          a: self.a.fast_min(rhs.a),
466          b: self.b.fast_min(rhs.b),
467        }
468      }
469    }
470  }
471
472  #[inline]
473  #[must_use]
474  pub fn min(self, rhs: Self) -> Self {
475    pick! {
476      if #[cfg(target_feature="avx512f")] {
477        // min_m512 seems to do rhs > self ? self : rhs. So if there's any NaN
478        // involved, it chooses rhs, so we need to specifically check rhs for
479        // NaN.
480        rhs.is_nan().blend(self, Self { avx512: min_m512(self.avx512, rhs.avx512) })
481      } else {
482        Self {
483          a: self.a.min(rhs.a),
484          b: self.b.min(rhs.b),
485        }
486      }
487    }
488  }
489
490  #[inline]
491  #[must_use]
492  pub fn is_nan(self) -> Self {
493    pick! {
494      if #[cfg(target_feature = "avx512f")] {
495        Self { avx512: cmp_op_mask_m512::<{cmp_op!(Unordered)}>(self.avx512, self.avx512) }
496      } else {
497        Self {
498          a: self.a.is_nan(),
499          b: self.b.is_nan(),
500        }
501      }
502    }
503  }
504
505  #[inline]
506  #[must_use]
507  pub fn is_finite(self) -> Self {
508    let shifted_exp_mask = u32x16::splat(0xFF000000);
509    let u: u32x16 = cast(self);
510    let shift_u = u << 1_u32;
511    let out = !(shift_u & shifted_exp_mask).simd_eq(shifted_exp_mask);
512    cast(out)
513  }
514
515  #[inline]
516  #[must_use]
517  pub fn is_inf(self) -> Self {
518    let shifted_inf = u32x16::from(0xFF000000);
519    let u: u32x16 = cast(self);
520    let shift_u = u << 1_u64;
521    let out = (shift_u).simd_eq(shifted_inf);
522    cast(out)
523  }
524
525  #[inline]
526  #[must_use]
527  pub fn round(self) -> Self {
528    pick! {
529      if #[cfg(target_feature="avx512f")] {
530        Self { avx512: round_m512::<{round_op!(Nearest)}>(self.avx512) }
531      } else {
532        Self {
533          a: self.a.round(),
534          b: self.b.round(),
535        }
536      }
537    }
538  }
539
540  /// Rounds each lane into an integer. This is a faster implementation than
541  /// `round_int`, but it doesn't handle out of range values or NaNs. For those
542  /// values you get implementation defined behavior.
543  #[inline]
544  #[must_use]
545  pub fn fast_round_int(self) -> i32x16 {
546    pick! {
547      if #[cfg(target_feature="avx512f")] {
548        cast(convert_to_i32_m512i_from_m512(self.avx512))
549      } else {
550        i32x16 {
551          a: self.a.fast_round_int(),
552          b: self.b.fast_round_int(),
553        }
554      }
555    }
556  }
557
558  /// Rounds each lane into an integer. This saturates out of range values and
559  /// turns NaNs into 0. Use `fast_round_int` for a faster implementation that
560  /// doesn't handle out of range values or NaNs.
561  #[inline]
562  #[must_use]
563  pub fn round_int(self) -> i32x16 {
564    pick! {
565      if #[cfg(target_feature="avx512f")] {
566        // Based on: https://github.com/v8/v8/blob/210987a552a2bf2a854b0baa9588a5959ff3979d/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h#L489-L504
567        let non_nan_mask = self.simd_eq(self);
568        let non_nan = self & non_nan_mask;
569        let flip_to_max: i32x16 = cast(self.simd_ge(Self::splat(2147483648.0)));
570        let cast: i32x16 = cast(convert_to_i32_m512i_from_m512(non_nan.avx512));
571        flip_to_max ^ cast
572      } else {
573        i32x16 {
574          a: self.a.round_int(),
575          b: self.b.round_int(),
576        }
577      }
578    }
579  }
580
581  /// Truncates each lane into an integer. This is a faster implementation than
582  /// `trunc_int`, but it doesn't handle out of range values or NaNs. For those
583  /// values you get implementation defined behavior.
584  #[inline]
585  #[must_use]
586  pub fn fast_trunc_int(self) -> i32x16 {
587    pick! {
588      if #[cfg(all(target_feature="avx512f"))] {
589        cast(convert_truncate_m512_i32_m512i(self.avx512))
590      } else {
591        cast([
592          self.a.fast_trunc_int(),
593          self.b.fast_trunc_int(),
594        ])
595      }
596    }
597  }
598
599  /// Truncates each lane into an integer. This saturates out of range values
600  /// and turns NaNs into 0. Use `fast_trunc_int` for a faster implementation
601  /// that doesn't handle out of range values or NaNs.
602  #[inline]
603  #[must_use]
604  pub fn trunc_int(self) -> i32x16 {
605    pick! {
606        if #[cfg(target_feature="avx512f")] {
607        // Based on: https://github.com/v8/v8/blob/210987a552a2bf2a854b0baa9588a5959ff3979d/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h#L489-L504
608        let non_nan_mask = self.simd_eq(self);
609        let non_nan = self & non_nan_mask;
610        let flip_to_max: i32x16 = cast(self.simd_ge(Self::splat(2147483648.0)));
611        let cast: i32x16 = cast(convert_truncate_m512_i32_m512i(non_nan.avx512));
612        flip_to_max ^ cast
613      } else {
614        cast([
615          self.a.trunc_int(),
616          self.b.trunc_int(),
617        ])
618      }
619    }
620  }
621
622  /// Performs a multiply-add operation: `self * m + a`
623  ///
624  /// When hardware FMA support is available, this computes the result with a
625  /// single rounding operation. Without FMA support, it falls back to separate
626  /// multiply and add operations with two roundings.
627  ///
628  /// # Platform-specific behavior
629  /// - On `x86`/`x86_64` with AVX-512F+FMA: Uses 512-bit `vfmadd` (single
630  ///   rounding, best accuracy)
631  /// - On `x86`/`x86_64` with AVX-512F only: Uses `(self * m) + a` (two
632  ///   roundings)
633  /// - Other platforms: Delegates to [`f32x8`] (inherits its FMA behavior)
634  ///
635  /// # Examples
636  /// ```
637  /// # use wide::f32x16;
638  /// let a = f32x16::from([1.0; 16]);
639  /// let b = f32x16::from([2.0; 16]);
640  /// let c = f32x16::from([10.0; 16]);
641  ///
642  /// let result = a.mul_add(b, c);
643  ///
644  /// let expected = f32x16::from([12.0; 16]);
645  /// assert_eq!(result, expected);
646  /// ```
647  #[inline]
648  #[must_use]
649  pub fn mul_add(self, m: Self, a: Self) -> Self {
650    pick! {
651      if #[cfg(all(target_feature="avx512f",target_feature="fma"))] {
652        Self { avx512: fused_mul_add_m512(self.avx512, m.avx512, a.avx512) }
653      } else if #[cfg(target_feature="avx512f")] {
654        // still want to use 512 bit ops
655        (self * m) + a
656      } else {
657        Self {
658          a: self.a.mul_add(m.a, a.a),
659          b: self.b.mul_add(m.b, a.b),
660        }
661      }
662    }
663  }
664
665  /// Performs a multiply-subtract operation: `self * m - s`
666  ///
667  /// When hardware FMA support is available, this computes the result with a
668  /// single rounding operation. Without FMA support, it falls back to separate
669  /// multiply and subtract operations with two roundings.
670  ///
671  /// # Platform-specific behavior
672  /// - On `x86`/`x86_64` with AVX-512F+FMA: Uses 512-bit `vfmsub` (single
673  ///   rounding, best accuracy)
674  /// - On `x86`/`x86_64` with AVX-512F only: Uses `(self * m) - s` (two
675  ///   roundings)
676  /// - Other platforms: Delegates to [`f32x8`] (inherits its FMA behavior)
677  ///
678  /// # Examples
679  /// ```
680  /// # use wide::f32x16;
681  /// let a = f32x16::from([10.0; 16]);
682  /// let b = f32x16::from([3.0; 16]);
683  /// let c = f32x16::from([5.0; 16]);
684  ///
685  /// let result = a.mul_sub(b, c);
686  ///
687  /// let expected = f32x16::from([25.0; 16]);
688  /// assert_eq!(result, expected);
689  /// ```
690  #[inline]
691  #[must_use]
692  pub fn mul_sub(self, m: Self, s: Self) -> Self {
693    pick! {
694      if #[cfg(all(target_feature="avx512f",target_feature="fma"))] {
695        Self { avx512: fused_mul_sub_m512(self.avx512, m.avx512, s.avx512) }
696      } else if #[cfg(target_feature="avx512f")] {
697        // still want to use 512 bit ops
698        (self * m) - s
699      } else {
700        Self {
701          a: self.a.mul_sub(m.a, s.a),
702          b: self.b.mul_sub(m.b, s.b),
703        }
704      }
705    }
706  }
707
708  /// Performs a negative multiply-add operation: `a - (self * m)`
709  ///
710  /// When hardware FMA support is available, this computes the result with a
711  /// single rounding operation. Without FMA support, it falls back to separate
712  /// operations with two roundings.
713  ///
714  /// # Platform-specific behavior
715  /// - On `x86`/`x86_64` with AVX-512F+FMA: Uses 512-bit `vfnmadd` (single
716  ///   rounding, best accuracy)
717  /// - On `x86`/`x86_64` with AVX-512F only: Uses `a - (self * m)` (two
718  ///   roundings)
719  /// - Other platforms: Delegates to [`f32x8`] (inherits its FMA behavior)
720  ///
721  /// # Examples
722  /// ```
723  /// # use wide::f32x16;
724  /// let a = f32x16::from([4.0; 16]);
725  /// let b = f32x16::from([2.0; 16]);
726  /// let c = f32x16::from([10.0; 16]);
727  ///
728  /// let result = a.mul_neg_add(b, c);
729  ///
730  /// let expected = f32x16::from([2.0; 16]);
731  /// assert_eq!(result, expected);
732  /// ```
733  #[inline]
734  #[must_use]
735  pub fn mul_neg_add(self, m: Self, a: Self) -> Self {
736    pick! {
737      if #[cfg(all(target_feature="avx512f",target_feature="fma"))] {
738        Self { avx512: fused_mul_neg_add_m512(self.avx512, m.avx512, a.avx512) }
739      } else if #[cfg(target_feature="avx512f")] {
740        // still want to use 512 bit ops
741        a - (self * m)
742      } else {
743        Self {
744          a: self.a.mul_neg_add(m.a, a.a),
745          b: self.b.mul_neg_add(m.b, a.b),
746        }
747      }
748    }
749  }
750
751  /// Performs a negative multiply-subtract operation: `-(self * m) - s`
752  ///
753  /// When hardware FMA support is available, this computes the result with a
754  /// single rounding operation. Without FMA support, it falls back to separate
755  /// operations with two roundings.
756  ///
757  /// # Platform-specific behavior
758  /// - On `x86`/`x86_64` with AVX-512F+FMA: Uses 512-bit `vfnmsub` (single
759  ///   rounding, best accuracy)
760  /// - On `x86`/`x86_64` with AVX-512F only: Uses `-(self * m) - s` (two
761  ///   roundings)
762  /// - Other platforms: Delegates to [`f32x8`] (inherits its FMA behavior)
763  ///
764  /// # Examples
765  /// ```
766  /// # use wide::f32x16;
767  /// let a = f32x16::from([4.0; 16]);
768  /// let b = f32x16::from([2.0; 16]);
769  /// let c = f32x16::from([1.0; 16]);
770  ///
771  /// let result = a.mul_neg_sub(b, c);
772  ///
773  /// let expected = f32x16::from([-9.0; 16]);
774  /// assert_eq!(result, expected);
775  /// ```
776  #[inline]
777  #[must_use]
778  pub fn mul_neg_sub(self, m: Self, s: Self) -> Self {
779    pick! {
780      if #[cfg(all(target_feature="avx512f",target_feature="fma"))] {
781        Self { avx512: fused_mul_neg_sub_m512(self.avx512, m.avx512, s.avx512) }
782      } else if #[cfg(target_feature="avx512f")] {
783        // still want to use 512 bit ops
784        -(self * m) - s
785      } else {
786        Self {
787          a: self.a.mul_neg_sub(m.a, s.a),
788          b: self.b.mul_neg_sub(m.b, s.b),
789        }
790      }
791    }
792  }
793
794  #[inline]
795  #[must_use]
796  pub fn flip_signs(self, signs: Self) -> Self {
797    self ^ (signs & Self::from(-0.0))
798  }
799
800  #[inline]
801  #[must_use]
802  pub fn copysign(self, sign: Self) -> Self {
803    let magnitude_mask = Self::from(f32::from_bits(u32::MAX >> 1));
804    (self & magnitude_mask) | (sign & Self::from(-0.0))
805  }
806
807  #[inline]
808  pub fn asin_acos(self) -> (Self, Self) {
809    // Based on the Agner Fog "vector class library":
810    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
811    const_f32_as_f32x16!(P4asinf, 4.2163199048E-2);
812    const_f32_as_f32x16!(P3asinf, 2.4181311049E-2);
813    const_f32_as_f32x16!(P2asinf, 4.5470025998E-2);
814    const_f32_as_f32x16!(P1asinf, 7.4953002686E-2);
815    const_f32_as_f32x16!(P0asinf, 1.6666752422E-1);
816
817    let xa = self.abs();
818    let big = xa.simd_ge(f32x16::splat(0.5));
819
820    let x1 = f32x16::splat(0.5) * (f32x16::ONE - xa);
821    let x2 = xa * xa;
822    let x3 = big.blend(x1, x2);
823
824    let xb = x1.sqrt();
825
826    let x4 = big.blend(xb, xa);
827
828    let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
829    let z = z.mul_add(x3 * x4, x4);
830
831    let z1 = z + z;
832
833    // acos
834    let z3 = self.simd_lt(f32x16::ZERO).blend(f32x16::PI - z1, z1);
835    let z4 = f32x16::FRAC_PI_2 - z.flip_signs(self);
836    let acos = big.blend(z3, z4);
837
838    // asin
839    let z3 = f32x16::FRAC_PI_2 - z1;
840    let asin = big.blend(z3, z);
841    let asin = asin.flip_signs(self);
842
843    (asin, acos)
844  }
845
846  #[inline]
847  #[must_use]
848  pub fn asin(self) -> Self {
849    // Based on the Agner Fog "vector class library":
850    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
851    const_f32_as_f32x16!(P4asinf, 4.2163199048E-2);
852    const_f32_as_f32x16!(P3asinf, 2.4181311049E-2);
853    const_f32_as_f32x16!(P2asinf, 4.5470025998E-2);
854    const_f32_as_f32x16!(P1asinf, 7.4953002686E-2);
855    const_f32_as_f32x16!(P0asinf, 1.6666752422E-1);
856
857    let xa = self.abs();
858    let big = xa.simd_ge(f32x16::splat(0.5));
859
860    let x1 = f32x16::splat(0.5) * (f32x16::ONE - xa);
861    let x2 = xa * xa;
862    let x3 = big.blend(x1, x2);
863
864    let xb = x1.sqrt();
865
866    let x4 = big.blend(xb, xa);
867
868    let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
869    let z = z.mul_add(x3 * x4, x4);
870
871    let z1 = z + z;
872
873    // asin
874    let z3 = f32x16::FRAC_PI_2 - z1;
875    let asin = big.blend(z3, z);
876    let asin = asin.flip_signs(self);
877
878    asin
879  }
880
881  #[inline]
882  #[must_use]
883  pub fn acos(self) -> Self {
884    // Based on the Agner Fog "vector class library":
885    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
886    const_f32_as_f32x16!(P4asinf, 4.2163199048E-2);
887    const_f32_as_f32x16!(P3asinf, 2.4181311049E-2);
888    const_f32_as_f32x16!(P2asinf, 4.5470025998E-2);
889    const_f32_as_f32x16!(P1asinf, 7.4953002686E-2);
890    const_f32_as_f32x16!(P0asinf, 1.6666752422E-1);
891
892    let xa = self.abs();
893    let big = xa.simd_ge(f32x16::splat(0.5));
894
895    let x1 = f32x16::splat(0.5) * (f32x16::ONE - xa);
896    let x2 = xa * xa;
897    let x3 = big.blend(x1, x2);
898
899    let xb = x1.sqrt();
900
901    let x4 = big.blend(xb, xa);
902
903    let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
904    let z = z.mul_add(x3 * x4, x4);
905
906    let z1 = z + z;
907
908    // acos
909    let z3 = self.simd_lt(f32x16::ZERO).blend(f32x16::PI - z1, z1);
910    let z4 = f32x16::FRAC_PI_2 - z.flip_signs(self);
911    let acos = big.blend(z3, z4);
912
913    acos
914  }
915
916  #[inline]
917  pub fn atan(self) -> Self {
918    // Based on the Agner Fog "vector class library":
919    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
920    const_f32_as_f32x16!(P3atanf, 8.05374449538E-2);
921    const_f32_as_f32x16!(P2atanf, -1.38776856032E-1);
922    const_f32_as_f32x16!(P1atanf, 1.99777106478E-1);
923    const_f32_as_f32x16!(P0atanf, -3.33329491539E-1);
924
925    let t = self.abs();
926
927    // small:  z = t / 1.0;
928    // medium: z = (t-1.0) / (t+1.0);
929    // big:    z = -1.0 / t;
930    let notsmal = t.simd_ge(Self::SQRT_2 - Self::ONE);
931    let notbig = t.simd_le(Self::SQRT_2 + Self::ONE);
932
933    let mut s = notbig.blend(Self::FRAC_PI_4, Self::FRAC_PI_2);
934    s = notsmal & s;
935
936    let mut a = notbig & t;
937    a = notsmal.blend(a - Self::ONE, a);
938    let mut b = notbig & Self::ONE;
939    b = notsmal.blend(b + t, b);
940    let z = a / b;
941
942    let zz = z * z;
943
944    // Taylor expansion
945    let mut re = polynomial_3!(zz, P0atanf, P1atanf, P2atanf, P3atanf);
946    re = re.mul_add(zz * z, z) + s;
947
948    // get sign bit
949    re = (self.sign_bit()).blend(-re, re);
950
951    re
952  }
953
954  #[inline]
955  pub fn atan2(self, x: Self) -> Self {
956    // Based on the Agner Fog "vector class library":
957    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
958    const_f32_as_f32x16!(P3atanf, 8.05374449538E-2);
959    const_f32_as_f32x16!(P2atanf, -1.38776856032E-1);
960    const_f32_as_f32x16!(P1atanf, 1.99777106478E-1);
961    const_f32_as_f32x16!(P0atanf, -3.33329491539E-1);
962
963    let y = self;
964
965    // move in first octant
966    let x1 = x.abs();
967    let y1 = y.abs();
968    let swapxy = y1.simd_gt(x1);
969    // swap x and y if y1 > x1
970    let mut x2 = swapxy.blend(y1, x1);
971    let mut y2 = swapxy.blend(x1, y1);
972
973    // check for special case: x and y are both +/- INF
974    let both_infinite = x.is_inf() & y.is_inf();
975    if both_infinite.any() {
976      let minus_one = -Self::ONE;
977      x2 = both_infinite.blend(x2 & minus_one, x2);
978      y2 = both_infinite.blend(y2 & minus_one, y2);
979    }
980
981    // x = y = 0 will produce NAN. No problem, fixed below
982    let t = y2 / x2;
983
984    // small:  z = t / 1.0;
985    // medium: z = (t-1.0) / (t+1.0);
986    let notsmal = t.simd_ge(Self::SQRT_2 - Self::ONE);
987
988    let a = notsmal.blend(t - Self::ONE, t);
989    let b = notsmal.blend(t + Self::ONE, Self::ONE);
990    let s = notsmal & Self::FRAC_PI_4;
991    let z = a / b;
992
993    let zz = z * z;
994
995    // Taylor expansion
996    let mut re = polynomial_3!(zz, P0atanf, P1atanf, P2atanf, P3atanf);
997    re = re.mul_add(zz * z, z) + s;
998
999    // move back in place
1000    re = swapxy.blend(Self::FRAC_PI_2 - re, re);
1001    re = ((x | y).simd_eq(Self::ZERO)).blend(Self::ZERO, re);
1002    re = (x.sign_bit()).blend(Self::PI - re, re);
1003
1004    // get sign bit
1005    re = (y.sign_bit()).blend(-re, re);
1006
1007    re
1008  }
1009
1010  #[inline]
1011  #[must_use]
1012  pub fn sin_cos(self) -> (Self, Self) {
1013    // Based on the Agner Fog "vector class library":
1014    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
1015
1016    const_f32_as_f32x16!(DP1F, 0.78515625_f32 * 2.0);
1017    const_f32_as_f32x16!(DP2F, 2.4187564849853515625E-4_f32 * 2.0);
1018    const_f32_as_f32x16!(DP3F, 3.77489497744594108E-8_f32 * 2.0);
1019
1020    const_f32_as_f32x16!(P0sinf, -1.6666654611E-1);
1021    const_f32_as_f32x16!(P1sinf, 8.3321608736E-3);
1022    const_f32_as_f32x16!(P2sinf, -1.9515295891E-4);
1023
1024    const_f32_as_f32x16!(P0cosf, 4.166664568298827E-2);
1025    const_f32_as_f32x16!(P1cosf, -1.388731625493765E-3);
1026    const_f32_as_f32x16!(P2cosf, 2.443315711809948E-5);
1027
1028    const_f32_as_f32x16!(TWO_OVER_PI, 2.0 / core::f32::consts::PI);
1029
1030    let xa = self.abs();
1031
1032    // Find quadrant
1033    let y = (xa * TWO_OVER_PI).round();
1034    let q: i32x16 = y.round_int();
1035
1036    let x = y.mul_neg_add(DP3F, y.mul_neg_add(DP2F, y.mul_neg_add(DP1F, xa)));
1037
1038    let x2 = x * x;
1039    let mut s = polynomial_2!(x2, P0sinf, P1sinf, P2sinf) * (x * x2) + x;
1040    let mut c = polynomial_2!(x2, P0cosf, P1cosf, P2cosf) * (x2 * x2)
1041      + f32x16::from(0.5).mul_neg_add(x2, f32x16::from(1.0));
1042
1043    let swap = !(q & i32x16::from(1)).simd_eq(i32x16::from(0));
1044
1045    let mut overflow: f32x16 = cast(q.simd_gt(i32x16::from(0x2000000)));
1046    overflow &= xa.is_finite();
1047    s = overflow.blend(f32x16::from(0.0), s);
1048    c = overflow.blend(f32x16::from(1.0), c);
1049
1050    // calc sin
1051    let mut sin1 = cast::<_, f32x16>(swap).blend(c, s);
1052    let sign_sin: i32x16 = (q << 30) ^ cast::<_, i32x16>(self);
1053    sin1 = sin1.flip_signs(cast(sign_sin));
1054
1055    // calc cos
1056    let mut cos1 = cast::<_, f32x16>(swap).blend(s, c);
1057    let sign_cos: i32x16 = ((q + i32x16::from(1)) & i32x16::from(2)) << 30;
1058    cos1 ^= cast::<_, f32x16>(sign_cos);
1059
1060    (sin1, cos1)
1061  }
1062
1063  #[inline]
1064  #[must_use]
1065  pub fn sin(self) -> Self {
1066    let (s, _) = self.sin_cos();
1067    s
1068  }
1069
1070  #[inline]
1071  #[must_use]
1072  pub fn cos(self) -> Self {
1073    let (_, c) = self.sin_cos();
1074    c
1075  }
1076
1077  #[inline]
1078  #[must_use]
1079  pub fn tan(self) -> Self {
1080    let (s, c) = self.sin_cos();
1081    s / c
1082  }
1083
1084  #[inline]
1085  #[must_use]
1086  pub fn to_degrees(self) -> Self {
1087    const_f32_as_f32x16!(RAD_TO_DEG_RATIO, 180.0_f32 / core::f32::consts::PI);
1088    self * RAD_TO_DEG_RATIO
1089  }
1090
1091  #[inline]
1092  #[must_use]
1093  pub fn to_radians(self) -> Self {
1094    const_f32_as_f32x16!(DEG_TO_RAD_RATIO, core::f32::consts::PI / 180.0_f32);
1095    self * DEG_TO_RAD_RATIO
1096  }
1097
1098  #[inline]
1099  #[must_use]
1100  pub fn recip(self) -> Self {
1101    pick! {
1102      if #[cfg(target_feature="avx512f")] {
1103        // TODO: Add `_mm512_rcp14_ps` to `safe_arch`, looks like it is missing,
1104        // then consider updating this implementation if the relative error is
1105        // acceptable.
1106        1.0 / self
1107      } else {
1108        Self {
1109          a : self.a.recip(),
1110          b : self.b.recip(),
1111        }
1112      }
1113    }
1114  }
1115
1116  #[inline]
1117  #[must_use]
1118  pub fn recip_sqrt(self) -> Self {
1119    pick! {
1120      if #[cfg(target_feature="avx512f")] {
1121        // TODO: Add `_mm512_rsqrt14_ps` to `safe_arch`, looks like it is
1122        // missing, then consider updating this implementation if the relative
1123        // error is acceptable.
1124        self.sqrt().recip()
1125      } else {
1126        Self {
1127          a : self.a.recip_sqrt(),
1128          b : self.b.recip_sqrt(),
1129        }
1130      }
1131    }
1132  }
1133
1134  #[inline]
1135  #[must_use]
1136  pub fn sqrt(self) -> Self {
1137    pick! {
1138      if #[cfg(target_feature="avx512f")] {
1139        Self { avx512: sqrt_m512(self.avx512) }
1140      } else {
1141        Self {
1142          a : self.a.sqrt(),
1143          b : self.b.sqrt(),
1144        }
1145      }
1146    }
1147  }
1148
1149  #[inline]
1150  #[must_use]
1151  #[doc(alias("movemask", "move_mask"))]
1152  pub fn to_bitmask(self) -> u32 {
1153    pick! {
1154      if #[cfg(target_feature="avx512f")] {
1155        movepi32_mask_m512(self.avx512) as u32
1156      } else {
1157        (self.b.to_bitmask() << 8) | self.a.to_bitmask()
1158      }
1159    }
1160  }
1161
1162  #[inline]
1163  #[must_use]
1164  pub fn any(self) -> bool {
1165    pick! {
1166      if #[cfg(target_feature="avx512f")] {
1167        movepi32_mask_m512(self.avx512) != 0
1168      } else {
1169        self.a.any() || self.b.any()
1170      }
1171    }
1172  }
1173
1174  #[inline]
1175  #[must_use]
1176  pub fn all(self) -> bool {
1177    pick! {
1178      if #[cfg(target_feature="avx512f")] {
1179        movepi32_mask_m512(self.avx512) == !0_u16
1180      } else {
1181        self.a.all() && self.b.all()
1182      }
1183    }
1184  }
1185
1186  #[inline]
1187  #[must_use]
1188  pub fn none(self) -> bool {
1189    !self.any()
1190  }
1191
1192  #[inline]
1193  fn vm_pow2n(self) -> Self {
1194    const_f32_as_f32x16!(pow2_23, 8388608.0);
1195    const_f32_as_f32x16!(bias, 127.0);
1196    let a = self + (bias + pow2_23);
1197    let c = cast::<_, i32x16>(a) << 23;
1198    cast::<_, f32x16>(c)
1199  }
1200
1201  /// Calculate the exponent of a packed `f32x16`
1202  #[inline]
1203  #[must_use]
1204  pub fn exp(self) -> Self {
1205    const_f32_as_f32x16!(P0, 1.0 / 2.0);
1206    const_f32_as_f32x16!(P1, 1.0 / 6.0);
1207    const_f32_as_f32x16!(P2, 1. / 24.);
1208    const_f32_as_f32x16!(P3, 1. / 120.);
1209    const_f32_as_f32x16!(P4, 1. / 720.);
1210    const_f32_as_f32x16!(P5, 1. / 5040.);
1211    const_f32_as_f32x16!(LN2D_HI, 0.693359375);
1212    const_f32_as_f32x16!(LN2D_LO, -2.12194440e-4);
1213    let max_x = f32x16::from(87.3);
1214    let r = (self * Self::LOG2_E).round();
1215    let x = r.mul_neg_add(LN2D_HI, self);
1216    let x = r.mul_neg_add(LN2D_LO, x);
1217    let z = polynomial_5!(x, P0, P1, P2, P3, P4, P5);
1218    let x2 = x * x;
1219    let z = z.mul_add(x2, x);
1220    let n2 = Self::vm_pow2n(r);
1221    let z = (z + Self::ONE) * n2;
1222    // check for overflow
1223    let in_range = self.abs().simd_lt(max_x);
1224    let in_range = in_range & self.is_finite();
1225    in_range.blend(z, Self::ZERO)
1226  }
1227
1228  #[inline]
1229  fn exponent(self) -> Self {
1230    const_f32_as_f32x16!(pow2_23, 8388608.0);
1231    const_f32_as_f32x16!(bias, 127.0);
1232    let a = cast::<_, u32x16>(self);
1233    let b = a >> 23;
1234    let c = b | cast::<_, u32x16>(pow2_23);
1235    let d = cast::<_, f32x16>(c);
1236    let e = d - (pow2_23 + bias);
1237    e
1238  }
1239
1240  #[inline]
1241  fn fraction_2(self) -> Self {
1242    let t1 = cast::<_, u32x16>(self);
1243    let t2 = cast::<_, u32x16>(
1244      (t1 & u32x16::from(0x007FFFFF)) | u32x16::from(0x3F000000),
1245    );
1246    cast::<_, f32x16>(t2)
1247  }
1248
1249  #[inline]
1250  fn is_zero_or_subnormal(self) -> Self {
1251    let t = cast::<_, i32x16>(self);
1252    let t = t & i32x16::splat(0x7F800000);
1253    i32x16::round_float(t.simd_eq(i32x16::splat(0)))
1254  }
1255
1256  #[inline]
1257  fn infinity() -> Self {
1258    cast::<_, f32x16>(i32x16::splat(0x7F800000))
1259  }
1260
1261  #[inline]
1262  fn nan_log() -> Self {
1263    cast::<_, f32x16>(i32x16::splat(0x7FC00000 | 0x101 & 0x003FFFFF))
1264  }
1265
1266  #[inline]
1267  fn nan_pow() -> Self {
1268    cast::<_, f32x16>(i32x16::splat(0x7FC00000 | 0x101 & 0x003FFFFF))
1269  }
1270
1271  #[inline]
1272  pub fn sign_bit(self) -> Self {
1273    let t1 = cast::<_, i32x16>(self);
1274    let t2 = t1 >> 31;
1275    !cast::<_, f32x16>(t2).simd_eq(f32x16::ZERO)
1276  }
1277
1278  /// horizontal add of all the elements of the vector
1279  #[inline]
1280  #[must_use]
1281  pub fn reduce_add(self) -> f32 {
1282    pick! {
1283      if #[cfg(target_feature="avx512f")]{
1284        reduce_add_m512(self.avx512)
1285      } else {
1286        self.a.reduce_add() + self.b.reduce_add()
1287      }
1288    }
1289  }
1290
1291  /// Natural log (ln(x))
1292  #[inline]
1293  #[must_use]
1294  pub fn ln(self) -> Self {
1295    const_f32_as_f32x16!(HALF, 0.5);
1296    const_f32_as_f32x16!(P0, 3.3333331174E-1);
1297    const_f32_as_f32x16!(P1, -2.4999993993E-1);
1298    const_f32_as_f32x16!(P2, 2.0000714765E-1);
1299    const_f32_as_f32x16!(P3, -1.6668057665E-1);
1300    const_f32_as_f32x16!(P4, 1.4249322787E-1);
1301    const_f32_as_f32x16!(P5, -1.2420140846E-1);
1302    const_f32_as_f32x16!(P6, 1.1676998740E-1);
1303    const_f32_as_f32x16!(P7, -1.1514610310E-1);
1304    const_f32_as_f32x16!(P8, 7.0376836292E-2);
1305    const_f32_as_f32x16!(LN2F_HI, 0.693359375);
1306    const_f32_as_f32x16!(LN2F_LO, -2.12194440e-4);
1307    const_f32_as_f32x16!(VM_SMALLEST_NORMAL, 1.17549435E-38);
1308
1309    let x1 = self;
1310    let x = Self::fraction_2(x1);
1311    let e = Self::exponent(x1);
1312    let mask = x.simd_gt(Self::SQRT_2 * HALF);
1313    let x = (!mask).blend(x + x, x);
1314    let fe = mask.blend(e + Self::ONE, e);
1315    let x = x - Self::ONE;
1316    let res = polynomial_8!(x, P0, P1, P2, P3, P4, P5, P6, P7, P8);
1317    let x2 = x * x;
1318    let res = x2 * x * res;
1319    let res = fe.mul_add(LN2F_LO, res);
1320    let res = res + x2.mul_neg_add(HALF, x);
1321    let res = fe.mul_add(LN2F_HI, res);
1322    let overflow = !self.is_finite();
1323    let underflow = x1.simd_lt(VM_SMALLEST_NORMAL);
1324    let mask = overflow | underflow;
1325    if !mask.any() {
1326      res
1327    } else {
1328      let is_zero = self.is_zero_or_subnormal();
1329      let res = underflow.blend(Self::nan_log(), res);
1330      let res = is_zero.blend(Self::infinity(), res);
1331      let res = overflow.blend(self, res);
1332      res
1333    }
1334  }
1335
1336  #[inline]
1337  #[must_use]
1338  pub fn log2(self) -> Self {
1339    Self::ln(self) * Self::LOG2_E
1340  }
1341
1342  #[inline]
1343  #[must_use]
1344  pub fn log10(self) -> Self {
1345    Self::ln(self) * Self::LOG10_E
1346  }
1347
1348  #[inline]
1349  #[must_use]
1350  pub fn pow_f32x16(self, y: Self) -> Self {
1351    const_f32_as_f32x16!(ln2f_hi, 0.693359375);
1352    const_f32_as_f32x16!(ln2f_lo, -2.12194440e-4);
1353    const_f32_as_f32x16!(P0logf, 3.3333331174E-1);
1354    const_f32_as_f32x16!(P1logf, -2.4999993993E-1);
1355    const_f32_as_f32x16!(P2logf, 2.0000714765E-1);
1356    const_f32_as_f32x16!(P3logf, -1.6668057665E-1);
1357    const_f32_as_f32x16!(P4logf, 1.4249322787E-1);
1358    const_f32_as_f32x16!(P5logf, -1.2420140846E-1);
1359    const_f32_as_f32x16!(P6logf, 1.1676998740E-1);
1360    const_f32_as_f32x16!(P7logf, -1.1514610310E-1);
1361    const_f32_as_f32x16!(P8logf, 7.0376836292E-2);
1362
1363    const_f32_as_f32x16!(p2expf, 1.0 / 2.0); // coefficients for Taylor expansion of exp
1364    const_f32_as_f32x16!(p3expf, 1.0 / 6.0);
1365    const_f32_as_f32x16!(p4expf, 1.0 / 24.0);
1366    const_f32_as_f32x16!(p5expf, 1.0 / 120.0);
1367    const_f32_as_f32x16!(p6expf, 1.0 / 720.0);
1368    const_f32_as_f32x16!(p7expf, 1.0 / 5040.0);
1369
1370    let x1 = self.abs();
1371    let x = x1.fraction_2();
1372    let mask = x.simd_gt(f32x16::SQRT_2 * f32x16::HALF);
1373    let x = (!mask).blend(x + x, x);
1374
1375    let x = x - f32x16::ONE;
1376    let x2 = x * x;
1377    let lg1 = polynomial_8!(
1378      x, P0logf, P1logf, P2logf, P3logf, P4logf, P5logf, P6logf, P7logf, P8logf
1379    );
1380    let lg1 = lg1 * x2 * x;
1381
1382    let ef = x1.exponent();
1383    let ef = mask.blend(ef + f32x16::ONE, ef);
1384    let e1 = (ef * y).round();
1385    let yr = ef.mul_sub(y, e1);
1386
1387    let lg = f32x16::HALF.mul_neg_add(x2, x) + lg1;
1388    let x2_err = (f32x16::HALF * x).mul_sub(x, f32x16::HALF * x2);
1389    let lg_err = f32x16::HALF.mul_add(x2, lg - x) - lg1;
1390
1391    let e2 = (lg * y * f32x16::LOG2_E).round();
1392    let v = lg.mul_sub(y, e2 * ln2f_hi);
1393    let v = e2.mul_neg_add(ln2f_lo, v);
1394    let v = v - (lg_err + x2_err).mul_sub(y, yr * f32x16::LN_2);
1395
1396    let x = v;
1397    let e3 = (x * f32x16::LOG2_E).round();
1398    let x = e3.mul_neg_add(f32x16::LN_2, x);
1399    let x2 = x * x;
1400    let z = x2.mul_add(
1401      polynomial_5!(x, p2expf, p3expf, p4expf, p5expf, p6expf, p7expf),
1402      x + f32x16::ONE,
1403    );
1404
1405    let ee = e1 + e2 + e3;
1406    let ei = cast::<_, i32x16>(ee.round_int());
1407    let ej = cast::<_, i32x16>(ei + (cast::<_, i32x16>(z) >> 23));
1408
1409    let overflow = cast::<_, f32x16>(ej.simd_gt(i32x16::splat(0x0FF)))
1410      | (ee.simd_gt(f32x16::splat(300.0)));
1411    let underflow = cast::<_, f32x16>(ej.simd_lt(i32x16::splat(0x000)))
1412      | (ee.simd_lt(f32x16::splat(-300.0)));
1413
1414    // Add exponent by integer addition
1415    let z = cast::<_, f32x16>(cast::<_, i32x16>(z) + (ei << 23));
1416    // Check for overflow/underflow
1417    let z = underflow.blend(f32x16::ZERO, z);
1418    let z = overflow.blend(Self::infinity(), z);
1419
1420    // Check for self == 0
1421    let x_zero = self.is_zero_or_subnormal();
1422    let z = x_zero.blend(
1423      y.simd_lt(f32x16::ZERO).blend(
1424        Self::infinity(),
1425        y.simd_eq(f32x16::ZERO).blend(f32x16::ONE, f32x16::ZERO),
1426      ),
1427      z,
1428    );
1429
1430    let x_sign = self.sign_bit();
1431    let z = if x_sign.any() {
1432      // Y into an integer
1433      let yi = y.simd_eq(y.round());
1434
1435      // Is y odd?
1436      let y_odd = cast::<_, i32x16>(y.round_int() << 31).round_float();
1437
1438      let z1 =
1439        yi.blend(z | y_odd, self.simd_eq(Self::ZERO).blend(z, Self::nan_pow()));
1440
1441      x_sign.blend(z1, z)
1442    } else {
1443      z
1444    };
1445
1446    let x_finite = self.is_finite();
1447    let y_finite = y.is_finite();
1448    let e_finite = ee.is_finite();
1449    if (x_finite & y_finite & (e_finite | x_zero)).all() {
1450      return z;
1451    }
1452
1453    (self.is_nan() | y.is_nan()).blend(self + y, z)
1454  }
1455
1456  #[inline]
1457  pub fn powf(self, y: f32) -> Self {
1458    Self::pow_f32x16(self, f32x16::splat(y))
1459  }
1460
1461  /// Transpose matrix of 16x16 `f32` matrix. Currently not accelerated.
1462  #[must_use]
1463  #[inline]
1464  pub fn transpose(data: [f32x16; 16]) -> [f32x16; 16] {
1465    // TODO: Add `_mm512_unpackhi_ps` to `safe_arch`, looks like it is missing,
1466    // then try adding an optimized `avx512f` implementation.
1467
1468    #[inline(always)]
1469    fn transpose_column(data: &[f32x16; 16], index: usize) -> f32x16 {
1470      f32x16::new([
1471        data[0].as_array()[index],
1472        data[1].as_array()[index],
1473        data[2].as_array()[index],
1474        data[3].as_array()[index],
1475        data[4].as_array()[index],
1476        data[5].as_array()[index],
1477        data[6].as_array()[index],
1478        data[7].as_array()[index],
1479        data[8].as_array()[index],
1480        data[9].as_array()[index],
1481        data[10].as_array()[index],
1482        data[11].as_array()[index],
1483        data[12].as_array()[index],
1484        data[13].as_array()[index],
1485        data[14].as_array()[index],
1486        data[15].as_array()[index],
1487      ])
1488    }
1489
1490    [
1491      transpose_column(&data, 0),
1492      transpose_column(&data, 1),
1493      transpose_column(&data, 2),
1494      transpose_column(&data, 3),
1495      transpose_column(&data, 4),
1496      transpose_column(&data, 5),
1497      transpose_column(&data, 6),
1498      transpose_column(&data, 7),
1499      transpose_column(&data, 8),
1500      transpose_column(&data, 9),
1501      transpose_column(&data, 10),
1502      transpose_column(&data, 11),
1503      transpose_column(&data, 12),
1504      transpose_column(&data, 13),
1505      transpose_column(&data, 14),
1506      transpose_column(&data, 15),
1507    ]
1508  }
1509
1510  #[inline]
1511  #[must_use]
1512  pub fn to_array(self) -> [f32; 16] {
1513    cast(self)
1514  }
1515
1516  #[inline]
1517  #[must_use]
1518  pub fn as_array(&self) -> &[f32; 16] {
1519    cast_ref(self)
1520  }
1521
1522  #[inline]
1523  #[must_use]
1524  pub fn as_mut_array(&mut self) -> &mut [f32; 16] {
1525    cast_mut(self)
1526  }
1527
1528  #[inline]
1529  pub fn from_i32x16(v: i32x16) -> Self {
1530    pick! {
1531      if #[cfg(target_feature="avx512f")] {
1532        Self { avx512: convert_to_m512_from_i32_m512i(v.avx512) }
1533      } else {
1534        Self {
1535          a: f32x8::from_i32x8(v.a),
1536          b: f32x8::from_i32x8(v.b),
1537        }
1538      }
1539    }
1540  }
1541}
1542
1543impl Not for f32x16 {
1544  type Output = Self;
1545  #[inline]
1546  fn not(self) -> Self::Output {
1547    pick! {
1548      if #[cfg(target_feature="avx512f")] {
1549        Self { avx512: bitxor_m512(self.avx512, set_splat_m512(f32::from_bits(u32::MAX))) }
1550      } else {
1551        Self {
1552          a : self.a.not(),
1553          b : self.b.not(),
1554        }
1555      }
1556    }
1557  }
1558}