wide/
f32x4_.rs

1use super::*;
2
3pick! {
4  if #[cfg(target_feature="sse")] {
5    #[derive(Default, Clone, Copy, PartialEq)]
6    #[repr(C, align(16))]
7    pub struct f32x4 { pub(crate) sse: m128 }
8  } else if #[cfg(target_feature="simd128")] {
9    use core::arch::wasm32::*;
10
11    #[derive(Clone, Copy)]
12    #[repr(transparent)]
13    pub struct f32x4 { pub(crate) simd: v128 }
14
15    impl Default for f32x4 {
16      fn default() -> Self {
17        Self::splat(0.0)
18      }
19    }
20
21    impl PartialEq for f32x4 {
22      fn eq(&self, other: &Self) -> bool {
23        u32x4_all_true(f32x4_eq(self.simd, other.simd))
24      }
25    }
26  } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
27    use core::arch::aarch64::*;
28    #[repr(C)]
29    #[derive(Copy, Clone)]
30    pub struct f32x4 { pub(crate) neon : float32x4_t }
31
32    impl Default for f32x4 {
33      #[inline]
34      fn default() -> Self {
35        unsafe { Self { neon: vdupq_n_f32(0.0)} }
36      }
37    }
38
39    impl PartialEq for f32x4 {
40      #[inline]
41      fn eq(&self, other: &Self) -> bool {
42        unsafe { vminvq_u32(vceqq_f32(self.neon, other.neon))==u32::MAX }
43      }
44
45    }
46    } else {
47    #[derive(Default, Clone, Copy, PartialEq)]
48    #[repr(C, align(16))]
49    pub struct f32x4 { pub(crate) arr: [f32;4] }
50  }
51}
52
// Declares a public associated constant `$name` whose four lanes all hold
// `$value`. Intended for use inside `impl f32x4 { ... }`.
macro_rules! const_f32_as_f32x4 {
  ($name:ident, $value:expr) => {
    #[allow(non_upper_case_globals)]
    pub const $name: f32x4 = f32x4::new([$value; 4]);
  };
}
59
60impl f32x4 {
61  const_f32_as_f32x4!(ONE, 1.0);
62  const_f32_as_f32x4!(ZERO, 0.0);
63  const_f32_as_f32x4!(HALF, 0.5);
64  const_f32_as_f32x4!(E, core::f32::consts::E);
65  const_f32_as_f32x4!(FRAC_1_PI, core::f32::consts::FRAC_1_PI);
66  const_f32_as_f32x4!(FRAC_2_PI, core::f32::consts::FRAC_2_PI);
67  const_f32_as_f32x4!(FRAC_2_SQRT_PI, core::f32::consts::FRAC_2_SQRT_PI);
68  const_f32_as_f32x4!(FRAC_1_SQRT_2, core::f32::consts::FRAC_1_SQRT_2);
69  const_f32_as_f32x4!(FRAC_PI_2, core::f32::consts::FRAC_PI_2);
70  const_f32_as_f32x4!(FRAC_PI_3, core::f32::consts::FRAC_PI_3);
71  const_f32_as_f32x4!(FRAC_PI_4, core::f32::consts::FRAC_PI_4);
72  const_f32_as_f32x4!(FRAC_PI_6, core::f32::consts::FRAC_PI_6);
73  const_f32_as_f32x4!(FRAC_PI_8, core::f32::consts::FRAC_PI_8);
74  const_f32_as_f32x4!(LN_2, core::f32::consts::LN_2);
75  const_f32_as_f32x4!(LN_10, core::f32::consts::LN_10);
76  const_f32_as_f32x4!(LOG2_E, core::f32::consts::LOG2_E);
77  const_f32_as_f32x4!(LOG10_E, core::f32::consts::LOG10_E);
78  const_f32_as_f32x4!(LOG10_2, core::f32::consts::LOG10_2);
79  const_f32_as_f32x4!(LOG2_10, core::f32::consts::LOG2_10);
80  const_f32_as_f32x4!(PI, core::f32::consts::PI);
81  const_f32_as_f32x4!(SQRT_2, core::f32::consts::SQRT_2);
82  const_f32_as_f32x4!(TAU, core::f32::consts::TAU);
83}
84
// NOTE(review): `Zeroable` and `Pod` arrive via `super::*` (presumably the
// bytemuck traits — confirm). Both impls assume that every backend
// representation of `f32x4` (`m128`, `v128`, `float32x4_t`, `[f32; 4]`) has no
// padding and accepts any bit pattern, including all zeros.
unsafe impl Zeroable for f32x4 {}
unsafe impl Pod for f32x4 {}
87
// Declares `f32` as the element type of this vector for the `AlignTo` trait
// (trait defined elsewhere in the crate; its exact contract is not visible
// here).
impl AlignTo for f32x4 {
  type Elem = f32;
}
91
92impl Add for f32x4 {
93  type Output = Self;
94  #[inline]
95  fn add(self, rhs: Self) -> Self::Output {
96    pick! {
97      if #[cfg(target_feature="sse")] {
98        Self { sse: add_m128(self.sse, rhs.sse) }
99      } else if #[cfg(target_feature="simd128")] {
100        Self { simd: f32x4_add(self.simd, rhs.simd) }
101      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
102        unsafe { Self { neon: vaddq_f32(self.neon, rhs.neon) } }
103      } else {
104        Self { arr: [
105          self.arr[0] + rhs.arr[0],
106          self.arr[1] + rhs.arr[1],
107          self.arr[2] + rhs.arr[2],
108          self.arr[3] + rhs.arr[3],
109        ]}
110      }
111    }
112  }
113}
114
115impl Sub for f32x4 {
116  type Output = Self;
117  #[inline]
118  fn sub(self, rhs: Self) -> Self::Output {
119    pick! {
120      if #[cfg(target_feature="sse")] {
121        Self { sse: sub_m128(self.sse, rhs.sse) }
122      } else if #[cfg(target_feature="simd128")] {
123        Self { simd: f32x4_sub(self.simd, rhs.simd) }
124      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
125        unsafe {Self { neon: vsubq_f32(self.neon, rhs.neon) }}
126      } else {
127        Self { arr: [
128          self.arr[0] - rhs.arr[0],
129          self.arr[1] - rhs.arr[1],
130          self.arr[2] - rhs.arr[2],
131          self.arr[3] - rhs.arr[3],
132        ]}
133      }
134    }
135  }
136}
137
138impl Mul for f32x4 {
139  type Output = Self;
140  #[inline]
141  fn mul(self, rhs: Self) -> Self::Output {
142    pick! {
143      if #[cfg(target_feature="sse")] {
144        Self { sse: mul_m128(self.sse, rhs.sse) }
145      } else if #[cfg(target_feature="simd128")] {
146        Self { simd: f32x4_mul(self.simd, rhs.simd) }
147      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
148        unsafe {Self { neon: vmulq_f32(self.neon, rhs.neon) }}
149      } else {
150        Self { arr: [
151          self.arr[0] * rhs.arr[0],
152          self.arr[1] * rhs.arr[1],
153          self.arr[2] * rhs.arr[2],
154          self.arr[3] * rhs.arr[3],
155        ]}
156      }
157    }
158  }
159}
160
161impl Div for f32x4 {
162  type Output = Self;
163  #[inline]
164  fn div(self, rhs: Self) -> Self::Output {
165    pick! {
166      if #[cfg(target_feature="sse")] {
167        Self { sse: div_m128(self.sse, rhs.sse) }
168      } else if #[cfg(target_feature="simd128")] {
169        Self { simd: f32x4_div(self.simd, rhs.simd) }
170      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
171        unsafe {Self { neon: vdivq_f32(self.neon, rhs.neon) }}
172      } else {
173        Self { arr: [
174          self.arr[0] / rhs.arr[0],
175          self.arr[1] / rhs.arr[1],
176          self.arr[2] / rhs.arr[2],
177          self.arr[3] / rhs.arr[3],
178        ]}
179      }
180    }
181  }
182}
183
184impl Neg for f32x4 {
185  type Output = Self;
186  #[inline]
187  fn neg(self) -> Self::Output {
188    pick! {
189      if #[cfg(target_feature="sse")] {
190        Self { sse: bitxor_m128(self.sse, Self::splat(-0.0).sse) }
191      } else if #[cfg(target_feature="simd128")] {
192        Self { simd: f32x4_neg(self.simd) }
193      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
194        unsafe {Self { neon: vnegq_f32(self.neon) }}
195      } else {
196        Self { arr: [
197          -self.arr[0],
198          -self.arr[1],
199          -self.arr[2],
200          -self.arr[3],
201        ]}
202      }
203    }
204  }
205}
206
207impl Add<f32> for f32x4 {
208  type Output = Self;
209  #[inline]
210  fn add(self, rhs: f32) -> Self::Output {
211    self.add(Self::splat(rhs))
212  }
213}
214
215impl Sub<f32> for f32x4 {
216  type Output = Self;
217  #[inline]
218  fn sub(self, rhs: f32) -> Self::Output {
219    self.sub(Self::splat(rhs))
220  }
221}
222
223impl Mul<f32> for f32x4 {
224  type Output = Self;
225  #[inline]
226  fn mul(self, rhs: f32) -> Self::Output {
227    self.mul(Self::splat(rhs))
228  }
229}
230
231impl Div<f32> for f32x4 {
232  type Output = Self;
233  #[inline]
234  fn div(self, rhs: f32) -> Self::Output {
235    self.div(Self::splat(rhs))
236  }
237}
238
239impl Add<f32x4> for f32 {
240  type Output = f32x4;
241  #[inline]
242  fn add(self, rhs: f32x4) -> Self::Output {
243    f32x4::splat(self).add(rhs)
244  }
245}
246
247impl Sub<f32x4> for f32 {
248  type Output = f32x4;
249  #[inline]
250  fn sub(self, rhs: f32x4) -> Self::Output {
251    f32x4::splat(self).sub(rhs)
252  }
253}
254
255impl Mul<f32x4> for f32 {
256  type Output = f32x4;
257  #[inline]
258  fn mul(self, rhs: f32x4) -> Self::Output {
259    f32x4::splat(self).mul(rhs)
260  }
261}
262
263impl Div<f32x4> for f32 {
264  type Output = f32x4;
265  #[inline]
266  fn div(self, rhs: f32x4) -> Self::Output {
267    f32x4::splat(self).div(rhs)
268  }
269}
270
271impl BitAnd for f32x4 {
272  type Output = Self;
273  #[inline]
274  fn bitand(self, rhs: Self) -> Self::Output {
275    pick! {
276      if #[cfg(target_feature="sse")] {
277        Self { sse: bitand_m128(self.sse, rhs.sse) }
278      } else if #[cfg(target_feature="simd128")] {
279        Self { simd: v128_and(self.simd, rhs.simd) }
280      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
281        unsafe {Self { neon: vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(self.neon), vreinterpretq_u32_f32(rhs.neon))) }}
282      } else {
283        Self { arr: [
284          f32::from_bits(self.arr[0].to_bits() & rhs.arr[0].to_bits()),
285          f32::from_bits(self.arr[1].to_bits() & rhs.arr[1].to_bits()),
286          f32::from_bits(self.arr[2].to_bits() & rhs.arr[2].to_bits()),
287          f32::from_bits(self.arr[3].to_bits() & rhs.arr[3].to_bits()),
288        ]}
289      }
290    }
291  }
292}
293
294impl BitOr for f32x4 {
295  type Output = Self;
296  #[inline]
297  fn bitor(self, rhs: Self) -> Self::Output {
298    pick! {
299      if #[cfg(target_feature="sse")] {
300        Self { sse: bitor_m128(self.sse, rhs.sse) }
301      } else if #[cfg(target_feature="simd128")] {
302        Self { simd: v128_or(self.simd, rhs.simd) }
303      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
304        unsafe {Self { neon: vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(self.neon), vreinterpretq_u32_f32(rhs.neon))) }}
305      } else {
306        Self { arr: [
307          f32::from_bits(self.arr[0].to_bits() | rhs.arr[0].to_bits()),
308          f32::from_bits(self.arr[1].to_bits() | rhs.arr[1].to_bits()),
309          f32::from_bits(self.arr[2].to_bits() | rhs.arr[2].to_bits()),
310          f32::from_bits(self.arr[3].to_bits() | rhs.arr[3].to_bits()),
311        ]}
312      }
313    }
314  }
315}
316
317impl BitXor for f32x4 {
318  type Output = Self;
319  #[inline]
320  fn bitxor(self, rhs: Self) -> Self::Output {
321    pick! {
322      if #[cfg(target_feature="sse")] {
323        Self { sse: bitxor_m128(self.sse, rhs.sse) }
324      } else if #[cfg(target_feature="simd128")] {
325        Self { simd: v128_xor(self.simd, rhs.simd) }
326      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
327        unsafe {Self { neon: vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(self.neon), vreinterpretq_u32_f32(rhs.neon))) }}
328      } else {
329        Self { arr: [
330          f32::from_bits(self.arr[0].to_bits() ^ rhs.arr[0].to_bits()),
331          f32::from_bits(self.arr[1].to_bits() ^ rhs.arr[1].to_bits()),
332          f32::from_bits(self.arr[2].to_bits() ^ rhs.arr[2].to_bits()),
333          f32::from_bits(self.arr[3].to_bits() ^ rhs.arr[3].to_bits()),
334        ]}
335      }
336    }
337  }
338}
339
340impl CmpEq for f32x4 {
341  type Output = Self;
342  #[inline]
343  fn simd_eq(self, rhs: Self) -> Self::Output {
344    pick! {
345      if #[cfg(target_feature="sse")] {
346        Self { sse: cmp_eq_mask_m128(self.sse, rhs.sse) }
347      } else if #[cfg(target_feature="simd128")] {
348        Self { simd: f32x4_eq(self.simd, rhs.simd) }
349      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
350        unsafe {Self { neon: vreinterpretq_f32_u32(vceqq_f32(self.neon, rhs.neon)) }}
351      } else {
352        Self { arr: [
353          if self.arr[0] == rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
354          if self.arr[1] == rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
355          if self.arr[2] == rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
356          if self.arr[3] == rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
357        ]}
358      }
359    }
360  }
361}
362
363impl CmpGe for f32x4 {
364  type Output = Self;
365  #[inline]
366  fn simd_ge(self, rhs: Self) -> Self::Output {
367    pick! {
368      if #[cfg(target_feature="sse")] {
369        Self { sse: cmp_ge_mask_m128(self.sse, rhs.sse) }
370      } else if #[cfg(target_feature="simd128")] {
371        Self { simd: f32x4_ge(self.simd, rhs.simd) }
372      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
373        unsafe {Self { neon: vreinterpretq_f32_u32(vcgeq_f32(self.neon, rhs.neon)) }}
374      } else {
375        Self { arr: [
376          if self.arr[0] >= rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
377          if self.arr[1] >= rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
378          if self.arr[2] >= rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
379          if self.arr[3] >= rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
380        ]}
381      }
382    }
383  }
384}
385
386impl CmpGt for f32x4 {
387  type Output = Self;
388  #[inline]
389  fn simd_gt(self, rhs: Self) -> Self::Output {
390    pick! {
391      if #[cfg(target_feature="sse")] {
392        Self { sse: cmp_gt_mask_m128(self.sse, rhs.sse) }
393      } else if #[cfg(target_feature="simd128")] {
394        Self { simd: f32x4_gt(self.simd, rhs.simd) }
395      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
396        unsafe {Self { neon: vreinterpretq_f32_u32(vcgtq_f32(self.neon, rhs.neon)) }}
397      } else {
398        Self { arr: [
399          if self.arr[0] > rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
400          if self.arr[1] > rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
401          if self.arr[2] > rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
402          if self.arr[3] > rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
403        ]}
404      }
405    }
406  }
407}
408
409impl CmpNe for f32x4 {
410  type Output = Self;
411  #[inline]
412  fn simd_ne(self, rhs: Self) -> Self::Output {
413    pick! {
414      if #[cfg(target_feature="sse")] {
415        Self { sse: cmp_neq_mask_m128(self.sse, rhs.sse) }
416      } else if #[cfg(target_feature="simd128")] {
417        Self { simd: f32x4_ne(self.simd, rhs.simd) }
418      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
419        unsafe {Self { neon: vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(self.neon, rhs.neon))) }}
420      } else {
421        Self { arr: [
422          if self.arr[0] != rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
423          if self.arr[1] != rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
424          if self.arr[2] != rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
425          if self.arr[3] != rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
426        ]}
427      }
428    }
429  }
430}
431
432impl CmpLe for f32x4 {
433  type Output = Self;
434  #[inline]
435  fn simd_le(self, rhs: Self) -> Self::Output {
436    pick! {
437      if #[cfg(target_feature="sse")] {
438        Self { sse: cmp_le_mask_m128(self.sse, rhs.sse) }
439      } else if #[cfg(target_feature="simd128")] {
440        Self { simd: f32x4_le(self.simd, rhs.simd) }
441      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
442        unsafe {Self { neon: vreinterpretq_f32_u32(vcleq_f32(self.neon, rhs.neon)) }}
443      } else {
444        Self { arr: [
445          if self.arr[0] <= rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
446          if self.arr[1] <= rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
447          if self.arr[2] <= rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
448          if self.arr[3] <= rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
449        ]}
450      }
451    }
452  }
453}
454
455impl CmpLt for f32x4 {
456  type Output = Self;
457  #[inline]
458  fn simd_lt(self, rhs: Self) -> Self::Output {
459    pick! {
460      if #[cfg(target_feature="sse")] {
461        Self { sse: cmp_lt_mask_m128(self.sse, rhs.sse) }
462      } else if #[cfg(target_feature="simd128")] {
463        Self { simd: f32x4_lt(self.simd, rhs.simd) }
464      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
465        unsafe {Self { neon: vreinterpretq_f32_u32(vcltq_f32(self.neon, rhs.neon)) }}
466      } else {
467        Self { arr: [
468          if self.arr[0] < rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
469          if self.arr[1] < rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
470          if self.arr[2] < rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
471          if self.arr[3] < rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
472        ]}
473      }
474    }
475  }
476}
477
478impl f32x4 {
479  #[inline]
480  #[must_use]
481  pub const fn new(array: [f32; 4]) -> Self {
482    #[allow(non_upper_case_globals)]
483    unsafe {
484      core::mem::transmute(array)
485    }
486  }
487
488  #[inline]
489  #[must_use]
490  pub fn blend(self, t: Self, f: Self) -> Self {
491    pick! {
492      if #[cfg(target_feature="sse4.1")] {
493        Self { sse: blend_varying_m128(f.sse, t.sse, self.sse) }
494      } else if #[cfg(target_feature="simd128")] {
495        Self { simd: v128_bitselect(t.simd, f.simd, self.simd) }
496      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
497        unsafe {Self { neon: vbslq_f32(vreinterpretq_u32_f32(self.neon), t.neon, f.neon) }}
498      } else {
499        generic_bit_blend(self, t, f)
500      }
501    }
502  }
503  #[inline]
504  #[must_use]
505  pub fn abs(self) -> Self {
506    pick! {
507      if #[cfg(target_feature="simd128")] {
508        Self { simd: f32x4_abs(self.simd) }
509      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
510        unsafe {Self { neon: vabsq_f32(self.neon) }}
511      } else {
512        let non_sign_bits = f32x4::from(f32::from_bits(i32::MAX as u32));
513        self & non_sign_bits
514      }
515    }
516  }
517  #[inline]
518  #[must_use]
519  pub fn floor(self) -> Self {
520    pick! {
521      if #[cfg(target_feature="simd128")] {
522        Self { simd: f32x4_floor(self.simd) }
523      } else if #[cfg(target_feature="sse4.1")] {
524        Self { sse: floor_m128(self.sse) }
525      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
526        unsafe {Self { neon: vrndmq_f32(self.neon) }}
527      } else if #[cfg(feature="std")] {
528        let base: [f32; 4] = cast(self);
529        cast(base.map(|val| val.floor()))
530      } else {
531        let base: [f32; 4] = cast(self);
532        let rounded: [f32; 4] = cast(self.round());
533        cast([
534          if base[0] < rounded[0] { rounded[0] - 1.0 } else { rounded[0] },
535          if base[1] < rounded[1] { rounded[1] - 1.0 } else { rounded[1] },
536          if base[2] < rounded[2] { rounded[2] - 1.0 } else { rounded[2] },
537          if base[3] < rounded[3] { rounded[3] - 1.0 } else { rounded[3] },
538        ])
539      }
540    }
541  }
542  #[inline]
543  #[must_use]
544  pub fn ceil(self) -> Self {
545    pick! {
546      if #[cfg(target_feature="simd128")] {
547        Self { simd: f32x4_ceil(self.simd) }
548      } else if #[cfg(target_feature="sse4.1")] {
549        Self { sse: ceil_m128(self.sse) }
550      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
551        unsafe {Self { neon: vrndpq_f32(self.neon) }}
552      } else if #[cfg(feature="std")] {
553        let base: [f32; 4] = cast(self);
554        cast(base.map(|val| val.ceil()))
555      } else {
556        let base: [f32; 4] = cast(self);
557        let rounded: [f32; 4] = cast(self.round());
558        cast([
559          if base[0] > rounded[0] { rounded[0] + 1.0 } else { rounded[0] },
560          if base[1] > rounded[1] { rounded[1] + 1.0 } else { rounded[1] },
561          if base[2] > rounded[2] { rounded[2] + 1.0 } else { rounded[2] },
562          if base[3] > rounded[3] { rounded[3] + 1.0 } else { rounded[3] },
563        ])
564      }
565    }
566  }
567
568  /// Calculates the lanewise maximum of both vectors. This is a faster
569  /// implementation than `max`, but it doesn't specify any behavior if NaNs are
570  /// involved.
571  #[inline]
572  #[must_use]
573  pub fn fast_max(self, rhs: Self) -> Self {
574    pick! {
575      if #[cfg(target_feature="sse")] {
576        Self { sse: max_m128(self.sse, rhs.sse) }
577      } else if #[cfg(target_feature="simd128")] {
578        Self {
579          simd: f32x4_pmax(self.simd, rhs.simd),
580        }
581      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
582        unsafe {Self { neon: vmaxq_f32(self.neon, rhs.neon) }}
583      } else {
584        Self { arr: [
585          if self.arr[0] < rhs.arr[0] { rhs.arr[0] } else { self.arr[0] },
586          if self.arr[1] < rhs.arr[1] { rhs.arr[1] } else { self.arr[1] },
587          if self.arr[2] < rhs.arr[2] { rhs.arr[2] } else { self.arr[2] },
588          if self.arr[3] < rhs.arr[3] { rhs.arr[3] } else { self.arr[3] },
589        ]}
590      }
591    }
592  }
593
594  /// Calculates the lanewise maximum of both vectors. If either lane is NaN,
595  /// the other lane gets chosen. Use `fast_max` for a faster implementation
596  /// that doesn't handle NaNs.
597  #[inline]
598  #[must_use]
599  pub fn max(self, rhs: Self) -> Self {
600    pick! {
601      if #[cfg(target_feature="sse")] {
602        // max_m128 seems to do rhs < self ? self : rhs. So if there's any NaN
603        // involved, it chooses rhs, so we need to specifically check rhs for
604        // NaN.
605        rhs.is_nan().blend(self, Self { sse: max_m128(self.sse, rhs.sse) })
606      } else if #[cfg(target_feature="simd128")] {
607        // WASM has two max intrinsics:
608        // - max: This propagates NaN, that's the opposite of what we need.
609        // - pmax: This is defined as self < rhs ? rhs : self, which basically
610        //   chooses self if either is NaN.
611        //
612        // pmax is what we want, but we need to specifically check self for NaN.
613        Self {
614          simd: v128_bitselect(
615            rhs.simd,
616            f32x4_pmax(self.simd, rhs.simd),
617            f32x4_ne(self.simd, self.simd), // NaN check
618          )
619        }
620      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
621        unsafe {Self { neon: vmaxnmq_f32(self.neon, rhs.neon) }}
622      } else {
623        Self { arr: [
624          self.arr[0].max(rhs.arr[0]),
625          self.arr[1].max(rhs.arr[1]),
626          self.arr[2].max(rhs.arr[2]),
627          self.arr[3].max(rhs.arr[3]),
628        ]}
629      }
630    }
631  }
632
633  /// Calculates the lanewise minimum of both vectors. This is a faster
634  /// implementation than `min`, but it doesn't specify any behavior if NaNs are
635  /// involved.
636  #[inline]
637  #[must_use]
638  pub fn fast_min(self, rhs: Self) -> Self {
639    pick! {
640      if #[cfg(target_feature="sse")] {
641        Self { sse: min_m128(self.sse, rhs.sse) }
642      } else if #[cfg(target_feature="simd128")] {
643        Self {
644          simd: f32x4_pmin(self.simd, rhs.simd),
645        }
646      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
647        unsafe {Self { neon: vminq_f32(self.neon, rhs.neon) }}
648      } else {
649        Self { arr: [
650          if self.arr[0] < rhs.arr[0] { self.arr[0] } else { rhs.arr[0] },
651          if self.arr[1] < rhs.arr[1] { self.arr[1] } else { rhs.arr[1] },
652          if self.arr[2] < rhs.arr[2] { self.arr[2] } else { rhs.arr[2] },
653          if self.arr[3] < rhs.arr[3] { self.arr[3] } else { rhs.arr[3] },
654        ]}
655      }
656    }
657  }
658
659  /// Calculates the lanewise minimum of both vectors. If either lane is NaN,
660  /// the other lane gets chosen. Use `fast_min` for a faster implementation
661  /// that doesn't handle NaNs.
662  #[inline]
663  #[must_use]
664  pub fn min(self, rhs: Self) -> Self {
665    pick! {
666      if #[cfg(target_feature="sse")] {
667        // min_m128 seems to do self < rhs ? self : rhs. So if there's any NaN
668        // involved, it chooses rhs, so we need to specifically check rhs for
669        // NaN.
670        rhs.is_nan().blend(self, Self { sse: min_m128(self.sse, rhs.sse) })
671      } else if #[cfg(target_feature="simd128")] {
672        // WASM has two min intrinsics:
673        // - min: This propagates NaN, that's the opposite of what we need.
674        // - pmin: This is defined as rhs < self ? rhs : self, which basically
675        //   chooses self if either is NaN.
676        //
677        // pmin is what we want, but we need to specifically check self for NaN.
678        Self {
679          simd: v128_bitselect(
680            rhs.simd,
681            f32x4_pmin(self.simd, rhs.simd),
682            f32x4_ne(self.simd, self.simd), // NaN check
683          )
684        }
685      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
686        unsafe {Self { neon: vminnmq_f32(self.neon, rhs.neon) }}
687      } else {
688        Self { arr: [
689          self.arr[0].min(rhs.arr[0]),
690          self.arr[1].min(rhs.arr[1]),
691          self.arr[2].min(rhs.arr[2]),
692          self.arr[3].min(rhs.arr[3]),
693        ]}
694      }
695    }
696  }
697  #[inline]
698  #[must_use]
699  pub fn is_nan(self) -> Self {
700    pick! {
701      if #[cfg(target_feature="sse")] {
702        Self { sse: cmp_unord_mask_m128(self.sse, self.sse) }
703      } else if #[cfg(target_feature="simd128")] {
704        Self { simd: f32x4_ne(self.simd, self.simd) }
705      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
706        unsafe {Self { neon: vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(self.neon, self.neon))) }}
707      } else {
708        Self { arr: [
709          if self.arr[0].is_nan() { f32::from_bits(u32::MAX) } else { 0.0 },
710          if self.arr[1].is_nan() { f32::from_bits(u32::MAX) } else { 0.0 },
711          if self.arr[2].is_nan() { f32::from_bits(u32::MAX) } else { 0.0 },
712          if self.arr[3].is_nan() { f32::from_bits(u32::MAX) } else { 0.0 },
713        ]}
714      }
715    }
716  }
717  #[inline]
718  #[must_use]
719  pub fn is_finite(self) -> Self {
720    let shifted_exp_mask = u32x4::from(0xFF000000);
721    let u: u32x4 = cast(self);
722    let shift_u = u << 1_u64;
723    let out = !(shift_u & shifted_exp_mask).simd_eq(shifted_exp_mask);
724    cast(out)
725  }
726  #[inline]
727  #[must_use]
728  pub fn is_inf(self) -> Self {
729    let shifted_inf = u32x4::from(0xFF000000);
730    let u: u32x4 = cast(self);
731    let shift_u = u << 1_u64;
732    let out = (shift_u).simd_eq(shifted_inf);
733    cast(out)
734  }
735
  /// Rounds each lane to an integral value, using the best option available
  /// for the enabled target features (hardware rounding where possible, a
  /// software fallback otherwise).
  #[inline]
  #[must_use]
  pub fn round(self) -> Self {
    pick! {
      if #[cfg(target_feature="sse4.1")] {
        Self { sse: round_m128::<{round_op!(Nearest)}>(self.sse) }
      } else if #[cfg(target_feature="sse2")] {
        // Round-trip through i32. Lanes whose integer conversion yields
        // 0x80000000 keep their original value instead of the converted one.
        // NOTE(review): 0x80000000 is presumably the conversion's
        // "out of range / NaN" marker — confirm.
        let mi: m128i = convert_to_i32_m128i_from_m128(self.sse);
        let f: f32x4 = f32x4 { sse: convert_to_m128_from_i32_m128i(mi) };
        let i: i32x4 = cast(mi);
        let mask: f32x4 = cast(i.simd_eq(i32x4::from(0x80000000_u32 as i32)));
        mask.blend(self, f)
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: f32x4_nearest(self.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vrndnq_f32(self.neon) }}
      } else {
        // Note(Lokathor): This software fallback is probably very slow compared
        // to having a hardware option available, even just the sse2 version is
        // better than this. Oh well.
        let to_int = f32x4::from(1.0 / f32::EPSILON);
        let u: u32x4 = cast(self);
        // Biased exponent of every lane.
        let e: i32x4 = cast((u >> 23) & u32x4::from(0xff));
        let mut y: f32x4;

        // Exponent >= 0x7f + 23: the lane is returned unchanged.
        let no_op_magic = i32x4::from(0x7f + 23);
        let no_op_mask: f32x4 = cast(e.simd_gt(no_op_magic) | e.simd_eq(no_op_magic));
        let no_op_val: f32x4 = self;

        // Exponent < 0x7f - 1: the lane becomes `self * 0.0`.
        let zero_magic = i32x4::from(0x7f - 1);
        let zero_mask: f32x4 = cast(e.simd_lt(zero_magic));
        let zero_val: f32x4 = self * f32x4::from(0.0);

        // Work on the magnitude; the sign is restored at the end.
        let neg_bit: f32x4 = cast(cast::<u32x4, i32x4>(u).simd_lt(i32x4::default()));
        let x: f32x4 = neg_bit.blend(-self, self);
        // Adding and subtracting `to_int` in this exact order is what performs
        // the rounding; do not reorder these operations.
        y = x + to_int - to_int - x;
        // NOTE(review): under round-to-nearest `y` looks bounded to
        // [-0.5, 0.5], which would make the two correction branches below
        // unreachable — confirm before changing them.
        y = y.simd_gt(f32x4::from(0.5)).blend(
          y + x - f32x4::from(-1.0),
          y.simd_lt(f32x4::from(-0.5)).blend(y + x + f32x4::from(1.0), y + x),
        );
        y = neg_bit.blend(-y, y);

        no_op_mask.blend(no_op_val, zero_mask.blend(zero_val, y))
      }
    }
  }
782
783  /// Rounds each lane into an integer. This is a faster implementation than
784  /// `round_int`, but it doesn't handle out of range values or NaNs. For those
785  /// values you get implementation defined behavior.
786  #[inline]
787  #[must_use]
788  pub fn fast_round_int(self) -> i32x4 {
789    pick! {
790      if #[cfg(target_feature="sse2")] {
791        cast(convert_to_i32_m128i_from_m128(self.sse))
792      } else {
793        self.round_int()
794      }
795    }
796  }
797
798  /// Rounds each lane into an integer. This saturates out of range values and
799  /// turns NaNs into 0. Use `fast_round_int` for a faster implementation that
800  /// doesn't handle out of range values or NaNs.
801  #[inline]
802  #[must_use]
803  pub fn round_int(self) -> i32x4 {
804    pick! {
805      if #[cfg(target_feature="sse2")] {
806        // Based on: https://github.com/v8/v8/blob/210987a552a2bf2a854b0baa9588a5959ff3979d/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h#L489-L504
807        let non_nan_mask = self.simd_eq(self);
808        let non_nan = self & non_nan_mask;
809        let flip_to_max: i32x4 = cast(self.simd_ge(Self::splat(2147483648.0)));
810        let cast: i32x4 = cast(convert_to_i32_m128i_from_m128(non_nan.sse));
811        flip_to_max ^ cast
812      } else if #[cfg(target_feature="simd128")] {
813        cast(Self { simd: i32x4_trunc_sat_f32x4(f32x4_nearest(self.simd)) })
814      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
815        cast(unsafe {Self { neon: vreinterpretq_f32_s32(vcvtnq_s32_f32(self.neon)) }})
816      } else {
817        let rounded: [f32; 4] = cast(self.round());
818        cast([
819          rounded[0] as i32,
820          rounded[1] as i32,
821          rounded[2] as i32,
822          rounded[3] as i32,
823        ])
824      }
825    }
826  }
827
828  /// Truncates each lane into an integer. This is a faster implementation than
829  /// `trunc_int`, but it doesn't handle out of range values or NaNs. For those
830  /// values you get implementation defined behavior.
831  #[inline]
832  #[must_use]
833  pub fn fast_trunc_int(self) -> i32x4 {
834    pick! {
835      if #[cfg(target_feature="sse2")] {
836        cast(truncate_m128_to_m128i(self.sse))
837      } else {
838        self.trunc_int()
839      }
840    }
841  }
842
843  /// Truncates each lane into an integer. This saturates out of range values
844  /// and turns NaNs into 0. Use `fast_trunc_int` for a faster implementation
845  /// that doesn't handle out of range values or NaNs.
846  #[inline]
847  #[must_use]
848  pub fn trunc_int(self) -> i32x4 {
849    pick! {
850      if #[cfg(target_feature="sse2")] {
851        // Based on: https://github.com/v8/v8/blob/210987a552a2bf2a854b0baa9588a5959ff3979d/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h#L489-L504
852        let non_nan_mask = self.simd_eq(self);
853        let non_nan = self & non_nan_mask;
854        let flip_to_max: i32x4 = cast(self.simd_ge(Self::splat(2147483648.0)));
855        let cast: i32x4 = cast(truncate_m128_to_m128i(non_nan.sse));
856        flip_to_max ^ cast
857      } else if #[cfg(target_feature="simd128")] {
858        cast(Self { simd: i32x4_trunc_sat_f32x4(self.simd) })
859      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
860        cast(unsafe {Self { neon: vreinterpretq_f32_s32(vcvtq_s32_f32(self.neon)) }})
861      } else {
862        let n: [f32;4] = cast(self);
863        cast([
864          n[0] as i32,
865          n[1] as i32,
866          n[2] as i32,
867          n[3] as i32,
868        ])
869      }
870    }
871  }
872  /// Performs a multiply-add operation: `self * m + a`
873  ///
874  /// When hardware FMA support is available, this computes the result with a
875  /// single rounding operation. Without FMA support, it falls back to separate
876  /// multiply and add operations with two roundings.
877  ///
878  /// # Platform-specific behavior
879  /// - On `x86`/`x86_64` with FMA: Uses `vfmadd` (single rounding, best
880  ///   accuracy)
881  /// - On ARM64 with NEON: Uses `vfmaq_f32` (single rounding, best accuracy)
882  /// - Without FMA support: Uses `(self * m) + a` (two roundings)
883  ///
884  /// # Examples
885  /// ```
886  /// # use wide::f32x4;
887  /// let a = f32x4::from([1.0, 2.0, 3.0, 4.0]);
888  /// let b = f32x4::from([5.0, 6.0, 7.0, 8.0]);
889  /// let c = f32x4::from([9.0, 10.0, 11.0, 12.0]);
890  ///
891  /// let result = a.mul_add(b, c);
892  ///
893  /// let expected = f32x4::from([14.0, 22.0, 32.0, 44.0]);
894  /// assert_eq!(result, expected);
895  /// ```
896  #[inline]
897  #[must_use]
898  pub fn mul_add(self, m: Self, a: Self) -> Self {
899    pick! {
900      if #[cfg(all(target_feature="sse2",target_feature="fma"))] {
901        Self { sse: fused_mul_add_m128(self.sse, m.sse, a.sse) }
902      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
903        unsafe { Self { neon: vfmaq_f32(a.neon, self.neon, m.neon) } }
904      } else {
905        (self * m) + a
906      }
907    }
908  }
909
910  /// Performs a multiply-subtract operation: `self * m - s`
911  ///
912  /// When hardware FMA support is available, this computes the result with a
913  /// single rounding operation. Without FMA support, it falls back to separate
914  /// multiply and subtract operations with two roundings.
915  ///
916  /// # Platform-specific behavior
917  /// - On `x86`/`x86_64` with FMA: Uses `vfmsub` (single rounding, best
918  ///   accuracy)
919  /// - On ARM64 with NEON: Uses `vfmaq_f32(-s, self, m)` (single rounding, best
920  ///   accuracy)
921  /// - Without FMA support: Uses `(self * m) - s` (two roundings)
922  ///
923  /// # Examples
924  /// ```
925  /// # use wide::f32x4;
926  /// let a = f32x4::from([10.0, 20.0, 30.0, 40.0]);
927  /// let b = f32x4::from([2.0, 3.0, 4.0, 5.0]);
928  /// let c = f32x4::from([5.0, 10.0, 15.0, 20.0]);
929  ///
930  /// let result = a.mul_sub(b, c);
931  ///
932  /// let expected = f32x4::from([15.0, 50.0, 105.0, 180.0]);
933  /// assert_eq!(result, expected);
934  /// ```
935  #[inline]
936  #[must_use]
937  pub fn mul_sub(self, m: Self, s: Self) -> Self {
938    pick! {
939      if #[cfg(all(target_feature="sse2",target_feature="fma"))] {
940        Self { sse: fused_mul_sub_m128(self.sse, m.sse, s.sse) }
941      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
942        unsafe { Self { neon: vfmaq_f32(vnegq_f32(s.neon), self.neon, m.neon) } }
943      } else {
944        (self * m) - s
945      }
946    }
947  }
948
949  /// Performs a negative multiply-add operation: `a - (self * m)`
950  ///
951  /// When hardware FMA support is available, this computes the result with a
952  /// single rounding operation. Without FMA support, it falls back to separate
953  /// operations with two roundings.
954  ///
955  /// # Platform-specific behavior
956  /// - On `x86`/`x86_64` with FMA: Uses `vfnmadd` (single rounding, best
957  ///   accuracy)
958  /// - On ARM64 with NEON: Uses `vfmsq_f32` (single rounding, best accuracy)
959  /// - Without FMA support: Uses `a - (self * m)` (two roundings)
960  ///
961  /// # Examples
962  /// ```
963  /// # use wide::f32x4;
964  /// let a = f32x4::from([3.0, 4.0, 5.0, 6.0]);
965  /// let b = f32x4::from([2.0, 2.0, 2.0, 2.0]);
966  /// let c = f32x4::from([10.0, 20.0, 30.0, 40.0]);
967  ///
968  /// let result = a.mul_neg_add(b, c);
969  ///
970  /// let expected = f32x4::from([4.0, 12.0, 20.0, 28.0]);
971  /// assert_eq!(result, expected);
972  /// ```
973  #[inline]
974  #[must_use]
975  pub fn mul_neg_add(self, m: Self, a: Self) -> Self {
976    pick! {
977      if #[cfg(all(target_feature="sse2",target_feature="fma"))] {
978        Self { sse: fused_mul_neg_add_m128(self.sse, m.sse, a.sse) }
979      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
980        unsafe { Self { neon: vfmsq_f32(a.neon, self.neon, m.neon) } }
981      } else {
982        a - (self * m)
983      }
984    }
985  }
986
987  /// Performs a negative multiply-subtract operation: `-(self * m) - s`
988  ///
989  /// When hardware FMA support is available, this computes the result with a
990  /// single rounding operation. Without FMA support, it falls back to separate
991  /// operations with two roundings.
992  ///
993  /// # Platform-specific behavior
994  /// - On `x86`/`x86_64` with FMA: Uses `vfnmsub` (single rounding, best
995  ///   accuracy)
996  /// - On ARM64 with NEON: Uses `-(vfmaq_f32(s, self, m))` (single rounding,
997  ///   best accuracy)
998  /// - Without FMA support: Uses `-(self * m) - s` (two roundings)
999  ///
1000  /// # Examples
1001  /// ```
1002  /// # use wide::f32x4;
1003  /// let a = f32x4::from([3.0, 4.0, 5.0, 6.0]);
1004  /// let b = f32x4::from([2.0, 2.0, 2.0, 2.0]);
1005  /// let c = f32x4::from([1.0, 2.0, 3.0, 4.0]);
1006  ///
1007  /// let result = a.mul_neg_sub(b, c);
1008  ///
1009  /// let expected = f32x4::from([-7.0, -10.0, -13.0, -16.0]);
1010  /// assert_eq!(result, expected);
1011  /// ```
1012  #[inline]
1013  #[must_use]
1014  pub fn mul_neg_sub(self, m: Self, s: Self) -> Self {
1015    pick! {
1016      if #[cfg(all(target_feature="sse2",target_feature="fma"))] {
1017        Self { sse: fused_mul_neg_sub_m128(self.sse, m.sse, s.sse) }
1018      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
1019        unsafe { Self { neon: vnegq_f32(vfmaq_f32(s.neon, self.neon, m.neon)) } }
1020      } else {
1021        -(self * m) - s
1022      }
1023    }
1024  }
1025
1026  #[inline]
1027  #[must_use]
1028  pub fn flip_signs(self, signs: Self) -> Self {
1029    self ^ (signs & Self::from(-0.0))
1030  }
1031
1032  #[inline]
1033  #[must_use]
1034  pub fn copysign(self, sign: Self) -> Self {
1035    let magnitude_mask = Self::from(f32::from_bits(u32::MAX >> 1));
1036    (self & magnitude_mask) | (sign & Self::from(-0.0))
1037  }
1038
1039  #[inline]
1040  pub fn asin_acos(self) -> (Self, Self) {
1041    // Based on the Agner Fog "vector class library":
1042    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
1043    const_f32_as_f32x4!(P4asinf, 4.2163199048E-2);
1044    const_f32_as_f32x4!(P3asinf, 2.4181311049E-2);
1045    const_f32_as_f32x4!(P2asinf, 4.5470025998E-2);
1046    const_f32_as_f32x4!(P1asinf, 7.4953002686E-2);
1047    const_f32_as_f32x4!(P0asinf, 1.6666752422E-1);
1048
1049    let xa = self.abs();
1050    let big = xa.simd_ge(f32x4::splat(0.5));
1051
1052    let x1 = f32x4::splat(0.5) * (f32x4::ONE - xa);
1053    let x2 = xa * xa;
1054    let x3 = big.blend(x1, x2);
1055
1056    let xb = x1.sqrt();
1057
1058    let x4 = big.blend(xb, xa);
1059
1060    let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
1061    let z = z.mul_add(x3 * x4, x4);
1062
1063    let z1 = z + z;
1064
1065    // acos
1066    let z3 = self.simd_lt(f32x4::ZERO).blend(f32x4::PI - z1, z1);
1067    let z4 = f32x4::FRAC_PI_2 - z.flip_signs(self);
1068    let acos = big.blend(z3, z4);
1069
1070    // asin
1071    let z3 = f32x4::FRAC_PI_2 - z1;
1072    let asin = big.blend(z3, z);
1073    let asin = asin.flip_signs(self);
1074
1075    (asin, acos)
1076  }
1077
1078  #[inline]
1079  pub fn asin(self) -> Self {
1080    // Based on the Agner Fog "vector class library":
1081    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
1082    const_f32_as_f32x4!(P4asinf, 4.2163199048E-2);
1083    const_f32_as_f32x4!(P3asinf, 2.4181311049E-2);
1084    const_f32_as_f32x4!(P2asinf, 4.5470025998E-2);
1085    const_f32_as_f32x4!(P1asinf, 7.4953002686E-2);
1086    const_f32_as_f32x4!(P0asinf, 1.6666752422E-1);
1087
1088    let xa = self.abs();
1089    let big = xa.simd_ge(f32x4::splat(0.5));
1090
1091    let x1 = f32x4::splat(0.5) * (f32x4::ONE - xa);
1092    let x2 = xa * xa;
1093    let x3 = big.blend(x1, x2);
1094
1095    let xb = x1.sqrt();
1096
1097    let x4 = big.blend(xb, xa);
1098
1099    let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
1100    let z = z.mul_add(x3 * x4, x4);
1101
1102    let z1 = z + z;
1103
1104    // asin
1105    let z3 = f32x4::FRAC_PI_2 - z1;
1106    let asin = big.blend(z3, z);
1107    let asin = asin.flip_signs(self);
1108
1109    asin
1110  }
1111
1112  #[inline]
1113  #[must_use]
1114  pub fn acos(self) -> Self {
1115    // Based on the Agner Fog "vector class library":
1116    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
1117    const_f32_as_f32x4!(P4asinf, 4.2163199048E-2);
1118    const_f32_as_f32x4!(P3asinf, 2.4181311049E-2);
1119    const_f32_as_f32x4!(P2asinf, 4.5470025998E-2);
1120    const_f32_as_f32x4!(P1asinf, 7.4953002686E-2);
1121    const_f32_as_f32x4!(P0asinf, 1.6666752422E-1);
1122
1123    let xa = self.abs();
1124    let big = xa.simd_ge(f32x4::splat(0.5));
1125
1126    let x1 = f32x4::splat(0.5) * (f32x4::ONE - xa);
1127    let x2 = xa * xa;
1128    let x3 = big.blend(x1, x2);
1129
1130    let xb = x1.sqrt();
1131
1132    let x4 = big.blend(xb, xa);
1133
1134    let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
1135    let z = z.mul_add(x3 * x4, x4);
1136
1137    let z1 = z + z;
1138
1139    // acos
1140    let z3 = self.simd_lt(f32x4::ZERO).blend(f32x4::PI - z1, z1);
1141    let z4 = f32x4::FRAC_PI_2 - z.flip_signs(self);
1142    let acos = big.blend(z3, z4);
1143
1144    acos
1145  }
1146
1147  #[inline]
1148  pub fn atan(self) -> Self {
1149    // Based on the Agner Fog "vector class library":
1150    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
1151    const_f32_as_f32x4!(P3atanf, 8.05374449538E-2);
1152    const_f32_as_f32x4!(P2atanf, -1.38776856032E-1);
1153    const_f32_as_f32x4!(P1atanf, 1.99777106478E-1);
1154    const_f32_as_f32x4!(P0atanf, -3.33329491539E-1);
1155
1156    let t = self.abs();
1157
1158    // small:  z = t / 1.0;
1159    // medium: z = (t-1.0) / (t+1.0);
1160    // big:    z = -1.0 / t;
1161    let notsmal = t.simd_ge(Self::SQRT_2 - Self::ONE);
1162    let notbig = t.simd_le(Self::SQRT_2 + Self::ONE);
1163
1164    let mut s = notbig.blend(Self::FRAC_PI_4, Self::FRAC_PI_2);
1165    s = notsmal & s;
1166
1167    let mut a = notbig & t;
1168    a = notsmal.blend(a - Self::ONE, a);
1169    let mut b = notbig & Self::ONE;
1170    b = notsmal.blend(b + t, b);
1171    let z = a / b;
1172
1173    let zz = z * z;
1174
1175    // Taylor expansion
1176    let mut re = polynomial_3!(zz, P0atanf, P1atanf, P2atanf, P3atanf);
1177    re = re.mul_add(zz * z, z) + s;
1178
1179    // get sign bit
1180    re = (self.sign_bit()).blend(-re, re);
1181
1182    re
1183  }
1184
  /// Computes the four-quadrant arctangent lane-wise, treating `self` as the
  /// `y` coordinate and the argument as the `x` coordinate.
  ///
  /// Lanes where both inputs compare equal to zero start from an angle of
  /// `0.0`; the sign bits of `x` and `y` are still applied afterwards, so
  /// such lanes can yield `0`, `-0`, `PI` or `-PI`.
  #[inline]
  pub fn atan2(self, x: Self) -> Self {
    // Based on the Agner Fog "vector class library":
    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
    const_f32_as_f32x4!(P3atanf, 8.05374449538E-2);
    const_f32_as_f32x4!(P2atanf, -1.38776856032E-1);
    const_f32_as_f32x4!(P1atanf, 1.99777106478E-1);
    const_f32_as_f32x4!(P0atanf, -3.33329491539E-1);

    let y = self;

    // move in first octant
    let x1 = x.abs();
    let y1 = y.abs();
    let swapxy = y1.simd_gt(x1);
    // swap x and y if y1 > x1
    let mut x2 = swapxy.blend(y1, x1);
    let mut y2 = swapxy.blend(x1, y1);

    // check for special case: x and y are both +/- INF
    let both_infinite = x.is_inf() & y.is_inf();
    if both_infinite.any() {
      // AND-ing the bit pattern of +INF with that of -1.0 leaves the bit
      // pattern of 1.0, so the quotient below becomes 1.0 instead of NaN.
      let minus_one = -Self::ONE;
      x2 = both_infinite.blend(x2 & minus_one, x2);
      y2 = both_infinite.blend(y2 & minus_one, y2);
    }

    // x = y = 0 will produce NAN. No problem, fixed below
    let t = y2 / x2;

    // small:  z = t / 1.0;
    // medium: z = (t-1.0) / (t+1.0);
    let notsmal = t.simd_ge(Self::SQRT_2 - Self::ONE);

    let a = notsmal.blend(t - Self::ONE, t);
    let b = notsmal.blend(t + Self::ONE, Self::ONE);
    // offset added back after the polynomial: PI/4 for medium lanes, else 0
    let s = notsmal & Self::FRAC_PI_4;
    let z = a / b;

    let zz = z * z;

    // Polynomial approximation in z^2
    let mut re = polynomial_3!(zz, P0atanf, P1atanf, P2atanf, P3atanf);
    re = re.mul_add(zz * z, z) + s;

    // move back in place: undo the x/y swap, force the all-zero case to 0,
    // then reflect lanes whose x has its sign bit set
    re = swapxy.blend(Self::FRAC_PI_2 - re, re);
    re = ((x | y).simd_eq(Self::ZERO)).blend(Self::ZERO, re);
    re = (x.sign_bit()).blend(Self::PI - re, re);

    // get sign bit: the result takes the sign of y
    re = (y.sign_bit()).blend(-re, re);

    re
  }
1240
  /// Computes the sine and cosine of every lane in one pass, returned as
  /// `(sin, cos)`.
  ///
  /// Finite lanes whose quadrant index exceeds `0x2000000` are treated as
  /// overflowed and yield `sin = 0.0` (before the sign is applied) and
  /// `cos = 1.0` (likewise before the sign is applied).
  #[inline]
  #[must_use]
  pub fn sin_cos(self) -> (Self, Self) {
    // Based on the Agner Fog "vector class library":
    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h

    // Three constants subtracted in sequence during argument reduction
    // (presumably a split representation of PI/2 — TODO confirm).
    const_f32_as_f32x4!(DP1F, 0.78515625_f32 * 2.0);
    const_f32_as_f32x4!(DP2F, 2.4187564849853515625E-4_f32 * 2.0);
    const_f32_as_f32x4!(DP3F, 3.77489497744594108E-8_f32 * 2.0);

    // Polynomial coefficients for the sine approximation.
    const_f32_as_f32x4!(P0sinf, -1.6666654611E-1);
    const_f32_as_f32x4!(P1sinf, 8.3321608736E-3);
    const_f32_as_f32x4!(P2sinf, -1.9515295891E-4);

    // Polynomial coefficients for the cosine approximation.
    const_f32_as_f32x4!(P0cosf, 4.166664568298827E-2);
    const_f32_as_f32x4!(P1cosf, -1.388731625493765E-3);
    const_f32_as_f32x4!(P2cosf, 2.443315711809948E-5);

    const_f32_as_f32x4!(TWO_OVER_PI, 2.0 / core::f32::consts::PI);

    let xa = self.abs();

    // Find quadrant
    let y = (xa * TWO_OVER_PI).round();
    let q: i32x4 = y.round_int();

    // Reduce the argument: x = xa - y*DP1F - y*DP2F - y*DP3F
    let x = y.mul_neg_add(DP3F, y.mul_neg_add(DP2F, y.mul_neg_add(DP1F, xa)));

    let x2 = x * x;
    let mut s = polynomial_2!(x2, P0sinf, P1sinf, P2sinf) * (x * x2) + x;
    let mut c = polynomial_2!(x2, P0cosf, P1cosf, P2cosf) * (x2 * x2)
      + f32x4::from(0.5).mul_neg_add(x2, f32x4::from(1.0));

    // Odd quadrants exchange the roles of the sine and cosine polynomials.
    let swap = !(q & i32x4::from(1)).simd_eq(i32x4::from(0));

    // Very large finite inputs: fall back to fixed values.
    let mut overflow: f32x4 = cast(q.simd_gt(i32x4::from(0x2000000)));
    overflow &= xa.is_finite();
    s = overflow.blend(f32x4::from(0.0), s);
    c = overflow.blend(f32x4::from(1.0), c);

    // calc sin
    let mut sin1 = cast::<_, f32x4>(swap).blend(c, s);
    // Bit 1 of the quadrant moves into the sign position and is combined
    // with the sign of the original input.
    let sign_sin: i32x4 = (q << 30) ^ cast::<_, i32x4>(self);
    sin1 = sin1.flip_signs(cast(sign_sin));

    // calc cos
    let mut cos1 = cast::<_, f32x4>(swap).blend(s, c);
    // Bit 1 of (q + 1) moves into the sign position.
    let sign_cos: i32x4 = ((q + i32x4::from(1)) & i32x4::from(2)) << 30;
    cos1 ^= cast::<_, f32x4>(sign_cos);

    (sin1, cos1)
  }
1293
1294  #[inline]
1295  #[must_use]
1296  pub fn sin(self) -> Self {
1297    let (s, _) = self.sin_cos();
1298    s
1299  }
1300  #[inline]
1301  #[must_use]
1302  pub fn cos(self) -> Self {
1303    let (_, c) = self.sin_cos();
1304    c
1305  }
1306  #[inline]
1307  #[must_use]
1308  pub fn tan(self) -> Self {
1309    let (s, c) = self.sin_cos();
1310    s / c
1311  }
1312  #[inline]
1313  #[must_use]
1314  pub fn to_degrees(self) -> Self {
1315    const_f32_as_f32x4!(RAD_TO_DEG_RATIO, 180.0_f32 / core::f32::consts::PI);
1316    self * RAD_TO_DEG_RATIO
1317  }
1318  #[inline]
1319  #[must_use]
1320  pub fn to_radians(self) -> Self {
1321    const_f32_as_f32x4!(DEG_TO_RAD_RATIO, core::f32::consts::PI / 180.0_f32);
1322    self * DEG_TO_RAD_RATIO
1323  }
1324  #[inline]
1325  #[must_use]
1326  pub fn recip(self) -> Self {
1327    pick! {
1328      if #[cfg(target_feature="sse")] {
1329        Self { sse: reciprocal_m128(self.sse) }
1330      } else if #[cfg(target_feature="simd128")] {
1331        Self { simd: f32x4_div(f32x4_splat(1.0), self.simd) }
1332      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
1333        unsafe {Self { neon: vdivq_f32(vdupq_n_f32(1.0), self.neon) }}
1334      } else {
1335        Self { arr: [
1336          1.0 / self.arr[0],
1337          1.0 / self.arr[1],
1338          1.0 / self.arr[2],
1339          1.0 / self.arr[3],
1340        ]}
1341      }
1342    }
1343  }
1344  #[inline]
1345  #[must_use]
1346  pub fn recip_sqrt(self) -> Self {
1347    pick! {
1348      if #[cfg(target_feature="sse")] {
1349        Self { sse: reciprocal_sqrt_m128(self.sse) }
1350      } else if #[cfg(target_feature="simd128")] {
1351        Self { simd: f32x4_div(f32x4_splat(1.0), f32x4_sqrt(self.simd)) }
1352      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
1353        unsafe {Self { neon: vdivq_f32(vdupq_n_f32(1.0), vsqrtq_f32(self.neon)) }}
1354      } else if #[cfg(feature="std")] {
1355        Self { arr: [
1356          1.0 / self.arr[0].sqrt(),
1357          1.0 / self.arr[1].sqrt(),
1358          1.0 / self.arr[2].sqrt(),
1359          1.0 / self.arr[3].sqrt(),
1360        ]}
1361      } else {
1362        Self { arr: [
1363          1.0 / software_sqrt(self.arr[0] as f64) as f32,
1364          1.0 / software_sqrt(self.arr[1] as f64) as f32,
1365          1.0 / software_sqrt(self.arr[2] as f64) as f32,
1366          1.0 / software_sqrt(self.arr[3] as f64) as f32,
1367        ]}
1368      }
1369    }
1370  }
1371  #[inline]
1372  #[must_use]
1373  pub fn sqrt(self) -> Self {
1374    pick! {
1375      if #[cfg(target_feature="sse")] {
1376        Self { sse: sqrt_m128(self.sse) }
1377      } else if #[cfg(target_feature="simd128")] {
1378        Self { simd: f32x4_sqrt(self.simd) }
1379      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
1380        unsafe {Self { neon: vsqrtq_f32(self.neon) }}
1381      } else if #[cfg(feature="std")] {
1382        Self { arr: [
1383          self.arr[0].sqrt(),
1384          self.arr[1].sqrt(),
1385          self.arr[2].sqrt(),
1386          self.arr[3].sqrt(),
1387        ]}
1388      } else {
1389        Self { arr: [
1390          software_sqrt(self.arr[0] as f64) as f32,
1391          software_sqrt(self.arr[1] as f64) as f32,
1392          software_sqrt(self.arr[2] as f64) as f32,
1393          software_sqrt(self.arr[3] as f64) as f32,
1394        ]}
1395      }
1396    }
1397  }
1398
1399  #[inline]
1400  #[must_use]
1401  #[doc(alias("movemask", "move_mask"))]
1402  pub fn to_bitmask(self) -> u32 {
1403    pick! {
1404      if #[cfg(target_feature="sse")] {
1405        move_mask_m128(self.sse) as u32
1406      } else if #[cfg(target_feature="simd128")] {
1407        u32x4_bitmask(self.simd) as u32
1408      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
1409        unsafe
1410        {
1411          // set all to 1 if top bit is set, else 0
1412          let masked = vcltq_s32( vreinterpretq_s32_f32(self.neon), vdupq_n_s32(0));
1413
1414          // select the right bit out of each lane
1415          let selectbit : uint32x4_t = core::mem::transmute([1u32, 2, 4, 8]);
1416          let r = vandq_u32(masked, selectbit);
1417
1418          // horizontally add the 16-bit lanes
1419          vaddvq_u32(r) as u32
1420        }
1421      } else {
1422        (((self.arr[0].to_bits() as i32) < 0) as u32) << 0 |
1423        (((self.arr[1].to_bits() as i32) < 0) as u32) << 1 |
1424        (((self.arr[2].to_bits() as i32) < 0) as u32) << 2 |
1425        (((self.arr[3].to_bits() as i32) < 0) as u32) << 3
1426      }
1427    }
1428  }
1429  #[inline]
1430  #[must_use]
1431  pub fn any(self) -> bool {
1432    pick! {
1433      if #[cfg(target_feature="simd128")] {
1434        v128_any_true(self.simd)
1435      } else {
1436        self.to_bitmask() != 0
1437      }
1438    }
1439  }
1440  #[inline]
1441  #[must_use]
1442  pub fn all(self) -> bool {
1443    pick! {
1444      if #[cfg(target_feature="simd128")] {
1445        u32x4_all_true(self.simd)
1446      } else {
1447        // four lanes
1448        self.to_bitmask() == 0b1111
1449      }
1450    }
1451  }
1452  #[inline]
1453  #[must_use]
1454  pub fn none(self) -> bool {
1455    !self.any()
1456  }
1457
1458  #[inline]
1459  fn vm_pow2n(self) -> Self {
1460    const_f32_as_f32x4!(pow2_23, 8388608.0);
1461    const_f32_as_f32x4!(bias, 127.0);
1462    let a = self + (bias + pow2_23);
1463    let c = cast::<_, i32x4>(a) << 23;
1464    cast::<_, f32x4>(c)
1465  }
1466
1467  /// Calculate the exponent of a packed `f32x4`
1468  #[inline]
1469  #[must_use]
1470  pub fn exp(self) -> Self {
1471    const_f32_as_f32x4!(P0, 1.0 / 2.0);
1472    const_f32_as_f32x4!(P1, 1.0 / 6.0);
1473    const_f32_as_f32x4!(P2, 1. / 24.);
1474    const_f32_as_f32x4!(P3, 1. / 120.);
1475    const_f32_as_f32x4!(P4, 1. / 720.);
1476    const_f32_as_f32x4!(P5, 1. / 5040.);
1477    const_f32_as_f32x4!(LN2D_HI, 0.693359375);
1478    const_f32_as_f32x4!(LN2D_LO, -2.12194440e-4);
1479    let max_x = f32x4::from(87.3);
1480    let r = (self * Self::LOG2_E).round();
1481    let x = r.mul_neg_add(LN2D_HI, self);
1482    let x = r.mul_neg_add(LN2D_LO, x);
1483    let z = polynomial_5!(x, P0, P1, P2, P3, P4, P5);
1484    let x2 = x * x;
1485    let z = z.mul_add(x2, x);
1486    let n2 = Self::vm_pow2n(r);
1487    let z = (z + Self::ONE) * n2;
1488    // check for overflow
1489    let in_range = self.abs().simd_lt(max_x);
1490    let in_range = in_range & self.is_finite();
1491    in_range.blend(z, Self::ZERO)
1492  }
1493
1494  #[inline]
1495  fn exponent(self) -> f32x4 {
1496    const_f32_as_f32x4!(pow2_23, 8388608.0);
1497    const_f32_as_f32x4!(bias, 127.0);
1498    let a = cast::<_, u32x4>(self);
1499    let b = a >> 23;
1500    let c = b | cast::<_, u32x4>(pow2_23);
1501    let d = cast::<_, f32x4>(c);
1502    let e = d - (pow2_23 + bias);
1503    e
1504  }
1505
1506  #[inline]
1507  fn fraction_2(self) -> Self {
1508    let t1 = cast::<_, u32x4>(self);
1509    let t2 = cast::<_, u32x4>(
1510      (t1 & u32x4::from(0x007FFFFF)) | u32x4::from(0x3F000000),
1511    );
1512    cast::<_, f32x4>(t2)
1513  }
1514  #[inline]
1515  fn is_zero_or_subnormal(self) -> Self {
1516    let t = cast::<_, i32x4>(self);
1517    let t = t & i32x4::splat(0x7F800000);
1518    i32x4::round_float(t.simd_eq(i32x4::splat(0)))
1519  }
1520  #[inline]
1521  fn infinity() -> Self {
1522    cast::<_, f32x4>(i32x4::splat(0x7F800000))
1523  }
1524  #[inline]
1525  fn nan_log() -> Self {
1526    cast::<_, f32x4>(i32x4::splat(0x7FC00000 | 0x101 & 0x003FFFFF))
1527  }
1528  #[inline]
1529  fn nan_pow() -> Self {
1530    cast::<_, f32x4>(i32x4::splat(0x7FC00000 | 0x101 & 0x003FFFFF))
1531  }
1532  #[inline]
1533  pub fn sign_bit(self) -> Self {
1534    let t1 = cast::<_, i32x4>(self);
1535    let t2 = t1 >> 31;
1536    !cast::<_, f32x4>(t2).simd_eq(f32x4::ZERO)
1537  }
1538
1539  /// horizontal add of all the elements of the vector
1540  #[inline]
1541  #[must_use]
1542  pub fn reduce_add(self) -> f32 {
1543    let arr: [f32; 4] = cast(self);
1544    arr.iter().sum()
1545  }
1546
1547  /// Natural log (ln(x))
1548  #[inline]
1549  #[must_use]
1550  pub fn ln(self) -> Self {
1551    const_f32_as_f32x4!(HALF, 0.5);
1552    const_f32_as_f32x4!(P0, 3.3333331174E-1);
1553    const_f32_as_f32x4!(P1, -2.4999993993E-1);
1554    const_f32_as_f32x4!(P2, 2.0000714765E-1);
1555    const_f32_as_f32x4!(P3, -1.6668057665E-1);
1556    const_f32_as_f32x4!(P4, 1.4249322787E-1);
1557    const_f32_as_f32x4!(P5, -1.2420140846E-1);
1558    const_f32_as_f32x4!(P6, 1.1676998740E-1);
1559    const_f32_as_f32x4!(P7, -1.1514610310E-1);
1560    const_f32_as_f32x4!(P8, 7.0376836292E-2);
1561    const_f32_as_f32x4!(LN2F_HI, 0.693359375);
1562    const_f32_as_f32x4!(LN2F_LO, -2.12194440e-4);
1563    const_f32_as_f32x4!(VM_SMALLEST_NORMAL, 1.17549435E-38);
1564
1565    let x1 = self;
1566    let x = Self::fraction_2(x1);
1567    let e = Self::exponent(x1);
1568    let mask = x.simd_gt(Self::SQRT_2 * HALF);
1569    let x = (!mask).blend(x + x, x);
1570    let fe = mask.blend(e + Self::ONE, e);
1571    let x = x - Self::ONE;
1572    let res = polynomial_8!(x, P0, P1, P2, P3, P4, P5, P6, P7, P8);
1573    let x2 = x * x;
1574    let res = x2 * x * res;
1575    let res = fe.mul_add(LN2F_LO, res);
1576    let res = res + x2.mul_neg_add(HALF, x);
1577    let res = fe.mul_add(LN2F_HI, res);
1578    let overflow = !self.is_finite();
1579    let underflow = x1.simd_lt(VM_SMALLEST_NORMAL);
1580    let mask = overflow | underflow;
1581    if !mask.any() {
1582      res
1583    } else {
1584      let is_zero = self.is_zero_or_subnormal();
1585      let res = underflow.blend(Self::nan_log(), res);
1586      let res = is_zero.blend(Self::infinity(), res);
1587      let res = overflow.blend(self, res);
1588      res
1589    }
1590  }
1591
1592  #[inline]
1593  #[must_use]
1594  pub fn log2(self) -> Self {
1595    Self::ln(self) * Self::LOG2_E
1596  }
1597  #[inline]
1598  #[must_use]
1599  pub fn log10(self) -> Self {
1600    Self::ln(self) * Self::LOG10_E
1601  }
1602
1603  #[inline]
1604  #[must_use]
1605  pub fn pow_f32x4(self, y: f32x4) -> Self {
1606    const_f32_as_f32x4!(ln2f_hi, 0.693359375);
1607    const_f32_as_f32x4!(ln2f_lo, -2.12194440e-4);
1608    const_f32_as_f32x4!(P0logf, 3.3333331174E-1);
1609    const_f32_as_f32x4!(P1logf, -2.4999993993E-1);
1610    const_f32_as_f32x4!(P2logf, 2.0000714765E-1);
1611    const_f32_as_f32x4!(P3logf, -1.6668057665E-1);
1612    const_f32_as_f32x4!(P4logf, 1.4249322787E-1);
1613    const_f32_as_f32x4!(P5logf, -1.2420140846E-1);
1614    const_f32_as_f32x4!(P6logf, 1.1676998740E-1);
1615    const_f32_as_f32x4!(P7logf, -1.1514610310E-1);
1616    const_f32_as_f32x4!(P8logf, 7.0376836292E-2);
1617
1618    const_f32_as_f32x4!(p2expf, 1.0 / 2.0); // coefficients for Taylor expansion of exp
1619    const_f32_as_f32x4!(p3expf, 1.0 / 6.0);
1620    const_f32_as_f32x4!(p4expf, 1.0 / 24.0);
1621    const_f32_as_f32x4!(p5expf, 1.0 / 120.0);
1622    const_f32_as_f32x4!(p6expf, 1.0 / 720.0);
1623    const_f32_as_f32x4!(p7expf, 1.0 / 5040.0);
1624
1625    let x1 = self.abs();
1626    let x = x1.fraction_2();
1627
1628    let mask = x.simd_gt(f32x4::SQRT_2 * f32x4::HALF);
1629    let x = (!mask).blend(x + x, x);
1630
1631    let x = x - f32x4::ONE;
1632    let x2 = x * x;
1633    let lg1 = polynomial_8!(
1634      x, P0logf, P1logf, P2logf, P3logf, P4logf, P5logf, P6logf, P7logf, P8logf
1635    );
1636    let lg1 = lg1 * x2 * x;
1637
1638    let ef = x1.exponent();
1639    let ef = mask.blend(ef + f32x4::ONE, ef);
1640
1641    let e1 = (ef * y).round();
1642    let yr = ef.mul_sub(y, e1);
1643
1644    let lg = f32x4::HALF.mul_neg_add(x2, x) + lg1;
1645    let x2_err = (f32x4::HALF * x).mul_sub(x, f32x4::HALF * x2);
1646    let lg_err = f32x4::HALF.mul_add(x2, lg - x) - lg1;
1647
1648    let e2 = (lg * y * f32x4::LOG2_E).round();
1649    let v = lg.mul_sub(y, e2 * ln2f_hi);
1650    let v = e2.mul_neg_add(ln2f_lo, v);
1651    let v = v - (lg_err + x2_err).mul_sub(y, yr * f32x4::LN_2);
1652
1653    let x = v;
1654    let e3 = (x * f32x4::LOG2_E).round();
1655    let x = e3.mul_neg_add(f32x4::LN_2, x);
1656    let x2 = x * x;
1657    let z = x2.mul_add(
1658      polynomial_5!(x, p2expf, p3expf, p4expf, p5expf, p6expf, p7expf),
1659      x + f32x4::ONE,
1660    );
1661
1662    let ee = e1 + e2 + e3;
1663    let ei = cast::<_, i32x4>(ee.round_int());
1664    let ej = cast::<_, i32x4>(ei + (cast::<_, i32x4>(z) >> 23));
1665
1666    let overflow = cast::<_, f32x4>(ej.simd_gt(i32x4::splat(0x0FF)))
1667      | (ee.simd_gt(f32x4::splat(300.0)));
1668    let underflow = cast::<_, f32x4>(ej.simd_lt(i32x4::splat(0x000)))
1669      | (ee.simd_lt(f32x4::splat(-300.0)));
1670
1671    // Add exponent by integer addition
1672    let z = cast::<_, f32x4>(cast::<_, i32x4>(z) + (ei << 23));
1673
1674    // Check for overflow/underflow
1675    let z = if (overflow | underflow).any() {
1676      let z = underflow.blend(f32x4::ZERO, z);
1677      overflow.blend(Self::infinity(), z)
1678    } else {
1679      z
1680    };
1681
1682    // Check for self == 0
1683    let x_zero = self.is_zero_or_subnormal();
1684    let z = x_zero.blend(
1685      y.simd_lt(f32x4::ZERO).blend(
1686        Self::infinity(),
1687        y.simd_eq(f32x4::ZERO).blend(f32x4::ONE, f32x4::ZERO),
1688      ),
1689      z,
1690    );
1691
1692    let x_sign = self.sign_bit();
1693    let z = if x_sign.any() {
1694      // Y into an integer
1695      let yi = y.simd_eq(y.round());
1696      // Is y odd?
1697      let y_odd = cast::<_, i32x4>(y.round_int() << 31).round_float();
1698
1699      let z1 =
1700        yi.blend(z | y_odd, self.simd_eq(Self::ZERO).blend(z, Self::nan_pow()));
1701      x_sign.blend(z1, z)
1702    } else {
1703      z
1704    };
1705
1706    let x_finite = self.is_finite();
1707    let y_finite = y.is_finite();
1708    let e_finite = ee.is_finite();
1709    if (x_finite & y_finite & (e_finite | x_zero)).all() {
1710      return z;
1711    }
1712
1713    (self.is_nan() | y.is_nan()).blend(self + y, z)
1714  }
1715
1716  #[inline]
1717  pub fn powf(self, y: f32) -> Self {
1718    Self::pow_f32x4(self, f32x4::splat(y))
1719  }
1720
1721  #[must_use]
1722  #[inline]
1723  pub fn unpack_lo(self, b: Self) -> Self {
1724    pick! {
1725      if #[cfg(target_feature="sse")] {
1726        Self { sse: unpack_low_m128(self.sse, b.sse) }
1727      } else if #[cfg(target_feature="simd128")] {
1728        Self {
1729          simd: u32x4_shuffle::<0, 4, 1, 5>(self.simd, b.simd)
1730        }
1731      } else if #[cfg(all(target_feature="neon", target_arch="aarch64"))]{
1732        unsafe {Self { neon: vzip1q_f32(self.neon, b.neon) }}
1733      } else {
1734        Self { arr: [
1735          self.arr[0],
1736          b.arr[0],
1737          self.arr[1],
1738          b.arr[1],
1739        ]}
1740      }
1741    }
1742  }
1743
1744  #[must_use]
1745  #[inline]
1746  pub fn unpack_hi(self, b: Self) -> Self {
1747    pick! {
1748      if #[cfg(target_feature="sse")] {
1749        Self { sse: unpack_high_m128(self.sse, b.sse) }
1750      } else if #[cfg(target_feature="simd128")] {
1751        Self {
1752          simd: u32x4_shuffle::<2, 6, 3, 7>(self.simd, b.simd)
1753        }
1754      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
1755        unsafe {Self { neon: vzip2q_f32(self.neon, b.neon) }}
1756      } else {
1757        Self { arr: [
1758          self.arr[2],
1759          b.arr[2],
1760          self.arr[3],
1761          b.arr[3],
1762        ]}
1763      }
1764    }
1765  }
1766
1767  /// Transpose matrix of 4x4 `f32` matrix. Currently only accelerated on SSE.
1768  #[must_use]
1769  #[inline]
1770  pub fn transpose(data: [f32x4; 4]) -> [f32x4; 4] {
1771    pick! {
1772      if #[cfg(target_feature="sse")] {
1773        let mut e0 = data[0];
1774        let mut e1 = data[1];
1775        let mut e2 = data[2];
1776        let mut e3 = data[3];
1777
1778        transpose_four_m128(&mut e0.sse, &mut e1.sse, &mut e2.sse, &mut e3.sse);
1779
1780        [e0, e1, e2, e3]
1781      } else if #[cfg(any(all(target_feature="neon",target_arch="aarch64"), target_feature="simd128"))] {
1782        let a = data[0].unpack_lo(data[2]);
1783        let b = data[1].unpack_lo(data[3]);
1784        let c = data[0].unpack_hi(data[2]);
1785        let d = data[1].unpack_hi(data[3]);
1786
1787        [
1788          a.unpack_lo(b),
1789          a.unpack_hi(b),
1790          c.unpack_lo(d),
1791          c.unpack_hi(d),
1792        ]
1793      } else {
1794        #[inline(always)]
1795        fn transpose_column(data: &[f32x4; 4], index: usize) -> f32x4 {
1796          f32x4::new([
1797            data[0].as_array()[index],
1798            data[1].as_array()[index],
1799            data[2].as_array()[index],
1800            data[3].as_array()[index],
1801          ])
1802        }
1803
1804        [
1805          transpose_column(&data, 0),
1806          transpose_column(&data, 1),
1807          transpose_column(&data, 2),
1808          transpose_column(&data, 3),
1809        ]
1810      }
1811    }
1812  }
1813
1814  #[inline]
1815  pub fn to_array(self) -> [f32; 4] {
1816    cast(self)
1817  }
1818
1819  #[inline]
1820  pub fn as_array(&self) -> &[f32; 4] {
1821    cast_ref(self)
1822  }
1823
1824  #[inline]
1825  pub fn as_mut_array(&mut self) -> &mut [f32; 4] {
1826    cast_mut(self)
1827  }
1828
1829  #[inline]
1830  pub fn from_i32x4(v: i32x4) -> Self {
1831    pick! {
1832      if #[cfg(target_feature="sse2")] {
1833        Self { sse: convert_to_m128_from_i32_m128i(v.sse) }
1834      } else if #[cfg(target_feature="simd128")] {
1835        Self { simd: f32x4_convert_i32x4(v.simd) }
1836      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
1837        Self { neon: unsafe { vcvtq_f32_s32(v.neon) }}
1838      } else {
1839        Self { arr: [
1840            v.as_array()[0] as f32,
1841            v.as_array()[1] as f32,
1842            v.as_array()[2] as f32,
1843            v.as_array()[3] as f32,
1844          ] }
1845      }
1846    }
1847  }
1848}