1use super::*;
2
3pick! {
4 if #[cfg(target_feature="avx512f")] {
5 #[derive(Default, Clone, Copy, PartialEq)]
6 #[repr(C, align(64))]
7 pub struct f32x16 { pub(crate) avx512: m512 }
8 } else {
9 #[derive(Default, Clone, Copy, PartialEq)]
10 #[repr(C, align(64))]
11 pub struct f32x16 { pub(crate) a : f32x8, pub(crate) b : f32x8 }
12 }
13}
14
15macro_rules! const_f32_as_f32x16 {
16 ($i:ident, $f:expr) => {
17 #[allow(non_upper_case_globals)]
18 pub const $i: f32x16 = f32x16::new([$f; 16]);
19 };
20}
21
22impl f32x16 {
23 const_f32_as_f32x16!(ONE, 1.0);
24 const_f32_as_f32x16!(HALF, 0.5);
25 const_f32_as_f32x16!(ZERO, 0.0);
26 const_f32_as_f32x16!(E, core::f32::consts::E);
27 const_f32_as_f32x16!(FRAC_1_PI, core::f32::consts::FRAC_1_PI);
28 const_f32_as_f32x16!(FRAC_2_PI, core::f32::consts::FRAC_2_PI);
29 const_f32_as_f32x16!(FRAC_2_SQRT_PI, core::f32::consts::FRAC_2_SQRT_PI);
30 const_f32_as_f32x16!(FRAC_1_SQRT_2, core::f32::consts::FRAC_1_SQRT_2);
31 const_f32_as_f32x16!(FRAC_PI_2, core::f32::consts::FRAC_PI_2);
32 const_f32_as_f32x16!(FRAC_PI_3, core::f32::consts::FRAC_PI_3);
33 const_f32_as_f32x16!(FRAC_PI_4, core::f32::consts::FRAC_PI_4);
34 const_f32_as_f32x16!(FRAC_PI_6, core::f32::consts::FRAC_PI_6);
35 const_f32_as_f32x16!(FRAC_PI_8, core::f32::consts::FRAC_PI_8);
36 const_f32_as_f32x16!(LN_2, core::f32::consts::LN_2);
37 const_f32_as_f32x16!(LN_10, core::f32::consts::LN_10);
38 const_f32_as_f32x16!(LOG2_E, core::f32::consts::LOG2_E);
39 const_f32_as_f32x16!(LOG10_E, core::f32::consts::LOG10_E);
40 const_f32_as_f32x16!(LOG10_2, core::f32::consts::LOG10_2);
41 const_f32_as_f32x16!(LOG2_10, core::f32::consts::LOG2_10);
42 const_f32_as_f32x16!(PI, core::f32::consts::PI);
43 const_f32_as_f32x16!(SQRT_2, core::f32::consts::SQRT_2);
44 const_f32_as_f32x16!(TAU, core::f32::consts::TAU);
45}
46
47unsafe impl Zeroable for f32x16 {}
48unsafe impl Pod for f32x16 {}
49
50impl AlignTo for f32x16 {
51 type Elem = f32;
52}
53
54impl Add for f32x16 {
55 type Output = Self;
56 #[inline]
57 fn add(self, rhs: Self) -> Self::Output {
58 pick! {
59 if #[cfg(target_feature="avx512f")] {
60 Self { avx512: add_m512(self.avx512, rhs.avx512) }
61 } else {
62 Self {
63 a : self.a.add(rhs.a),
64 b : self.b.add(rhs.b),
65 }
66 }
67 }
68 }
69}
70
71impl Sub for f32x16 {
72 type Output = Self;
73 #[inline]
74 fn sub(self, rhs: Self) -> Self::Output {
75 pick! {
76 if #[cfg(target_feature="avx512f")] {
77 Self { avx512: sub_m512(self.avx512, rhs.avx512) }
78 } else {
79 Self {
80 a : self.a.sub(rhs.a),
81 b : self.b.sub(rhs.b),
82 }
83 }
84 }
85 }
86}
87
88impl Mul for f32x16 {
89 type Output = Self;
90 #[inline]
91 fn mul(self, rhs: Self) -> Self::Output {
92 pick! {
93 if #[cfg(target_feature="avx512f")] {
94 Self { avx512: mul_m512(self.avx512, rhs.avx512) }
95 } else {
96 Self { a: self.a.mul(rhs.a), b: self.b.mul(rhs.b) }
97 }
98 }
99 }
100}
101
102impl Div for f32x16 {
103 type Output = Self;
104 #[inline]
105 fn div(self, rhs: Self) -> Self::Output {
106 pick! {
107 if #[cfg(target_feature="avx512f")] {
108 Self { avx512: div_m512(self.avx512, rhs.avx512) }
109 } else {
110 Self { a: self.a.div(rhs.a), b: self.b.div(rhs.b) }
111 }
112 }
113 }
114}
115
116impl Neg for f32x16 {
117 type Output = Self;
118 #[inline]
119 fn neg(self) -> Self::Output {
120 pick! {
121 if #[cfg(target_feature="avx512f")] {
122 Self { avx512: bitxor_m512(self.avx512, Self::splat(-0.0).avx512) }
123 } else {
124 Self {
125 a : self.a.neg(),
126 b : self.b.neg(),
127 }
128 }
129 }
130 }
131}
132
133impl Add<f32> for f32x16 {
134 type Output = Self;
135 #[inline]
136 fn add(self, rhs: f32) -> Self::Output {
137 self.add(Self::splat(rhs))
138 }
139}
140
141impl Sub<f32> for f32x16 {
142 type Output = Self;
143 #[inline]
144 fn sub(self, rhs: f32) -> Self::Output {
145 self.sub(Self::splat(rhs))
146 }
147}
148
149impl Mul<f32> for f32x16 {
150 type Output = Self;
151 #[inline]
152 fn mul(self, rhs: f32) -> Self::Output {
153 self.mul(Self::splat(rhs))
154 }
155}
156
157impl Div<f32> for f32x16 {
158 type Output = Self;
159 #[inline]
160 fn div(self, rhs: f32) -> Self::Output {
161 self.div(Self::splat(rhs))
162 }
163}
164
165impl Add<f32x16> for f32 {
166 type Output = f32x16;
167 #[inline]
168 fn add(self, rhs: f32x16) -> Self::Output {
169 f32x16::splat(self).add(rhs)
170 }
171}
172
173impl Sub<f32x16> for f32 {
174 type Output = f32x16;
175 #[inline]
176 fn sub(self, rhs: f32x16) -> Self::Output {
177 f32x16::splat(self).sub(rhs)
178 }
179}
180
181impl Mul<f32x16> for f32 {
182 type Output = f32x16;
183 #[inline]
184 fn mul(self, rhs: f32x16) -> Self::Output {
185 f32x16::splat(self).mul(rhs)
186 }
187}
188
189impl Div<f32x16> for f32 {
190 type Output = f32x16;
191 #[inline]
192 fn div(self, rhs: f32x16) -> Self::Output {
193 f32x16::splat(self).div(rhs)
194 }
195}
196
197impl BitAnd for f32x16 {
198 type Output = Self;
199 #[inline]
200 fn bitand(self, rhs: Self) -> Self::Output {
201 pick! {
202 if #[cfg(target_feature="avx512f")] {
203 Self { avx512: bitand_m512(self.avx512, rhs.avx512) }
204 } else {
205 Self {
206 a : self.a.bitand(rhs.a),
207 b : self.b.bitand(rhs.b),
208 }
209 }
210 }
211 }
212}
213
214impl BitOr for f32x16 {
215 type Output = Self;
216 #[inline]
217 fn bitor(self, rhs: Self) -> Self::Output {
218 pick! {
219 if #[cfg(target_feature="avx512f")] {
220 Self { avx512: bitor_m512(self.avx512, rhs.avx512) }
221 } else {
222 Self {
223 a : self.a.bitor(rhs.a),
224 b : self.b.bitor(rhs.b),
225 }
226 }
227 }
228 }
229}
230
231impl BitXor for f32x16 {
232 type Output = Self;
233 #[inline]
234 fn bitxor(self, rhs: Self) -> Self::Output {
235 pick! {
236 if #[cfg(target_feature="avx512f")] {
237 Self { avx512: bitxor_m512(self.avx512, rhs.avx512) }
238 } else {
239 Self {
240 a : self.a.bitxor(rhs.a),
241 b : self.b.bitxor(rhs.b),
242 }
243 }
244 }
245 }
246}
247
248impl CmpEq for f32x16 {
249 type Output = Self;
250 #[inline]
251 fn simd_eq(self, rhs: Self) -> Self::Output {
252 pick! {
253 if #[cfg(target_feature="avx512f")] {
254 Self { avx512: cmp_op_mask_m512::<{cmp_op!(EqualOrdered)}>(self.avx512, rhs.avx512) }
255 } else {
256 Self {
257 a : self.a.simd_eq(rhs.a),
258 b : self.b.simd_eq(rhs.b),
259 }
260 }
261 }
262 }
263}
264
265impl CmpGt for f32x16 {
266 type Output = Self;
267 #[inline]
268 fn simd_gt(self, rhs: Self) -> Self::Output {
269 pick! {
270 if #[cfg(target_feature="avx512f")] {
271 Self { avx512: cmp_op_mask_m512::<{cmp_op!(GreaterThanOrdered)}>(self.avx512, rhs.avx512) }
272 } else {
273 Self {
274 a : self.a.simd_gt(rhs.a),
275 b : self.b.simd_gt(rhs.b),
276 }
277 }
278 }
279 }
280}
281
282impl CmpGe for f32x16 {
283 type Output = Self;
284 #[inline]
285 fn simd_ge(self, rhs: Self) -> Self::Output {
286 pick! {
287 if #[cfg(target_feature="avx512f")] {
288 Self { avx512: cmp_op_mask_m512::<{cmp_op!(GreaterEqualOrdered)}>(self.avx512, rhs.avx512) }
289 } else {
290 Self {
291 a : self.a.simd_ge(rhs.a),
292 b : self.b.simd_ge(rhs.b),
293 }
294 }
295 }
296 }
297}
298
299impl CmpLt for f32x16 {
300 type Output = Self;
301 #[inline]
302 fn simd_lt(self, rhs: Self) -> Self::Output {
303 pick! {
304 if #[cfg(target_feature="avx512f")] {
305 Self { avx512: cmp_op_mask_m512::<{cmp_op!(LessThanOrdered)}>(self.avx512, rhs.avx512) }
306 } else {
307 Self {
308 a : self.a.simd_lt(rhs.a),
309 b : self.b.simd_lt(rhs.b),
310 }
311 }
312 }
313 }
314}
315
316impl CmpLe for f32x16 {
317 type Output = Self;
318 #[inline]
319 fn simd_le(self, rhs: Self) -> Self::Output {
320 pick! {
321 if #[cfg(target_feature="avx512f")] {
322 Self { avx512: cmp_op_mask_m512::<{cmp_op!(LessEqualOrdered)}>(self.avx512, rhs.avx512) }
323 } else {
324 Self {
325 a : self.a.simd_le(rhs.a),
326 b : self.b.simd_le(rhs.b),
327 }
328 }
329 }
330 }
331}
332
333impl CmpNe for f32x16 {
334 type Output = Self;
335 #[inline]
336 fn simd_ne(self, rhs: Self) -> Self::Output {
337 pick! {
338 if #[cfg(target_feature="avx512f")] {
339 Self { avx512: cmp_op_mask_m512::<{cmp_op!(NotEqualOrdered)}>(self.avx512, rhs.avx512) }
340 } else {
341 Self {
342 a : self.a.simd_ne(rhs.a),
343 b : self.b.simd_ne(rhs.b),
344 }
345 }
346 }
347 }
348}
349
350impl f32x16 {
351 #[inline]
352 #[must_use]
353 pub const fn new(array: [f32; 16]) -> Self {
354 unsafe { core::mem::transmute(array) }
355 }
356
357 #[inline]
358 #[must_use]
359 pub fn blend(self, t: Self, f: Self) -> Self {
360 pick! {
361 if #[cfg(target_feature="avx512f")] {
362 Self { avx512: blend_varying_m512(f.avx512, t.avx512, movepi32_mask_m512(self.avx512)) }
363 } else {
364 Self {
365 a : self.a.blend(t.a, f.a),
366 b : self.b.blend(t.b, f.b),
367 }
368 }
369 }
370 }
371
372 #[inline]
373 #[must_use]
374 pub fn abs(self) -> Self {
375 pick! {
376 if #[cfg(target_feature="avx512f")] {
377 let non_sign_bits = f32x16::from(f32::from_bits(i32::MAX as u32));
378 self & non_sign_bits
379 } else {
380 Self {
381 a : self.a.abs(),
382 b : self.b.abs(),
383 }
384 }
385 }
386 }
387
388 #[inline]
389 #[must_use]
390 pub fn floor(self) -> Self {
391 pick! {
392 if #[cfg(target_feature="avx512f")] {
393 Self { avx512: round_m512::<{round_op!(NegInf)}>(self.avx512) }
394 } else {
395 Self {
396 a : self.a.floor(),
397 b : self.b.floor(),
398 }
399 }
400 }
401 }
402
403 #[inline]
404 #[must_use]
405 pub fn ceil(self) -> Self {
406 pick! {
407 if #[cfg(target_feature="avx512f")] {
408 Self { avx512: round_m512::<{round_op!(PosInf)}>(self.avx512) }
409 } else {
410 Self {
411 a : self.a.ceil(),
412 b : self.b.ceil(),
413 }
414 }
415 }
416 }
417
418 #[inline]
422 #[must_use]
423 pub fn fast_max(self, rhs: Self) -> Self {
424 pick! {
425 if #[cfg(target_feature="avx512f")] {
426 Self { avx512: max_m512(self.avx512, rhs.avx512) }
427 } else {
428 Self {
429 a: self.a.fast_max(rhs.a),
430 b: self.b.fast_max(rhs.b),
431 }
432 }
433 }
434 }
435
436 #[inline]
437 #[must_use]
438 pub fn max(self, rhs: Self) -> Self {
439 pick! {
440 if #[cfg(target_feature="avx512f")] {
441 rhs.is_nan().blend(self, Self { avx512: max_m512(self.avx512, rhs.avx512) })
445 } else {
446 Self {
447 a: self.a.max(rhs.a),
448 b: self.b.max(rhs.b),
449 }
450 }
451 }
452 }
453
454 #[inline]
458 #[must_use]
459 pub fn fast_min(self, rhs: Self) -> Self {
460 pick! {
461 if #[cfg(target_feature="avx512f")] {
462 Self { avx512: min_m512(self.avx512, rhs.avx512) }
463 } else {
464 Self {
465 a: self.a.fast_min(rhs.a),
466 b: self.b.fast_min(rhs.b),
467 }
468 }
469 }
470 }
471
472 #[inline]
473 #[must_use]
474 pub fn min(self, rhs: Self) -> Self {
475 pick! {
476 if #[cfg(target_feature="avx512f")] {
477 rhs.is_nan().blend(self, Self { avx512: min_m512(self.avx512, rhs.avx512) })
481 } else {
482 Self {
483 a: self.a.min(rhs.a),
484 b: self.b.min(rhs.b),
485 }
486 }
487 }
488 }
489
490 #[inline]
491 #[must_use]
492 pub fn is_nan(self) -> Self {
493 pick! {
494 if #[cfg(target_feature = "avx512f")] {
495 Self { avx512: cmp_op_mask_m512::<{cmp_op!(Unordered)}>(self.avx512, self.avx512) }
496 } else {
497 Self {
498 a: self.a.is_nan(),
499 b: self.b.is_nan(),
500 }
501 }
502 }
503 }
504
505 #[inline]
506 #[must_use]
507 pub fn is_finite(self) -> Self {
508 let shifted_exp_mask = u32x16::splat(0xFF000000);
509 let u: u32x16 = cast(self);
510 let shift_u = u << 1_u32;
511 let out = !(shift_u & shifted_exp_mask).simd_eq(shifted_exp_mask);
512 cast(out)
513 }
514
515 #[inline]
516 #[must_use]
517 pub fn is_inf(self) -> Self {
518 let shifted_inf = u32x16::from(0xFF000000);
519 let u: u32x16 = cast(self);
520 let shift_u = u << 1_u64;
521 let out = (shift_u).simd_eq(shifted_inf);
522 cast(out)
523 }
524
525 #[inline]
526 #[must_use]
527 pub fn round(self) -> Self {
528 pick! {
529 if #[cfg(target_feature="avx512f")] {
530 Self { avx512: round_m512::<{round_op!(Nearest)}>(self.avx512) }
531 } else {
532 Self {
533 a: self.a.round(),
534 b: self.b.round(),
535 }
536 }
537 }
538 }
539
540 #[inline]
544 #[must_use]
545 pub fn fast_round_int(self) -> i32x16 {
546 pick! {
547 if #[cfg(target_feature="avx512f")] {
548 cast(convert_to_i32_m512i_from_m512(self.avx512))
549 } else {
550 i32x16 {
551 a: self.a.fast_round_int(),
552 b: self.b.fast_round_int(),
553 }
554 }
555 }
556 }
557
558 #[inline]
562 #[must_use]
563 pub fn round_int(self) -> i32x16 {
564 pick! {
565 if #[cfg(target_feature="avx512f")] {
566 let non_nan_mask = self.simd_eq(self);
568 let non_nan = self & non_nan_mask;
569 let flip_to_max: i32x16 = cast(self.simd_ge(Self::splat(2147483648.0)));
570 let cast: i32x16 = cast(convert_to_i32_m512i_from_m512(non_nan.avx512));
571 flip_to_max ^ cast
572 } else {
573 i32x16 {
574 a: self.a.round_int(),
575 b: self.b.round_int(),
576 }
577 }
578 }
579 }
580
581 #[inline]
585 #[must_use]
586 pub fn fast_trunc_int(self) -> i32x16 {
587 pick! {
588 if #[cfg(all(target_feature="avx512f"))] {
589 cast(convert_truncate_m512_i32_m512i(self.avx512))
590 } else {
591 cast([
592 self.a.fast_trunc_int(),
593 self.b.fast_trunc_int(),
594 ])
595 }
596 }
597 }
598
599 #[inline]
603 #[must_use]
604 pub fn trunc_int(self) -> i32x16 {
605 pick! {
606 if #[cfg(target_feature="avx512f")] {
607 let non_nan_mask = self.simd_eq(self);
609 let non_nan = self & non_nan_mask;
610 let flip_to_max: i32x16 = cast(self.simd_ge(Self::splat(2147483648.0)));
611 let cast: i32x16 = cast(convert_truncate_m512_i32_m512i(non_nan.avx512));
612 flip_to_max ^ cast
613 } else {
614 cast([
615 self.a.trunc_int(),
616 self.b.trunc_int(),
617 ])
618 }
619 }
620 }
621
622 #[inline]
648 #[must_use]
649 pub fn mul_add(self, m: Self, a: Self) -> Self {
650 pick! {
651 if #[cfg(all(target_feature="avx512f",target_feature="fma"))] {
652 Self { avx512: fused_mul_add_m512(self.avx512, m.avx512, a.avx512) }
653 } else if #[cfg(target_feature="avx512f")] {
654 (self * m) + a
656 } else {
657 Self {
658 a: self.a.mul_add(m.a, a.a),
659 b: self.b.mul_add(m.b, a.b),
660 }
661 }
662 }
663 }
664
665 #[inline]
691 #[must_use]
692 pub fn mul_sub(self, m: Self, s: Self) -> Self {
693 pick! {
694 if #[cfg(all(target_feature="avx512f",target_feature="fma"))] {
695 Self { avx512: fused_mul_sub_m512(self.avx512, m.avx512, s.avx512) }
696 } else if #[cfg(target_feature="avx512f")] {
697 (self * m) - s
699 } else {
700 Self {
701 a: self.a.mul_sub(m.a, s.a),
702 b: self.b.mul_sub(m.b, s.b),
703 }
704 }
705 }
706 }
707
708 #[inline]
734 #[must_use]
735 pub fn mul_neg_add(self, m: Self, a: Self) -> Self {
736 pick! {
737 if #[cfg(all(target_feature="avx512f",target_feature="fma"))] {
738 Self { avx512: fused_mul_neg_add_m512(self.avx512, m.avx512, a.avx512) }
739 } else if #[cfg(target_feature="avx512f")] {
740 a - (self * m)
742 } else {
743 Self {
744 a: self.a.mul_neg_add(m.a, a.a),
745 b: self.b.mul_neg_add(m.b, a.b),
746 }
747 }
748 }
749 }
750
751 #[inline]
777 #[must_use]
778 pub fn mul_neg_sub(self, m: Self, s: Self) -> Self {
779 pick! {
780 if #[cfg(all(target_feature="avx512f",target_feature="fma"))] {
781 Self { avx512: fused_mul_neg_sub_m512(self.avx512, m.avx512, s.avx512) }
782 } else if #[cfg(target_feature="avx512f")] {
783 -(self * m) - s
785 } else {
786 Self {
787 a: self.a.mul_neg_sub(m.a, s.a),
788 b: self.b.mul_neg_sub(m.b, s.b),
789 }
790 }
791 }
792 }
793
794 #[inline]
795 #[must_use]
796 pub fn flip_signs(self, signs: Self) -> Self {
797 self ^ (signs & Self::from(-0.0))
798 }
799
800 #[inline]
801 #[must_use]
802 pub fn copysign(self, sign: Self) -> Self {
803 let magnitude_mask = Self::from(f32::from_bits(u32::MAX >> 1));
804 (self & magnitude_mask) | (sign & Self::from(-0.0))
805 }
806
807 #[inline]
808 pub fn asin_acos(self) -> (Self, Self) {
809 const_f32_as_f32x16!(P4asinf, 4.2163199048E-2);
812 const_f32_as_f32x16!(P3asinf, 2.4181311049E-2);
813 const_f32_as_f32x16!(P2asinf, 4.5470025998E-2);
814 const_f32_as_f32x16!(P1asinf, 7.4953002686E-2);
815 const_f32_as_f32x16!(P0asinf, 1.6666752422E-1);
816
817 let xa = self.abs();
818 let big = xa.simd_ge(f32x16::splat(0.5));
819
820 let x1 = f32x16::splat(0.5) * (f32x16::ONE - xa);
821 let x2 = xa * xa;
822 let x3 = big.blend(x1, x2);
823
824 let xb = x1.sqrt();
825
826 let x4 = big.blend(xb, xa);
827
828 let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
829 let z = z.mul_add(x3 * x4, x4);
830
831 let z1 = z + z;
832
833 let z3 = self.simd_lt(f32x16::ZERO).blend(f32x16::PI - z1, z1);
835 let z4 = f32x16::FRAC_PI_2 - z.flip_signs(self);
836 let acos = big.blend(z3, z4);
837
838 let z3 = f32x16::FRAC_PI_2 - z1;
840 let asin = big.blend(z3, z);
841 let asin = asin.flip_signs(self);
842
843 (asin, acos)
844 }
845
846 #[inline]
847 #[must_use]
848 pub fn asin(self) -> Self {
849 const_f32_as_f32x16!(P4asinf, 4.2163199048E-2);
852 const_f32_as_f32x16!(P3asinf, 2.4181311049E-2);
853 const_f32_as_f32x16!(P2asinf, 4.5470025998E-2);
854 const_f32_as_f32x16!(P1asinf, 7.4953002686E-2);
855 const_f32_as_f32x16!(P0asinf, 1.6666752422E-1);
856
857 let xa = self.abs();
858 let big = xa.simd_ge(f32x16::splat(0.5));
859
860 let x1 = f32x16::splat(0.5) * (f32x16::ONE - xa);
861 let x2 = xa * xa;
862 let x3 = big.blend(x1, x2);
863
864 let xb = x1.sqrt();
865
866 let x4 = big.blend(xb, xa);
867
868 let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
869 let z = z.mul_add(x3 * x4, x4);
870
871 let z1 = z + z;
872
873 let z3 = f32x16::FRAC_PI_2 - z1;
875 let asin = big.blend(z3, z);
876 let asin = asin.flip_signs(self);
877
878 asin
879 }
880
881 #[inline]
882 #[must_use]
883 pub fn acos(self) -> Self {
884 const_f32_as_f32x16!(P4asinf, 4.2163199048E-2);
887 const_f32_as_f32x16!(P3asinf, 2.4181311049E-2);
888 const_f32_as_f32x16!(P2asinf, 4.5470025998E-2);
889 const_f32_as_f32x16!(P1asinf, 7.4953002686E-2);
890 const_f32_as_f32x16!(P0asinf, 1.6666752422E-1);
891
892 let xa = self.abs();
893 let big = xa.simd_ge(f32x16::splat(0.5));
894
895 let x1 = f32x16::splat(0.5) * (f32x16::ONE - xa);
896 let x2 = xa * xa;
897 let x3 = big.blend(x1, x2);
898
899 let xb = x1.sqrt();
900
901 let x4 = big.blend(xb, xa);
902
903 let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
904 let z = z.mul_add(x3 * x4, x4);
905
906 let z1 = z + z;
907
908 let z3 = self.simd_lt(f32x16::ZERO).blend(f32x16::PI - z1, z1);
910 let z4 = f32x16::FRAC_PI_2 - z.flip_signs(self);
911 let acos = big.blend(z3, z4);
912
913 acos
914 }
915
916 #[inline]
917 pub fn atan(self) -> Self {
918 const_f32_as_f32x16!(P3atanf, 8.05374449538E-2);
921 const_f32_as_f32x16!(P2atanf, -1.38776856032E-1);
922 const_f32_as_f32x16!(P1atanf, 1.99777106478E-1);
923 const_f32_as_f32x16!(P0atanf, -3.33329491539E-1);
924
925 let t = self.abs();
926
927 let notsmal = t.simd_ge(Self::SQRT_2 - Self::ONE);
931 let notbig = t.simd_le(Self::SQRT_2 + Self::ONE);
932
933 let mut s = notbig.blend(Self::FRAC_PI_4, Self::FRAC_PI_2);
934 s = notsmal & s;
935
936 let mut a = notbig & t;
937 a = notsmal.blend(a - Self::ONE, a);
938 let mut b = notbig & Self::ONE;
939 b = notsmal.blend(b + t, b);
940 let z = a / b;
941
942 let zz = z * z;
943
944 let mut re = polynomial_3!(zz, P0atanf, P1atanf, P2atanf, P3atanf);
946 re = re.mul_add(zz * z, z) + s;
947
948 re = (self.sign_bit()).blend(-re, re);
950
951 re
952 }
953
954 #[inline]
955 pub fn atan2(self, x: Self) -> Self {
956 const_f32_as_f32x16!(P3atanf, 8.05374449538E-2);
959 const_f32_as_f32x16!(P2atanf, -1.38776856032E-1);
960 const_f32_as_f32x16!(P1atanf, 1.99777106478E-1);
961 const_f32_as_f32x16!(P0atanf, -3.33329491539E-1);
962
963 let y = self;
964
965 let x1 = x.abs();
967 let y1 = y.abs();
968 let swapxy = y1.simd_gt(x1);
969 let mut x2 = swapxy.blend(y1, x1);
971 let mut y2 = swapxy.blend(x1, y1);
972
973 let both_infinite = x.is_inf() & y.is_inf();
975 if both_infinite.any() {
976 let minus_one = -Self::ONE;
977 x2 = both_infinite.blend(x2 & minus_one, x2);
978 y2 = both_infinite.blend(y2 & minus_one, y2);
979 }
980
981 let t = y2 / x2;
983
984 let notsmal = t.simd_ge(Self::SQRT_2 - Self::ONE);
987
988 let a = notsmal.blend(t - Self::ONE, t);
989 let b = notsmal.blend(t + Self::ONE, Self::ONE);
990 let s = notsmal & Self::FRAC_PI_4;
991 let z = a / b;
992
993 let zz = z * z;
994
995 let mut re = polynomial_3!(zz, P0atanf, P1atanf, P2atanf, P3atanf);
997 re = re.mul_add(zz * z, z) + s;
998
999 re = swapxy.blend(Self::FRAC_PI_2 - re, re);
1001 re = ((x | y).simd_eq(Self::ZERO)).blend(Self::ZERO, re);
1002 re = (x.sign_bit()).blend(Self::PI - re, re);
1003
1004 re = (y.sign_bit()).blend(-re, re);
1006
1007 re
1008 }
1009
1010 #[inline]
1011 #[must_use]
1012 pub fn sin_cos(self) -> (Self, Self) {
1013 const_f32_as_f32x16!(DP1F, 0.78515625_f32 * 2.0);
1017 const_f32_as_f32x16!(DP2F, 2.4187564849853515625E-4_f32 * 2.0);
1018 const_f32_as_f32x16!(DP3F, 3.77489497744594108E-8_f32 * 2.0);
1019
1020 const_f32_as_f32x16!(P0sinf, -1.6666654611E-1);
1021 const_f32_as_f32x16!(P1sinf, 8.3321608736E-3);
1022 const_f32_as_f32x16!(P2sinf, -1.9515295891E-4);
1023
1024 const_f32_as_f32x16!(P0cosf, 4.166664568298827E-2);
1025 const_f32_as_f32x16!(P1cosf, -1.388731625493765E-3);
1026 const_f32_as_f32x16!(P2cosf, 2.443315711809948E-5);
1027
1028 const_f32_as_f32x16!(TWO_OVER_PI, 2.0 / core::f32::consts::PI);
1029
1030 let xa = self.abs();
1031
1032 let y = (xa * TWO_OVER_PI).round();
1034 let q: i32x16 = y.round_int();
1035
1036 let x = y.mul_neg_add(DP3F, y.mul_neg_add(DP2F, y.mul_neg_add(DP1F, xa)));
1037
1038 let x2 = x * x;
1039 let mut s = polynomial_2!(x2, P0sinf, P1sinf, P2sinf) * (x * x2) + x;
1040 let mut c = polynomial_2!(x2, P0cosf, P1cosf, P2cosf) * (x2 * x2)
1041 + f32x16::from(0.5).mul_neg_add(x2, f32x16::from(1.0));
1042
1043 let swap = !(q & i32x16::from(1)).simd_eq(i32x16::from(0));
1044
1045 let mut overflow: f32x16 = cast(q.simd_gt(i32x16::from(0x2000000)));
1046 overflow &= xa.is_finite();
1047 s = overflow.blend(f32x16::from(0.0), s);
1048 c = overflow.blend(f32x16::from(1.0), c);
1049
1050 let mut sin1 = cast::<_, f32x16>(swap).blend(c, s);
1052 let sign_sin: i32x16 = (q << 30) ^ cast::<_, i32x16>(self);
1053 sin1 = sin1.flip_signs(cast(sign_sin));
1054
1055 let mut cos1 = cast::<_, f32x16>(swap).blend(s, c);
1057 let sign_cos: i32x16 = ((q + i32x16::from(1)) & i32x16::from(2)) << 30;
1058 cos1 ^= cast::<_, f32x16>(sign_cos);
1059
1060 (sin1, cos1)
1061 }
1062
1063 #[inline]
1064 #[must_use]
1065 pub fn sin(self) -> Self {
1066 let (s, _) = self.sin_cos();
1067 s
1068 }
1069
1070 #[inline]
1071 #[must_use]
1072 pub fn cos(self) -> Self {
1073 let (_, c) = self.sin_cos();
1074 c
1075 }
1076
1077 #[inline]
1078 #[must_use]
1079 pub fn tan(self) -> Self {
1080 let (s, c) = self.sin_cos();
1081 s / c
1082 }
1083
1084 #[inline]
1085 #[must_use]
1086 pub fn to_degrees(self) -> Self {
1087 const_f32_as_f32x16!(RAD_TO_DEG_RATIO, 180.0_f32 / core::f32::consts::PI);
1088 self * RAD_TO_DEG_RATIO
1089 }
1090
1091 #[inline]
1092 #[must_use]
1093 pub fn to_radians(self) -> Self {
1094 const_f32_as_f32x16!(DEG_TO_RAD_RATIO, core::f32::consts::PI / 180.0_f32);
1095 self * DEG_TO_RAD_RATIO
1096 }
1097
1098 #[inline]
1099 #[must_use]
1100 pub fn recip(self) -> Self {
1101 pick! {
1102 if #[cfg(target_feature="avx512f")] {
1103 1.0 / self
1107 } else {
1108 Self {
1109 a : self.a.recip(),
1110 b : self.b.recip(),
1111 }
1112 }
1113 }
1114 }
1115
1116 #[inline]
1117 #[must_use]
1118 pub fn recip_sqrt(self) -> Self {
1119 pick! {
1120 if #[cfg(target_feature="avx512f")] {
1121 self.sqrt().recip()
1125 } else {
1126 Self {
1127 a : self.a.recip_sqrt(),
1128 b : self.b.recip_sqrt(),
1129 }
1130 }
1131 }
1132 }
1133
1134 #[inline]
1135 #[must_use]
1136 pub fn sqrt(self) -> Self {
1137 pick! {
1138 if #[cfg(target_feature="avx512f")] {
1139 Self { avx512: sqrt_m512(self.avx512) }
1140 } else {
1141 Self {
1142 a : self.a.sqrt(),
1143 b : self.b.sqrt(),
1144 }
1145 }
1146 }
1147 }
1148
1149 #[inline]
1150 #[must_use]
1151 #[doc(alias("movemask", "move_mask"))]
1152 pub fn to_bitmask(self) -> u32 {
1153 pick! {
1154 if #[cfg(target_feature="avx512f")] {
1155 movepi32_mask_m512(self.avx512) as u32
1156 } else {
1157 (self.b.to_bitmask() << 8) | self.a.to_bitmask()
1158 }
1159 }
1160 }
1161
1162 #[inline]
1163 #[must_use]
1164 pub fn any(self) -> bool {
1165 pick! {
1166 if #[cfg(target_feature="avx512f")] {
1167 movepi32_mask_m512(self.avx512) != 0
1168 } else {
1169 self.a.any() || self.b.any()
1170 }
1171 }
1172 }
1173
1174 #[inline]
1175 #[must_use]
1176 pub fn all(self) -> bool {
1177 pick! {
1178 if #[cfg(target_feature="avx512f")] {
1179 movepi32_mask_m512(self.avx512) == !0_u16
1180 } else {
1181 self.a.all() && self.b.all()
1182 }
1183 }
1184 }
1185
1186 #[inline]
1187 #[must_use]
1188 pub fn none(self) -> bool {
1189 !self.any()
1190 }
1191
1192 #[inline]
1193 fn vm_pow2n(self) -> Self {
1194 const_f32_as_f32x16!(pow2_23, 8388608.0);
1195 const_f32_as_f32x16!(bias, 127.0);
1196 let a = self + (bias + pow2_23);
1197 let c = cast::<_, i32x16>(a) << 23;
1198 cast::<_, f32x16>(c)
1199 }
1200
1201 #[inline]
1203 #[must_use]
1204 pub fn exp(self) -> Self {
1205 const_f32_as_f32x16!(P0, 1.0 / 2.0);
1206 const_f32_as_f32x16!(P1, 1.0 / 6.0);
1207 const_f32_as_f32x16!(P2, 1. / 24.);
1208 const_f32_as_f32x16!(P3, 1. / 120.);
1209 const_f32_as_f32x16!(P4, 1. / 720.);
1210 const_f32_as_f32x16!(P5, 1. / 5040.);
1211 const_f32_as_f32x16!(LN2D_HI, 0.693359375);
1212 const_f32_as_f32x16!(LN2D_LO, -2.12194440e-4);
1213 let max_x = f32x16::from(87.3);
1214 let r = (self * Self::LOG2_E).round();
1215 let x = r.mul_neg_add(LN2D_HI, self);
1216 let x = r.mul_neg_add(LN2D_LO, x);
1217 let z = polynomial_5!(x, P0, P1, P2, P3, P4, P5);
1218 let x2 = x * x;
1219 let z = z.mul_add(x2, x);
1220 let n2 = Self::vm_pow2n(r);
1221 let z = (z + Self::ONE) * n2;
1222 let in_range = self.abs().simd_lt(max_x);
1224 let in_range = in_range & self.is_finite();
1225 in_range.blend(z, Self::ZERO)
1226 }
1227
1228 #[inline]
1229 fn exponent(self) -> Self {
1230 const_f32_as_f32x16!(pow2_23, 8388608.0);
1231 const_f32_as_f32x16!(bias, 127.0);
1232 let a = cast::<_, u32x16>(self);
1233 let b = a >> 23;
1234 let c = b | cast::<_, u32x16>(pow2_23);
1235 let d = cast::<_, f32x16>(c);
1236 let e = d - (pow2_23 + bias);
1237 e
1238 }
1239
1240 #[inline]
1241 fn fraction_2(self) -> Self {
1242 let t1 = cast::<_, u32x16>(self);
1243 let t2 = cast::<_, u32x16>(
1244 (t1 & u32x16::from(0x007FFFFF)) | u32x16::from(0x3F000000),
1245 );
1246 cast::<_, f32x16>(t2)
1247 }
1248
1249 #[inline]
1250 fn is_zero_or_subnormal(self) -> Self {
1251 let t = cast::<_, i32x16>(self);
1252 let t = t & i32x16::splat(0x7F800000);
1253 i32x16::round_float(t.simd_eq(i32x16::splat(0)))
1254 }
1255
1256 #[inline]
1257 fn infinity() -> Self {
1258 cast::<_, f32x16>(i32x16::splat(0x7F800000))
1259 }
1260
1261 #[inline]
1262 fn nan_log() -> Self {
1263 cast::<_, f32x16>(i32x16::splat(0x7FC00000 | 0x101 & 0x003FFFFF))
1264 }
1265
1266 #[inline]
1267 fn nan_pow() -> Self {
1268 cast::<_, f32x16>(i32x16::splat(0x7FC00000 | 0x101 & 0x003FFFFF))
1269 }
1270
1271 #[inline]
1272 pub fn sign_bit(self) -> Self {
1273 let t1 = cast::<_, i32x16>(self);
1274 let t2 = t1 >> 31;
1275 !cast::<_, f32x16>(t2).simd_eq(f32x16::ZERO)
1276 }
1277
1278 #[inline]
1280 #[must_use]
1281 pub fn reduce_add(self) -> f32 {
1282 pick! {
1283 if #[cfg(target_feature="avx512f")]{
1284 reduce_add_m512(self.avx512)
1285 } else {
1286 self.a.reduce_add() + self.b.reduce_add()
1287 }
1288 }
1289 }
1290
1291 #[inline]
1293 #[must_use]
1294 pub fn ln(self) -> Self {
1295 const_f32_as_f32x16!(HALF, 0.5);
1296 const_f32_as_f32x16!(P0, 3.3333331174E-1);
1297 const_f32_as_f32x16!(P1, -2.4999993993E-1);
1298 const_f32_as_f32x16!(P2, 2.0000714765E-1);
1299 const_f32_as_f32x16!(P3, -1.6668057665E-1);
1300 const_f32_as_f32x16!(P4, 1.4249322787E-1);
1301 const_f32_as_f32x16!(P5, -1.2420140846E-1);
1302 const_f32_as_f32x16!(P6, 1.1676998740E-1);
1303 const_f32_as_f32x16!(P7, -1.1514610310E-1);
1304 const_f32_as_f32x16!(P8, 7.0376836292E-2);
1305 const_f32_as_f32x16!(LN2F_HI, 0.693359375);
1306 const_f32_as_f32x16!(LN2F_LO, -2.12194440e-4);
1307 const_f32_as_f32x16!(VM_SMALLEST_NORMAL, 1.17549435E-38);
1308
1309 let x1 = self;
1310 let x = Self::fraction_2(x1);
1311 let e = Self::exponent(x1);
1312 let mask = x.simd_gt(Self::SQRT_2 * HALF);
1313 let x = (!mask).blend(x + x, x);
1314 let fe = mask.blend(e + Self::ONE, e);
1315 let x = x - Self::ONE;
1316 let res = polynomial_8!(x, P0, P1, P2, P3, P4, P5, P6, P7, P8);
1317 let x2 = x * x;
1318 let res = x2 * x * res;
1319 let res = fe.mul_add(LN2F_LO, res);
1320 let res = res + x2.mul_neg_add(HALF, x);
1321 let res = fe.mul_add(LN2F_HI, res);
1322 let overflow = !self.is_finite();
1323 let underflow = x1.simd_lt(VM_SMALLEST_NORMAL);
1324 let mask = overflow | underflow;
1325 if !mask.any() {
1326 res
1327 } else {
1328 let is_zero = self.is_zero_or_subnormal();
1329 let res = underflow.blend(Self::nan_log(), res);
1330 let res = is_zero.blend(Self::infinity(), res);
1331 let res = overflow.blend(self, res);
1332 res
1333 }
1334 }
1335
1336 #[inline]
1337 #[must_use]
1338 pub fn log2(self) -> Self {
1339 Self::ln(self) * Self::LOG2_E
1340 }
1341
1342 #[inline]
1343 #[must_use]
1344 pub fn log10(self) -> Self {
1345 Self::ln(self) * Self::LOG10_E
1346 }
1347
1348 #[inline]
1349 #[must_use]
1350 pub fn pow_f32x16(self, y: Self) -> Self {
1351 const_f32_as_f32x16!(ln2f_hi, 0.693359375);
1352 const_f32_as_f32x16!(ln2f_lo, -2.12194440e-4);
1353 const_f32_as_f32x16!(P0logf, 3.3333331174E-1);
1354 const_f32_as_f32x16!(P1logf, -2.4999993993E-1);
1355 const_f32_as_f32x16!(P2logf, 2.0000714765E-1);
1356 const_f32_as_f32x16!(P3logf, -1.6668057665E-1);
1357 const_f32_as_f32x16!(P4logf, 1.4249322787E-1);
1358 const_f32_as_f32x16!(P5logf, -1.2420140846E-1);
1359 const_f32_as_f32x16!(P6logf, 1.1676998740E-1);
1360 const_f32_as_f32x16!(P7logf, -1.1514610310E-1);
1361 const_f32_as_f32x16!(P8logf, 7.0376836292E-2);
1362
1363 const_f32_as_f32x16!(p2expf, 1.0 / 2.0); const_f32_as_f32x16!(p3expf, 1.0 / 6.0);
1365 const_f32_as_f32x16!(p4expf, 1.0 / 24.0);
1366 const_f32_as_f32x16!(p5expf, 1.0 / 120.0);
1367 const_f32_as_f32x16!(p6expf, 1.0 / 720.0);
1368 const_f32_as_f32x16!(p7expf, 1.0 / 5040.0);
1369
1370 let x1 = self.abs();
1371 let x = x1.fraction_2();
1372 let mask = x.simd_gt(f32x16::SQRT_2 * f32x16::HALF);
1373 let x = (!mask).blend(x + x, x);
1374
1375 let x = x - f32x16::ONE;
1376 let x2 = x * x;
1377 let lg1 = polynomial_8!(
1378 x, P0logf, P1logf, P2logf, P3logf, P4logf, P5logf, P6logf, P7logf, P8logf
1379 );
1380 let lg1 = lg1 * x2 * x;
1381
1382 let ef = x1.exponent();
1383 let ef = mask.blend(ef + f32x16::ONE, ef);
1384 let e1 = (ef * y).round();
1385 let yr = ef.mul_sub(y, e1);
1386
1387 let lg = f32x16::HALF.mul_neg_add(x2, x) + lg1;
1388 let x2_err = (f32x16::HALF * x).mul_sub(x, f32x16::HALF * x2);
1389 let lg_err = f32x16::HALF.mul_add(x2, lg - x) - lg1;
1390
1391 let e2 = (lg * y * f32x16::LOG2_E).round();
1392 let v = lg.mul_sub(y, e2 * ln2f_hi);
1393 let v = e2.mul_neg_add(ln2f_lo, v);
1394 let v = v - (lg_err + x2_err).mul_sub(y, yr * f32x16::LN_2);
1395
1396 let x = v;
1397 let e3 = (x * f32x16::LOG2_E).round();
1398 let x = e3.mul_neg_add(f32x16::LN_2, x);
1399 let x2 = x * x;
1400 let z = x2.mul_add(
1401 polynomial_5!(x, p2expf, p3expf, p4expf, p5expf, p6expf, p7expf),
1402 x + f32x16::ONE,
1403 );
1404
1405 let ee = e1 + e2 + e3;
1406 let ei = cast::<_, i32x16>(ee.round_int());
1407 let ej = cast::<_, i32x16>(ei + (cast::<_, i32x16>(z) >> 23));
1408
1409 let overflow = cast::<_, f32x16>(ej.simd_gt(i32x16::splat(0x0FF)))
1410 | (ee.simd_gt(f32x16::splat(300.0)));
1411 let underflow = cast::<_, f32x16>(ej.simd_lt(i32x16::splat(0x000)))
1412 | (ee.simd_lt(f32x16::splat(-300.0)));
1413
1414 let z = cast::<_, f32x16>(cast::<_, i32x16>(z) + (ei << 23));
1416 let z = underflow.blend(f32x16::ZERO, z);
1418 let z = overflow.blend(Self::infinity(), z);
1419
1420 let x_zero = self.is_zero_or_subnormal();
1422 let z = x_zero.blend(
1423 y.simd_lt(f32x16::ZERO).blend(
1424 Self::infinity(),
1425 y.simd_eq(f32x16::ZERO).blend(f32x16::ONE, f32x16::ZERO),
1426 ),
1427 z,
1428 );
1429
1430 let x_sign = self.sign_bit();
1431 let z = if x_sign.any() {
1432 let yi = y.simd_eq(y.round());
1434
1435 let y_odd = cast::<_, i32x16>(y.round_int() << 31).round_float();
1437
1438 let z1 =
1439 yi.blend(z | y_odd, self.simd_eq(Self::ZERO).blend(z, Self::nan_pow()));
1440
1441 x_sign.blend(z1, z)
1442 } else {
1443 z
1444 };
1445
1446 let x_finite = self.is_finite();
1447 let y_finite = y.is_finite();
1448 let e_finite = ee.is_finite();
1449 if (x_finite & y_finite & (e_finite | x_zero)).all() {
1450 return z;
1451 }
1452
1453 (self.is_nan() | y.is_nan()).blend(self + y, z)
1454 }
1455
1456 #[inline]
1457 pub fn powf(self, y: f32) -> Self {
1458 Self::pow_f32x16(self, f32x16::splat(y))
1459 }
1460
1461 #[must_use]
1463 #[inline]
1464 pub fn transpose(data: [f32x16; 16]) -> [f32x16; 16] {
1465 #[inline(always)]
1469 fn transpose_column(data: &[f32x16; 16], index: usize) -> f32x16 {
1470 f32x16::new([
1471 data[0].as_array()[index],
1472 data[1].as_array()[index],
1473 data[2].as_array()[index],
1474 data[3].as_array()[index],
1475 data[4].as_array()[index],
1476 data[5].as_array()[index],
1477 data[6].as_array()[index],
1478 data[7].as_array()[index],
1479 data[8].as_array()[index],
1480 data[9].as_array()[index],
1481 data[10].as_array()[index],
1482 data[11].as_array()[index],
1483 data[12].as_array()[index],
1484 data[13].as_array()[index],
1485 data[14].as_array()[index],
1486 data[15].as_array()[index],
1487 ])
1488 }
1489
1490 [
1491 transpose_column(&data, 0),
1492 transpose_column(&data, 1),
1493 transpose_column(&data, 2),
1494 transpose_column(&data, 3),
1495 transpose_column(&data, 4),
1496 transpose_column(&data, 5),
1497 transpose_column(&data, 6),
1498 transpose_column(&data, 7),
1499 transpose_column(&data, 8),
1500 transpose_column(&data, 9),
1501 transpose_column(&data, 10),
1502 transpose_column(&data, 11),
1503 transpose_column(&data, 12),
1504 transpose_column(&data, 13),
1505 transpose_column(&data, 14),
1506 transpose_column(&data, 15),
1507 ]
1508 }
1509
1510 #[inline]
1511 #[must_use]
1512 pub fn to_array(self) -> [f32; 16] {
1513 cast(self)
1514 }
1515
1516 #[inline]
1517 #[must_use]
1518 pub fn as_array(&self) -> &[f32; 16] {
1519 cast_ref(self)
1520 }
1521
1522 #[inline]
1523 #[must_use]
1524 pub fn as_mut_array(&mut self) -> &mut [f32; 16] {
1525 cast_mut(self)
1526 }
1527
1528 #[inline]
1529 pub fn from_i32x16(v: i32x16) -> Self {
1530 pick! {
1531 if #[cfg(target_feature="avx512f")] {
1532 Self { avx512: convert_to_m512_from_i32_m512i(v.avx512) }
1533 } else {
1534 Self {
1535 a: f32x8::from_i32x8(v.a),
1536 b: f32x8::from_i32x8(v.b),
1537 }
1538 }
1539 }
1540 }
1541}
1542
1543impl Not for f32x16 {
1544 type Output = Self;
1545 #[inline]
1546 fn not(self) -> Self::Output {
1547 pick! {
1548 if #[cfg(target_feature="avx512f")] {
1549 Self { avx512: bitxor_m512(self.avx512, set_splat_m512(f32::from_bits(u32::MAX))) }
1550 } else {
1551 Self {
1552 a : self.a.not(),
1553 b : self.b.not(),
1554 }
1555 }
1556 }
1557 }
1558}