1use super::*;
2
pick! {
  if #[cfg(target_feature="sse")] {
    /// Four `f32` lanes. With SSE enabled the lanes are stored in a single
    /// `m128` and the struct is aligned to 16 bytes.
    #[derive(Default, Clone, Copy, PartialEq)]
    #[repr(C, align(16))]
    pub struct f32x4 { pub(crate) sse: m128 }
  } else if #[cfg(target_feature="simd128")] {
    use core::arch::wasm32::*;

    /// Four `f32` lanes. With WebAssembly `simd128` enabled this is a
    /// transparent wrapper around a `v128`.
    #[derive(Clone, Copy)]
    #[repr(transparent)]
    pub struct f32x4 { pub(crate) simd: v128 }

    impl Default for f32x4 {
      // The default value has every lane set to 0.0.
      fn default() -> Self {
        Self::splat(0.0)
      }
    }

    impl PartialEq for f32x4 {
      // Two vectors are equal only if the lane-wise float comparison holds
      // in all four lanes.
      fn eq(&self, other: &Self) -> bool {
        u32x4_all_true(f32x4_eq(self.simd, other.simd))
      }
    }
  } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
    use core::arch::aarch64::*;
    /// Four `f32` lanes. With NEON on aarch64 the lanes are stored in a
    /// `float32x4_t`.
    #[repr(C)]
    #[derive(Copy, Clone)]
    pub struct f32x4 { pub(crate) neon : float32x4_t }

    impl Default for f32x4 {
      // The default value has every lane set to 0.0.
      #[inline]
      fn default() -> Self {
        unsafe { Self { neon: vdupq_n_f32(0.0)} }
      }
    }

    impl PartialEq for f32x4 {
      // `vceqq_f32` produces an all-ones lane for every equal pair; the
      // minimum across the lanes is `u32::MAX` only when all four matched.
      #[inline]
      fn eq(&self, other: &Self) -> bool {
        unsafe { vminvq_u32(vceqq_f32(self.neon, other.neon))==u32::MAX }
      }

    }
  } else {
    /// Four `f32` lanes. Without a supported SIMD feature the lanes are a
    /// plain `[f32; 4]`, still aligned to 16 bytes.
    #[derive(Default, Clone, Copy, PartialEq)]
    #[repr(C, align(16))]
    pub struct f32x4 { pub(crate) arr: [f32;4] }
  }
}
52
/// Declares a `pub const` of type `f32x4` called `$name` in which all four
/// lanes hold `$value`.
///
/// The generated item allows non-upper-case names, so the macro can also be
/// used for locally scoped coefficient constants.
macro_rules! const_f32_as_f32x4 {
  ($name:ident, $value:expr) => {
    #[allow(non_upper_case_globals)]
    pub const $name: f32x4 = f32x4::new([$value; 4]);
  };
}
59
60impl f32x4 {
61 const_f32_as_f32x4!(ONE, 1.0);
62 const_f32_as_f32x4!(ZERO, 0.0);
63 const_f32_as_f32x4!(HALF, 0.5);
64 const_f32_as_f32x4!(E, core::f32::consts::E);
65 const_f32_as_f32x4!(FRAC_1_PI, core::f32::consts::FRAC_1_PI);
66 const_f32_as_f32x4!(FRAC_2_PI, core::f32::consts::FRAC_2_PI);
67 const_f32_as_f32x4!(FRAC_2_SQRT_PI, core::f32::consts::FRAC_2_SQRT_PI);
68 const_f32_as_f32x4!(FRAC_1_SQRT_2, core::f32::consts::FRAC_1_SQRT_2);
69 const_f32_as_f32x4!(FRAC_PI_2, core::f32::consts::FRAC_PI_2);
70 const_f32_as_f32x4!(FRAC_PI_3, core::f32::consts::FRAC_PI_3);
71 const_f32_as_f32x4!(FRAC_PI_4, core::f32::consts::FRAC_PI_4);
72 const_f32_as_f32x4!(FRAC_PI_6, core::f32::consts::FRAC_PI_6);
73 const_f32_as_f32x4!(FRAC_PI_8, core::f32::consts::FRAC_PI_8);
74 const_f32_as_f32x4!(LN_2, core::f32::consts::LN_2);
75 const_f32_as_f32x4!(LN_10, core::f32::consts::LN_10);
76 const_f32_as_f32x4!(LOG2_E, core::f32::consts::LOG2_E);
77 const_f32_as_f32x4!(LOG10_E, core::f32::consts::LOG10_E);
78 const_f32_as_f32x4!(LOG10_2, core::f32::consts::LOG10_2);
79 const_f32_as_f32x4!(LOG2_10, core::f32::consts::LOG2_10);
80 const_f32_as_f32x4!(PI, core::f32::consts::PI);
81 const_f32_as_f32x4!(SQRT_2, core::f32::consts::SQRT_2);
82 const_f32_as_f32x4!(TAU, core::f32::consts::TAU);
83}
84
// Marker impls that allow `f32x4` to be zero-initialised and bit-cast.
// NOTE(review): these rely on every backend representation of `f32x4`
// (`m128`, `v128`, `float32x4_t`, `[f32; 4]`) being plain float data with no
// padding and no invalid bit patterns — confirm for each backend type.
unsafe impl Zeroable for f32x4 {}
unsafe impl Pod for f32x4 {}
87
// Declares `f32` as the element type for the crate's `AlignTo` trait
// (the trait itself is defined elsewhere).
impl AlignTo for f32x4 {
  type Elem = f32;
}
91
92impl Add for f32x4 {
93 type Output = Self;
94 #[inline]
95 fn add(self, rhs: Self) -> Self::Output {
96 pick! {
97 if #[cfg(target_feature="sse")] {
98 Self { sse: add_m128(self.sse, rhs.sse) }
99 } else if #[cfg(target_feature="simd128")] {
100 Self { simd: f32x4_add(self.simd, rhs.simd) }
101 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
102 unsafe { Self { neon: vaddq_f32(self.neon, rhs.neon) } }
103 } else {
104 Self { arr: [
105 self.arr[0] + rhs.arr[0],
106 self.arr[1] + rhs.arr[1],
107 self.arr[2] + rhs.arr[2],
108 self.arr[3] + rhs.arr[3],
109 ]}
110 }
111 }
112 }
113}
114
115impl Sub for f32x4 {
116 type Output = Self;
117 #[inline]
118 fn sub(self, rhs: Self) -> Self::Output {
119 pick! {
120 if #[cfg(target_feature="sse")] {
121 Self { sse: sub_m128(self.sse, rhs.sse) }
122 } else if #[cfg(target_feature="simd128")] {
123 Self { simd: f32x4_sub(self.simd, rhs.simd) }
124 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
125 unsafe {Self { neon: vsubq_f32(self.neon, rhs.neon) }}
126 } else {
127 Self { arr: [
128 self.arr[0] - rhs.arr[0],
129 self.arr[1] - rhs.arr[1],
130 self.arr[2] - rhs.arr[2],
131 self.arr[3] - rhs.arr[3],
132 ]}
133 }
134 }
135 }
136}
137
138impl Mul for f32x4 {
139 type Output = Self;
140 #[inline]
141 fn mul(self, rhs: Self) -> Self::Output {
142 pick! {
143 if #[cfg(target_feature="sse")] {
144 Self { sse: mul_m128(self.sse, rhs.sse) }
145 } else if #[cfg(target_feature="simd128")] {
146 Self { simd: f32x4_mul(self.simd, rhs.simd) }
147 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
148 unsafe {Self { neon: vmulq_f32(self.neon, rhs.neon) }}
149 } else {
150 Self { arr: [
151 self.arr[0] * rhs.arr[0],
152 self.arr[1] * rhs.arr[1],
153 self.arr[2] * rhs.arr[2],
154 self.arr[3] * rhs.arr[3],
155 ]}
156 }
157 }
158 }
159}
160
161impl Div for f32x4 {
162 type Output = Self;
163 #[inline]
164 fn div(self, rhs: Self) -> Self::Output {
165 pick! {
166 if #[cfg(target_feature="sse")] {
167 Self { sse: div_m128(self.sse, rhs.sse) }
168 } else if #[cfg(target_feature="simd128")] {
169 Self { simd: f32x4_div(self.simd, rhs.simd) }
170 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
171 unsafe {Self { neon: vdivq_f32(self.neon, rhs.neon) }}
172 } else {
173 Self { arr: [
174 self.arr[0] / rhs.arr[0],
175 self.arr[1] / rhs.arr[1],
176 self.arr[2] / rhs.arr[2],
177 self.arr[3] / rhs.arr[3],
178 ]}
179 }
180 }
181 }
182}
183
184impl Neg for f32x4 {
185 type Output = Self;
186 #[inline]
187 fn neg(self) -> Self::Output {
188 pick! {
189 if #[cfg(target_feature="sse")] {
190 Self { sse: bitxor_m128(self.sse, Self::splat(-0.0).sse) }
191 } else if #[cfg(target_feature="simd128")] {
192 Self { simd: f32x4_neg(self.simd) }
193 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
194 unsafe {Self { neon: vnegq_f32(self.neon) }}
195 } else {
196 Self { arr: [
197 -self.arr[0],
198 -self.arr[1],
199 -self.arr[2],
200 -self.arr[3],
201 ]}
202 }
203 }
204 }
205}
206
207impl Add<f32> for f32x4 {
208 type Output = Self;
209 #[inline]
210 fn add(self, rhs: f32) -> Self::Output {
211 self.add(Self::splat(rhs))
212 }
213}
214
215impl Sub<f32> for f32x4 {
216 type Output = Self;
217 #[inline]
218 fn sub(self, rhs: f32) -> Self::Output {
219 self.sub(Self::splat(rhs))
220 }
221}
222
223impl Mul<f32> for f32x4 {
224 type Output = Self;
225 #[inline]
226 fn mul(self, rhs: f32) -> Self::Output {
227 self.mul(Self::splat(rhs))
228 }
229}
230
231impl Div<f32> for f32x4 {
232 type Output = Self;
233 #[inline]
234 fn div(self, rhs: f32) -> Self::Output {
235 self.div(Self::splat(rhs))
236 }
237}
238
239impl Add<f32x4> for f32 {
240 type Output = f32x4;
241 #[inline]
242 fn add(self, rhs: f32x4) -> Self::Output {
243 f32x4::splat(self).add(rhs)
244 }
245}
246
247impl Sub<f32x4> for f32 {
248 type Output = f32x4;
249 #[inline]
250 fn sub(self, rhs: f32x4) -> Self::Output {
251 f32x4::splat(self).sub(rhs)
252 }
253}
254
255impl Mul<f32x4> for f32 {
256 type Output = f32x4;
257 #[inline]
258 fn mul(self, rhs: f32x4) -> Self::Output {
259 f32x4::splat(self).mul(rhs)
260 }
261}
262
263impl Div<f32x4> for f32 {
264 type Output = f32x4;
265 #[inline]
266 fn div(self, rhs: f32x4) -> Self::Output {
267 f32x4::splat(self).div(rhs)
268 }
269}
270
271impl BitAnd for f32x4 {
272 type Output = Self;
273 #[inline]
274 fn bitand(self, rhs: Self) -> Self::Output {
275 pick! {
276 if #[cfg(target_feature="sse")] {
277 Self { sse: bitand_m128(self.sse, rhs.sse) }
278 } else if #[cfg(target_feature="simd128")] {
279 Self { simd: v128_and(self.simd, rhs.simd) }
280 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
281 unsafe {Self { neon: vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(self.neon), vreinterpretq_u32_f32(rhs.neon))) }}
282 } else {
283 Self { arr: [
284 f32::from_bits(self.arr[0].to_bits() & rhs.arr[0].to_bits()),
285 f32::from_bits(self.arr[1].to_bits() & rhs.arr[1].to_bits()),
286 f32::from_bits(self.arr[2].to_bits() & rhs.arr[2].to_bits()),
287 f32::from_bits(self.arr[3].to_bits() & rhs.arr[3].to_bits()),
288 ]}
289 }
290 }
291 }
292}
293
294impl BitOr for f32x4 {
295 type Output = Self;
296 #[inline]
297 fn bitor(self, rhs: Self) -> Self::Output {
298 pick! {
299 if #[cfg(target_feature="sse")] {
300 Self { sse: bitor_m128(self.sse, rhs.sse) }
301 } else if #[cfg(target_feature="simd128")] {
302 Self { simd: v128_or(self.simd, rhs.simd) }
303 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
304 unsafe {Self { neon: vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(self.neon), vreinterpretq_u32_f32(rhs.neon))) }}
305 } else {
306 Self { arr: [
307 f32::from_bits(self.arr[0].to_bits() | rhs.arr[0].to_bits()),
308 f32::from_bits(self.arr[1].to_bits() | rhs.arr[1].to_bits()),
309 f32::from_bits(self.arr[2].to_bits() | rhs.arr[2].to_bits()),
310 f32::from_bits(self.arr[3].to_bits() | rhs.arr[3].to_bits()),
311 ]}
312 }
313 }
314 }
315}
316
317impl BitXor for f32x4 {
318 type Output = Self;
319 #[inline]
320 fn bitxor(self, rhs: Self) -> Self::Output {
321 pick! {
322 if #[cfg(target_feature="sse")] {
323 Self { sse: bitxor_m128(self.sse, rhs.sse) }
324 } else if #[cfg(target_feature="simd128")] {
325 Self { simd: v128_xor(self.simd, rhs.simd) }
326 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
327 unsafe {Self { neon: vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(self.neon), vreinterpretq_u32_f32(rhs.neon))) }}
328 } else {
329 Self { arr: [
330 f32::from_bits(self.arr[0].to_bits() ^ rhs.arr[0].to_bits()),
331 f32::from_bits(self.arr[1].to_bits() ^ rhs.arr[1].to_bits()),
332 f32::from_bits(self.arr[2].to_bits() ^ rhs.arr[2].to_bits()),
333 f32::from_bits(self.arr[3].to_bits() ^ rhs.arr[3].to_bits()),
334 ]}
335 }
336 }
337 }
338}
339
340impl CmpEq for f32x4 {
341 type Output = Self;
342 #[inline]
343 fn simd_eq(self, rhs: Self) -> Self::Output {
344 pick! {
345 if #[cfg(target_feature="sse")] {
346 Self { sse: cmp_eq_mask_m128(self.sse, rhs.sse) }
347 } else if #[cfg(target_feature="simd128")] {
348 Self { simd: f32x4_eq(self.simd, rhs.simd) }
349 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
350 unsafe {Self { neon: vreinterpretq_f32_u32(vceqq_f32(self.neon, rhs.neon)) }}
351 } else {
352 Self { arr: [
353 if self.arr[0] == rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
354 if self.arr[1] == rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
355 if self.arr[2] == rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
356 if self.arr[3] == rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
357 ]}
358 }
359 }
360 }
361}
362
363impl CmpGe for f32x4 {
364 type Output = Self;
365 #[inline]
366 fn simd_ge(self, rhs: Self) -> Self::Output {
367 pick! {
368 if #[cfg(target_feature="sse")] {
369 Self { sse: cmp_ge_mask_m128(self.sse, rhs.sse) }
370 } else if #[cfg(target_feature="simd128")] {
371 Self { simd: f32x4_ge(self.simd, rhs.simd) }
372 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
373 unsafe {Self { neon: vreinterpretq_f32_u32(vcgeq_f32(self.neon, rhs.neon)) }}
374 } else {
375 Self { arr: [
376 if self.arr[0] >= rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
377 if self.arr[1] >= rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
378 if self.arr[2] >= rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
379 if self.arr[3] >= rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
380 ]}
381 }
382 }
383 }
384}
385
386impl CmpGt for f32x4 {
387 type Output = Self;
388 #[inline]
389 fn simd_gt(self, rhs: Self) -> Self::Output {
390 pick! {
391 if #[cfg(target_feature="sse")] {
392 Self { sse: cmp_gt_mask_m128(self.sse, rhs.sse) }
393 } else if #[cfg(target_feature="simd128")] {
394 Self { simd: f32x4_gt(self.simd, rhs.simd) }
395 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
396 unsafe {Self { neon: vreinterpretq_f32_u32(vcgtq_f32(self.neon, rhs.neon)) }}
397 } else {
398 Self { arr: [
399 if self.arr[0] > rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
400 if self.arr[1] > rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
401 if self.arr[2] > rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
402 if self.arr[3] > rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
403 ]}
404 }
405 }
406 }
407}
408
409impl CmpNe for f32x4 {
410 type Output = Self;
411 #[inline]
412 fn simd_ne(self, rhs: Self) -> Self::Output {
413 pick! {
414 if #[cfg(target_feature="sse")] {
415 Self { sse: cmp_neq_mask_m128(self.sse, rhs.sse) }
416 } else if #[cfg(target_feature="simd128")] {
417 Self { simd: f32x4_ne(self.simd, rhs.simd) }
418 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
419 unsafe {Self { neon: vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(self.neon, rhs.neon))) }}
420 } else {
421 Self { arr: [
422 if self.arr[0] != rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
423 if self.arr[1] != rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
424 if self.arr[2] != rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
425 if self.arr[3] != rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
426 ]}
427 }
428 }
429 }
430}
431
432impl CmpLe for f32x4 {
433 type Output = Self;
434 #[inline]
435 fn simd_le(self, rhs: Self) -> Self::Output {
436 pick! {
437 if #[cfg(target_feature="sse")] {
438 Self { sse: cmp_le_mask_m128(self.sse, rhs.sse) }
439 } else if #[cfg(target_feature="simd128")] {
440 Self { simd: f32x4_le(self.simd, rhs.simd) }
441 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
442 unsafe {Self { neon: vreinterpretq_f32_u32(vcleq_f32(self.neon, rhs.neon)) }}
443 } else {
444 Self { arr: [
445 if self.arr[0] <= rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
446 if self.arr[1] <= rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
447 if self.arr[2] <= rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
448 if self.arr[3] <= rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
449 ]}
450 }
451 }
452 }
453}
454
455impl CmpLt for f32x4 {
456 type Output = Self;
457 #[inline]
458 fn simd_lt(self, rhs: Self) -> Self::Output {
459 pick! {
460 if #[cfg(target_feature="sse")] {
461 Self { sse: cmp_lt_mask_m128(self.sse, rhs.sse) }
462 } else if #[cfg(target_feature="simd128")] {
463 Self { simd: f32x4_lt(self.simd, rhs.simd) }
464 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
465 unsafe {Self { neon: vreinterpretq_f32_u32(vcltq_f32(self.neon, rhs.neon)) }}
466 } else {
467 Self { arr: [
468 if self.arr[0] < rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
469 if self.arr[1] < rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
470 if self.arr[2] < rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
471 if self.arr[3] < rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
472 ]}
473 }
474 }
475 }
476}
477
478impl f32x4 {
479 #[inline]
480 #[must_use]
481 pub const fn new(array: [f32; 4]) -> Self {
482 #[allow(non_upper_case_globals)]
483 unsafe {
484 core::mem::transmute(array)
485 }
486 }
487
488 #[inline]
489 #[must_use]
490 pub fn blend(self, t: Self, f: Self) -> Self {
491 pick! {
492 if #[cfg(target_feature="sse4.1")] {
493 Self { sse: blend_varying_m128(f.sse, t.sse, self.sse) }
494 } else if #[cfg(target_feature="simd128")] {
495 Self { simd: v128_bitselect(t.simd, f.simd, self.simd) }
496 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
497 unsafe {Self { neon: vbslq_f32(vreinterpretq_u32_f32(self.neon), t.neon, f.neon) }}
498 } else {
499 generic_bit_blend(self, t, f)
500 }
501 }
502 }
503 #[inline]
504 #[must_use]
505 pub fn abs(self) -> Self {
506 pick! {
507 if #[cfg(target_feature="simd128")] {
508 Self { simd: f32x4_abs(self.simd) }
509 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
510 unsafe {Self { neon: vabsq_f32(self.neon) }}
511 } else {
512 let non_sign_bits = f32x4::from(f32::from_bits(i32::MAX as u32));
513 self & non_sign_bits
514 }
515 }
516 }
517 #[inline]
518 #[must_use]
519 pub fn floor(self) -> Self {
520 pick! {
521 if #[cfg(target_feature="simd128")] {
522 Self { simd: f32x4_floor(self.simd) }
523 } else if #[cfg(target_feature="sse4.1")] {
524 Self { sse: floor_m128(self.sse) }
525 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
526 unsafe {Self { neon: vrndmq_f32(self.neon) }}
527 } else if #[cfg(feature="std")] {
528 let base: [f32; 4] = cast(self);
529 cast(base.map(|val| val.floor()))
530 } else {
531 let base: [f32; 4] = cast(self);
532 let rounded: [f32; 4] = cast(self.round());
533 cast([
534 if base[0] < rounded[0] { rounded[0] - 1.0 } else { rounded[0] },
535 if base[1] < rounded[1] { rounded[1] - 1.0 } else { rounded[1] },
536 if base[2] < rounded[2] { rounded[2] - 1.0 } else { rounded[2] },
537 if base[3] < rounded[3] { rounded[3] - 1.0 } else { rounded[3] },
538 ])
539 }
540 }
541 }
542 #[inline]
543 #[must_use]
544 pub fn ceil(self) -> Self {
545 pick! {
546 if #[cfg(target_feature="simd128")] {
547 Self { simd: f32x4_ceil(self.simd) }
548 } else if #[cfg(target_feature="sse4.1")] {
549 Self { sse: ceil_m128(self.sse) }
550 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
551 unsafe {Self { neon: vrndpq_f32(self.neon) }}
552 } else if #[cfg(feature="std")] {
553 let base: [f32; 4] = cast(self);
554 cast(base.map(|val| val.ceil()))
555 } else {
556 let base: [f32; 4] = cast(self);
557 let rounded: [f32; 4] = cast(self.round());
558 cast([
559 if base[0] > rounded[0] { rounded[0] + 1.0 } else { rounded[0] },
560 if base[1] > rounded[1] { rounded[1] + 1.0 } else { rounded[1] },
561 if base[2] > rounded[2] { rounded[2] + 1.0 } else { rounded[2] },
562 if base[3] > rounded[3] { rounded[3] + 1.0 } else { rounded[3] },
563 ])
564 }
565 }
566 }
567
568 #[inline]
572 #[must_use]
573 pub fn fast_max(self, rhs: Self) -> Self {
574 pick! {
575 if #[cfg(target_feature="sse")] {
576 Self { sse: max_m128(self.sse, rhs.sse) }
577 } else if #[cfg(target_feature="simd128")] {
578 Self {
579 simd: f32x4_pmax(self.simd, rhs.simd),
580 }
581 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
582 unsafe {Self { neon: vmaxq_f32(self.neon, rhs.neon) }}
583 } else {
584 Self { arr: [
585 if self.arr[0] < rhs.arr[0] { rhs.arr[0] } else { self.arr[0] },
586 if self.arr[1] < rhs.arr[1] { rhs.arr[1] } else { self.arr[1] },
587 if self.arr[2] < rhs.arr[2] { rhs.arr[2] } else { self.arr[2] },
588 if self.arr[3] < rhs.arr[3] { rhs.arr[3] } else { self.arr[3] },
589 ]}
590 }
591 }
592 }
593
594 #[inline]
598 #[must_use]
599 pub fn max(self, rhs: Self) -> Self {
600 pick! {
601 if #[cfg(target_feature="sse")] {
602 rhs.is_nan().blend(self, Self { sse: max_m128(self.sse, rhs.sse) })
606 } else if #[cfg(target_feature="simd128")] {
607 Self {
614 simd: v128_bitselect(
615 rhs.simd,
616 f32x4_pmax(self.simd, rhs.simd),
617 f32x4_ne(self.simd, self.simd), )
619 }
620 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
621 unsafe {Self { neon: vmaxnmq_f32(self.neon, rhs.neon) }}
622 } else {
623 Self { arr: [
624 self.arr[0].max(rhs.arr[0]),
625 self.arr[1].max(rhs.arr[1]),
626 self.arr[2].max(rhs.arr[2]),
627 self.arr[3].max(rhs.arr[3]),
628 ]}
629 }
630 }
631 }
632
633 #[inline]
637 #[must_use]
638 pub fn fast_min(self, rhs: Self) -> Self {
639 pick! {
640 if #[cfg(target_feature="sse")] {
641 Self { sse: min_m128(self.sse, rhs.sse) }
642 } else if #[cfg(target_feature="simd128")] {
643 Self {
644 simd: f32x4_pmin(self.simd, rhs.simd),
645 }
646 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
647 unsafe {Self { neon: vminq_f32(self.neon, rhs.neon) }}
648 } else {
649 Self { arr: [
650 if self.arr[0] < rhs.arr[0] { self.arr[0] } else { rhs.arr[0] },
651 if self.arr[1] < rhs.arr[1] { self.arr[1] } else { rhs.arr[1] },
652 if self.arr[2] < rhs.arr[2] { self.arr[2] } else { rhs.arr[2] },
653 if self.arr[3] < rhs.arr[3] { self.arr[3] } else { rhs.arr[3] },
654 ]}
655 }
656 }
657 }
658
659 #[inline]
663 #[must_use]
664 pub fn min(self, rhs: Self) -> Self {
665 pick! {
666 if #[cfg(target_feature="sse")] {
667 rhs.is_nan().blend(self, Self { sse: min_m128(self.sse, rhs.sse) })
671 } else if #[cfg(target_feature="simd128")] {
672 Self {
679 simd: v128_bitselect(
680 rhs.simd,
681 f32x4_pmin(self.simd, rhs.simd),
682 f32x4_ne(self.simd, self.simd), )
684 }
685 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
686 unsafe {Self { neon: vminnmq_f32(self.neon, rhs.neon) }}
687 } else {
688 Self { arr: [
689 self.arr[0].min(rhs.arr[0]),
690 self.arr[1].min(rhs.arr[1]),
691 self.arr[2].min(rhs.arr[2]),
692 self.arr[3].min(rhs.arr[3]),
693 ]}
694 }
695 }
696 }
697 #[inline]
698 #[must_use]
699 pub fn is_nan(self) -> Self {
700 pick! {
701 if #[cfg(target_feature="sse")] {
702 Self { sse: cmp_unord_mask_m128(self.sse, self.sse) }
703 } else if #[cfg(target_feature="simd128")] {
704 Self { simd: f32x4_ne(self.simd, self.simd) }
705 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
706 unsafe {Self { neon: vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(self.neon, self.neon))) }}
707 } else {
708 Self { arr: [
709 if self.arr[0].is_nan() { f32::from_bits(u32::MAX) } else { 0.0 },
710 if self.arr[1].is_nan() { f32::from_bits(u32::MAX) } else { 0.0 },
711 if self.arr[2].is_nan() { f32::from_bits(u32::MAX) } else { 0.0 },
712 if self.arr[3].is_nan() { f32::from_bits(u32::MAX) } else { 0.0 },
713 ]}
714 }
715 }
716 }
717 #[inline]
718 #[must_use]
719 pub fn is_finite(self) -> Self {
720 let shifted_exp_mask = u32x4::from(0xFF000000);
721 let u: u32x4 = cast(self);
722 let shift_u = u << 1_u64;
723 let out = !(shift_u & shifted_exp_mask).simd_eq(shifted_exp_mask);
724 cast(out)
725 }
726 #[inline]
727 #[must_use]
728 pub fn is_inf(self) -> Self {
729 let shifted_inf = u32x4::from(0xFF000000);
730 let u: u32x4 = cast(self);
731 let shift_u = u << 1_u64;
732 let out = (shift_u).simd_eq(shifted_inf);
733 cast(out)
734 }
735
  /// Rounds every lane to an integral value, using the backend's
  /// round-to-nearest operation where one exists.
  #[inline]
  #[must_use]
  pub fn round(self) -> Self {
    pick! {
      if #[cfg(target_feature="sse4.1")] {
        Self { sse: round_m128::<{round_op!(Nearest)}>(self.sse) }
      } else if #[cfg(target_feature="sse2")] {
        // Round by converting to i32 and back to f32.
        let mi: m128i = convert_to_i32_m128i_from_m128(self.sse);
        let f: f32x4 = f32x4 { sse: convert_to_m128_from_i32_m128i(mi) };
        let i: i32x4 = cast(mi);
        // Lanes whose integer conversion produced 0x80000000 keep their
        // original value instead of the round-tripped one.
        // NOTE(review): presumably 0x80000000 is the conversion's
        // out-of-range / NaN marker — confirm against the intrinsic docs.
        let mask: f32x4 = cast(i.simd_eq(i32x4::from(0x80000000_u32 as i32)));
        mask.blend(self, f)
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: f32x4_nearest(self.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vrndnq_f32(self.neon) }}
      } else {
        // Software fallback built from lane-wise float and integer ops.
        // `to_int` is 1 / EPSILON (2^23); adding and subtracting it forces
        // the fractional bits to be rounded away.
        let to_int = f32x4::from(1.0 / f32::EPSILON);
        let u: u32x4 = cast(self);
        // Biased exponent of each lane.
        let e: i32x4 = cast((u >> 23) & u32x4::from(0xff));
        let mut y: f32x4;

        // Biased exponent >= 0x7f + 23: the lane is returned unchanged.
        let no_op_magic = i32x4::from(0x7f + 23);
        let no_op_mask: f32x4 = cast(e.simd_gt(no_op_magic) | e.simd_eq(no_op_magic));
        let no_op_val: f32x4 = self;

        // Biased exponent < 0x7f - 1 (magnitude below 0.5): the result is
        // `self * 0.0`.
        let zero_magic = i32x4::from(0x7f - 1);
        let zero_mask: f32x4 = cast(e.simd_lt(zero_magic));
        let zero_val: f32x4 = self * f32x4::from(0.0);

        // Work on the magnitude and re-apply the sign at the end.
        let neg_bit: f32x4 = cast(cast::<u32x4, i32x4>(u).simd_lt(i32x4::default()));
        let x: f32x4 = neg_bit.blend(-self, self);
        // `y` is the adjustment that takes `x` to a nearby integral value.
        y = x + to_int - to_int - x;
        y = y.simd_gt(f32x4::from(0.5)).blend(
          y + x - f32x4::from(-1.0),
          y.simd_lt(f32x4::from(-0.5)).blend(y + x + f32x4::from(1.0), y + x),
        );
        y = neg_bit.blend(-y, y);

        // Priority: unchanged lanes first, then the zero lanes, then `y`.
        no_op_mask.blend(no_op_val, zero_mask.blend(zero_val, y))
      }
    }
  }
782
783 #[inline]
787 #[must_use]
788 pub fn fast_round_int(self) -> i32x4 {
789 pick! {
790 if #[cfg(target_feature="sse2")] {
791 cast(convert_to_i32_m128i_from_m128(self.sse))
792 } else {
793 self.round_int()
794 }
795 }
796 }
797
  /// Rounds every lane to an integral value and converts it to `i32`.
  ///
  /// On SSE2, NaN lanes are zeroed before conversion and lanes at or above
  /// 2^31 are adjusted after it (see the inline notes).
  #[inline]
  #[must_use]
  pub fn round_int(self) -> i32x4 {
    pick! {
      if #[cfg(target_feature="sse2")] {
        // `self == self` is false only for NaN lanes; AND-ing with that mask
        // turns NaN lanes into 0.0 before the conversion.
        let non_nan_mask = self.simd_eq(self);
        let non_nan = self & non_nan_mask;
        // All-ones for lanes >= 2^31, which cannot be represented as i32.
        let flip_to_max: i32x4 = cast(self.simd_ge(Self::splat(2147483648.0)));
        // Note: this local shadows the `cast` function from here on.
        let cast: i32x4 = cast(convert_to_i32_m128i_from_m128(non_nan.sse));
        // XOR with all-ones inverts the converted bits of the too-large lanes.
        // NOTE(review): presumably the conversion yields 0x80000000 for those
        // lanes, so the inversion gives i32::MAX — confirm against intrinsic docs.
        flip_to_max ^ cast
      } else if #[cfg(target_feature="simd128")] {
        cast(Self { simd: i32x4_trunc_sat_f32x4(f32x4_nearest(self.simd)) })
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        cast(unsafe {Self { neon: vreinterpretq_f32_s32(vcvtnq_s32_f32(self.neon)) }})
      } else {
        // Round in software, then rely on Rust's `as i32` conversion per lane.
        let rounded: [f32; 4] = cast(self.round());
        cast([
          rounded[0] as i32,
          rounded[1] as i32,
          rounded[2] as i32,
          rounded[3] as i32,
        ])
      }
    }
  }
827
828 #[inline]
832 #[must_use]
833 pub fn fast_trunc_int(self) -> i32x4 {
834 pick! {
835 if #[cfg(target_feature="sse2")] {
836 cast(truncate_m128_to_m128i(self.sse))
837 } else {
838 self.trunc_int()
839 }
840 }
841 }
842
  /// Truncates every lane (drops the fractional part) and converts it to
  /// `i32`.
  ///
  /// On SSE2, NaN lanes are zeroed before conversion and lanes at or above
  /// 2^31 are adjusted after it (see the inline notes).
  #[inline]
  #[must_use]
  pub fn trunc_int(self) -> i32x4 {
    pick! {
      if #[cfg(target_feature="sse2")] {
        // `self == self` is false only for NaN lanes; AND-ing with that mask
        // turns NaN lanes into 0.0 before the conversion.
        let non_nan_mask = self.simd_eq(self);
        let non_nan = self & non_nan_mask;
        // All-ones for lanes >= 2^31, which cannot be represented as i32.
        let flip_to_max: i32x4 = cast(self.simd_ge(Self::splat(2147483648.0)));
        // Note: this local shadows the `cast` function from here on.
        let cast: i32x4 = cast(truncate_m128_to_m128i(non_nan.sse));
        // XOR with all-ones inverts the converted bits of the too-large lanes.
        // NOTE(review): presumably the conversion yields 0x80000000 for those
        // lanes, so the inversion gives i32::MAX — confirm against intrinsic docs.
        flip_to_max ^ cast
      } else if #[cfg(target_feature="simd128")] {
        cast(Self { simd: i32x4_trunc_sat_f32x4(self.simd) })
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        cast(unsafe {Self { neon: vreinterpretq_f32_s32(vcvtq_s32_f32(self.neon)) }})
      } else {
        // Rely on Rust's `as i32` conversion per lane.
        let n: [f32;4] = cast(self);
        cast([
          n[0] as i32,
          n[1] as i32,
          n[2] as i32,
          n[3] as i32,
        ])
      }
    }
  }
872 #[inline]
897 #[must_use]
898 pub fn mul_add(self, m: Self, a: Self) -> Self {
899 pick! {
900 if #[cfg(all(target_feature="sse2",target_feature="fma"))] {
901 Self { sse: fused_mul_add_m128(self.sse, m.sse, a.sse) }
902 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
903 unsafe { Self { neon: vfmaq_f32(a.neon, self.neon, m.neon) } }
904 } else {
905 (self * m) + a
906 }
907 }
908 }
909
910 #[inline]
936 #[must_use]
937 pub fn mul_sub(self, m: Self, s: Self) -> Self {
938 pick! {
939 if #[cfg(all(target_feature="sse2",target_feature="fma"))] {
940 Self { sse: fused_mul_sub_m128(self.sse, m.sse, s.sse) }
941 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
942 unsafe { Self { neon: vfmaq_f32(vnegq_f32(s.neon), self.neon, m.neon) } }
943 } else {
944 (self * m) - s
945 }
946 }
947 }
948
949 #[inline]
974 #[must_use]
975 pub fn mul_neg_add(self, m: Self, a: Self) -> Self {
976 pick! {
977 if #[cfg(all(target_feature="sse2",target_feature="fma"))] {
978 Self { sse: fused_mul_neg_add_m128(self.sse, m.sse, a.sse) }
979 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
980 unsafe { Self { neon: vfmsq_f32(a.neon, self.neon, m.neon) } }
981 } else {
982 a - (self * m)
983 }
984 }
985 }
986
987 #[inline]
1013 #[must_use]
1014 pub fn mul_neg_sub(self, m: Self, s: Self) -> Self {
1015 pick! {
1016 if #[cfg(all(target_feature="sse2",target_feature="fma"))] {
1017 Self { sse: fused_mul_neg_sub_m128(self.sse, m.sse, s.sse) }
1018 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
1019 unsafe { Self { neon: vnegq_f32(vfmaq_f32(s.neon, self.neon, m.neon)) } }
1020 } else {
1021 -(self * m) - s
1022 }
1023 }
1024 }
1025
1026 #[inline]
1027 #[must_use]
1028 pub fn flip_signs(self, signs: Self) -> Self {
1029 self ^ (signs & Self::from(-0.0))
1030 }
1031
1032 #[inline]
1033 #[must_use]
1034 pub fn copysign(self, sign: Self) -> Self {
1035 let magnitude_mask = Self::from(f32::from_bits(u32::MAX >> 1));
1036 (self & magnitude_mask) | (sign & Self::from(-0.0))
1037 }
1038
1039 #[inline]
1040 pub fn asin_acos(self) -> (Self, Self) {
1041 const_f32_as_f32x4!(P4asinf, 4.2163199048E-2);
1044 const_f32_as_f32x4!(P3asinf, 2.4181311049E-2);
1045 const_f32_as_f32x4!(P2asinf, 4.5470025998E-2);
1046 const_f32_as_f32x4!(P1asinf, 7.4953002686E-2);
1047 const_f32_as_f32x4!(P0asinf, 1.6666752422E-1);
1048
1049 let xa = self.abs();
1050 let big = xa.simd_ge(f32x4::splat(0.5));
1051
1052 let x1 = f32x4::splat(0.5) * (f32x4::ONE - xa);
1053 let x2 = xa * xa;
1054 let x3 = big.blend(x1, x2);
1055
1056 let xb = x1.sqrt();
1057
1058 let x4 = big.blend(xb, xa);
1059
1060 let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
1061 let z = z.mul_add(x3 * x4, x4);
1062
1063 let z1 = z + z;
1064
1065 let z3 = self.simd_lt(f32x4::ZERO).blend(f32x4::PI - z1, z1);
1067 let z4 = f32x4::FRAC_PI_2 - z.flip_signs(self);
1068 let acos = big.blend(z3, z4);
1069
1070 let z3 = f32x4::FRAC_PI_2 - z1;
1072 let asin = big.blend(z3, z);
1073 let asin = asin.flip_signs(self);
1074
1075 (asin, acos)
1076 }
1077
1078 #[inline]
1079 pub fn asin(self) -> Self {
1080 const_f32_as_f32x4!(P4asinf, 4.2163199048E-2);
1083 const_f32_as_f32x4!(P3asinf, 2.4181311049E-2);
1084 const_f32_as_f32x4!(P2asinf, 4.5470025998E-2);
1085 const_f32_as_f32x4!(P1asinf, 7.4953002686E-2);
1086 const_f32_as_f32x4!(P0asinf, 1.6666752422E-1);
1087
1088 let xa = self.abs();
1089 let big = xa.simd_ge(f32x4::splat(0.5));
1090
1091 let x1 = f32x4::splat(0.5) * (f32x4::ONE - xa);
1092 let x2 = xa * xa;
1093 let x3 = big.blend(x1, x2);
1094
1095 let xb = x1.sqrt();
1096
1097 let x4 = big.blend(xb, xa);
1098
1099 let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
1100 let z = z.mul_add(x3 * x4, x4);
1101
1102 let z1 = z + z;
1103
1104 let z3 = f32x4::FRAC_PI_2 - z1;
1106 let asin = big.blend(z3, z);
1107 let asin = asin.flip_signs(self);
1108
1109 asin
1110 }
1111
  /// Computes the arccosine of every lane.
  ///
  /// Lanes with `|x| >= 0.5` are evaluated on `0.5 * (1 - |x|)` and its
  /// square root, the other lanes on `x * x` and `|x|`.
  #[inline]
  #[must_use]
  pub fn acos(self) -> Self {
    // Polynomial coefficients for the asin approximation.
    const_f32_as_f32x4!(P4asinf, 4.2163199048E-2);
    const_f32_as_f32x4!(P3asinf, 2.4181311049E-2);
    const_f32_as_f32x4!(P2asinf, 4.5470025998E-2);
    const_f32_as_f32x4!(P1asinf, 7.4953002686E-2);
    const_f32_as_f32x4!(P0asinf, 1.6666752422E-1);

    let xa = self.abs();
    // Mask of lanes with |x| >= 0.5.
    let big = xa.simd_ge(f32x4::splat(0.5));

    let x1 = f32x4::splat(0.5) * (f32x4::ONE - xa);
    let x2 = xa * xa;
    // Polynomial argument: x1 for big lanes, x^2 otherwise.
    let x3 = big.blend(x1, x2);

    let xb = x1.sqrt();

    // Multiplier: sqrt(x1) for big lanes, |x| otherwise.
    let x4 = big.blend(xb, xa);

    // NOTE(review): `polynomial_4!` is defined elsewhere; presumably it
    // evaluates the polynomial in `x3` with coefficients P0..P4 — confirm.
    let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
    let z = z.mul_add(x3 * x4, x4);

    let z1 = z + z;

    // Big lanes use 2z (or PI - 2z for negative input); the other lanes use
    // PI/2 - z with the sign of the input applied to z.
    let z3 = self.simd_lt(f32x4::ZERO).blend(f32x4::PI - z1, z1);
    let z4 = f32x4::FRAC_PI_2 - z.flip_signs(self);
    let acos = big.blend(z3, z4);

    acos
  }
1146
1147 #[inline]
1148 pub fn atan(self) -> Self {
1149 const_f32_as_f32x4!(P3atanf, 8.05374449538E-2);
1152 const_f32_as_f32x4!(P2atanf, -1.38776856032E-1);
1153 const_f32_as_f32x4!(P1atanf, 1.99777106478E-1);
1154 const_f32_as_f32x4!(P0atanf, -3.33329491539E-1);
1155
1156 let t = self.abs();
1157
1158 let notsmal = t.simd_ge(Self::SQRT_2 - Self::ONE);
1162 let notbig = t.simd_le(Self::SQRT_2 + Self::ONE);
1163
1164 let mut s = notbig.blend(Self::FRAC_PI_4, Self::FRAC_PI_2);
1165 s = notsmal & s;
1166
1167 let mut a = notbig & t;
1168 a = notsmal.blend(a - Self::ONE, a);
1169 let mut b = notbig & Self::ONE;
1170 b = notsmal.blend(b + t, b);
1171 let z = a / b;
1172
1173 let zz = z * z;
1174
1175 let mut re = polynomial_3!(zz, P0atanf, P1atanf, P2atanf, P3atanf);
1177 re = re.mul_add(zz * z, z) + s;
1178
1179 re = (self.sign_bit()).blend(-re, re);
1181
1182 re
1183 }
1184
  /// Lanewise two-argument arctangent, with `self` as the `y` operand and the
  /// parameter `x` as the `x` operand.
  ///
  /// The magnitudes are ordered so the quotient never exceeds one, a polynomial
  /// approximation is evaluated on the reduced argument, and the result is then
  /// corrected using the swap mask and the sign bits of `x` and `y`.
  #[inline]
  pub fn atan2(self, x: Self) -> Self {
    const_f32_as_f32x4!(P3atanf, 8.05374449538E-2);
    const_f32_as_f32x4!(P2atanf, -1.38776856032E-1);
    const_f32_as_f32x4!(P1atanf, 1.99777106478E-1);
    const_f32_as_f32x4!(P0atanf, -3.33329491539E-1);

    let y = self;

    // Work on magnitudes; put the larger one in `x2` and the smaller in `y2`.
    let x1 = x.abs();
    let y1 = y.abs();
    let swapxy = y1.simd_gt(x1);
    let mut x2 = swapxy.blend(y1, x1);
    let mut y2 = swapxy.blend(x1, y1);

    // When both inputs are infinite the quotient below would be inf / inf.
    // AND-ing with the bit pattern of -1.0 replaces those magnitudes first
    // (presumably turning +inf into 1.0 -- verify against the IEEE 754 layout).
    let both_infinite = x.is_inf() & y.is_inf();
    if both_infinite.any() {
      let minus_one = -Self::ONE;
      x2 = both_infinite.blend(x2 & minus_one, x2);
      y2 = both_infinite.blend(y2 & minus_one, y2);
    }

    // Quotient of the smaller magnitude by the larger one.
    let t = y2 / x2;

    // Lanes at or above sqrt(2) - 1 are reduced further with an offset of pi/4.
    let notsmal = t.simd_ge(Self::SQRT_2 - Self::ONE);

    let a = notsmal.blend(t - Self::ONE, t);
    let b = notsmal.blend(t + Self::ONE, Self::ONE);
    let s = notsmal & Self::FRAC_PI_4;
    let z = a / b;

    let zz = z * z;

    // Polynomial in z^2, then add back the range-reduction offset.
    let mut re = polynomial_3!(zz, P0atanf, P1atanf, P2atanf, P3atanf);
    re = re.mul_add(zz * z, z) + s;

    // Undo the operand swap.
    re = swapxy.blend(Self::FRAC_PI_2 - re, re);
    // Lanes where the bitwise OR of `x` and `y` compares equal to zero yield 0.
    re = ((x | y).simd_eq(Self::ZERO)).blend(Self::ZERO, re);
    // Lanes whose `x` has its sign bit set are reflected to pi - re.
    re = (x.sign_bit()).blend(Self::PI - re, re);

    // Finally the result takes the sign of `y`.
    re = (y.sign_bit()).blend(-re, re);

    re
  }
1240
  /// Computes lanewise sine and cosine together, returned as `(sin, cos)`.
  ///
  /// The magnitude of each lane is reduced by the nearest multiple of `pi/2`
  /// (subtracted in three parts), short polynomials approximate sine and
  /// cosine of the remainder, and the quadrant index `q` decides which
  /// polynomial feeds which output and which signs are flipped.
  #[inline]
  #[must_use]
  pub fn sin_cos(self) -> (Self, Self) {
    // pi/2 split into three parts, subtracted one after another below.
    const_f32_as_f32x4!(DP1F, 0.78515625_f32 * 2.0);
    const_f32_as_f32x4!(DP2F, 2.4187564849853515625E-4_f32 * 2.0);
    const_f32_as_f32x4!(DP3F, 3.77489497744594108E-8_f32 * 2.0);

    // Sine polynomial coefficients.
    const_f32_as_f32x4!(P0sinf, -1.6666654611E-1);
    const_f32_as_f32x4!(P1sinf, 8.3321608736E-3);
    const_f32_as_f32x4!(P2sinf, -1.9515295891E-4);

    // Cosine polynomial coefficients.
    const_f32_as_f32x4!(P0cosf, 4.166664568298827E-2);
    const_f32_as_f32x4!(P1cosf, -1.388731625493765E-3);
    const_f32_as_f32x4!(P2cosf, 2.443315711809948E-5);

    const_f32_as_f32x4!(TWO_OVER_PI, 2.0 / core::f32::consts::PI);

    let xa = self.abs();

    // Nearest multiple of pi/2, both as a float (`y`) and as an integer (`q`).
    let y = (xa * TWO_OVER_PI).round();
    let q: i32x4 = y.round_int();

    // Reduced argument: xa - y * (DP1F + DP2F + DP3F), applied stepwise.
    let x = y.mul_neg_add(DP3F, y.mul_neg_add(DP2F, y.mul_neg_add(DP1F, xa)));

    let x2 = x * x;
    let mut s = polynomial_2!(x2, P0sinf, P1sinf, P2sinf) * (x * x2) + x;
    let mut c = polynomial_2!(x2, P0cosf, P1cosf, P2cosf) * (x2 * x2)
      + f32x4::from(0.5).mul_neg_add(x2, f32x4::from(1.0));

    // Odd `q` means the sine and cosine approximations trade places.
    let swap = !(q & i32x4::from(1)).simd_eq(i32x4::from(0));

    // For finite inputs whose quadrant index exceeds 0x2000000 the
    // approximations are replaced by the fixed pair (0, 1).
    let mut overflow: f32x4 = cast(q.simd_gt(i32x4::from(0x2000000)));
    overflow &= xa.is_finite();
    s = overflow.blend(f32x4::from(0.0), s);
    c = overflow.blend(f32x4::from(1.0), c);

    // Sine: sign derived from bit 1 of `q` (shifted into the sign position)
    // XOR-ed with the raw bits of the original input.
    let mut sin1 = cast::<_, f32x4>(swap).blend(c, s);
    let sign_sin: i32x4 = (q << 30) ^ cast::<_, i32x4>(self);
    sin1 = sin1.flip_signs(cast(sign_sin));

    // Cosine: sign derived from bit 1 of `q + 1`, XOR-ed into the value.
    let mut cos1 = cast::<_, f32x4>(swap).blend(s, c);
    let sign_cos: i32x4 = ((q + i32x4::from(1)) & i32x4::from(2)) << 30;
    cos1 ^= cast::<_, f32x4>(sign_cos);

    (sin1, cos1)
  }
1293
1294 #[inline]
1295 #[must_use]
1296 pub fn sin(self) -> Self {
1297 let (s, _) = self.sin_cos();
1298 s
1299 }
1300 #[inline]
1301 #[must_use]
1302 pub fn cos(self) -> Self {
1303 let (_, c) = self.sin_cos();
1304 c
1305 }
1306 #[inline]
1307 #[must_use]
1308 pub fn tan(self) -> Self {
1309 let (s, c) = self.sin_cos();
1310 s / c
1311 }
1312 #[inline]
1313 #[must_use]
1314 pub fn to_degrees(self) -> Self {
1315 const_f32_as_f32x4!(RAD_TO_DEG_RATIO, 180.0_f32 / core::f32::consts::PI);
1316 self * RAD_TO_DEG_RATIO
1317 }
1318 #[inline]
1319 #[must_use]
1320 pub fn to_radians(self) -> Self {
1321 const_f32_as_f32x4!(DEG_TO_RAD_RATIO, core::f32::consts::PI / 180.0_f32);
1322 self * DEG_TO_RAD_RATIO
1323 }
1324 #[inline]
1325 #[must_use]
1326 pub fn recip(self) -> Self {
1327 pick! {
1328 if #[cfg(target_feature="sse")] {
1329 Self { sse: reciprocal_m128(self.sse) }
1330 } else if #[cfg(target_feature="simd128")] {
1331 Self { simd: f32x4_div(f32x4_splat(1.0), self.simd) }
1332 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
1333 unsafe {Self { neon: vdivq_f32(vdupq_n_f32(1.0), self.neon) }}
1334 } else {
1335 Self { arr: [
1336 1.0 / self.arr[0],
1337 1.0 / self.arr[1],
1338 1.0 / self.arr[2],
1339 1.0 / self.arr[3],
1340 ]}
1341 }
1342 }
1343 }
1344 #[inline]
1345 #[must_use]
1346 pub fn recip_sqrt(self) -> Self {
1347 pick! {
1348 if #[cfg(target_feature="sse")] {
1349 Self { sse: reciprocal_sqrt_m128(self.sse) }
1350 } else if #[cfg(target_feature="simd128")] {
1351 Self { simd: f32x4_div(f32x4_splat(1.0), f32x4_sqrt(self.simd)) }
1352 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
1353 unsafe {Self { neon: vdivq_f32(vdupq_n_f32(1.0), vsqrtq_f32(self.neon)) }}
1354 } else if #[cfg(feature="std")] {
1355 Self { arr: [
1356 1.0 / self.arr[0].sqrt(),
1357 1.0 / self.arr[1].sqrt(),
1358 1.0 / self.arr[2].sqrt(),
1359 1.0 / self.arr[3].sqrt(),
1360 ]}
1361 } else {
1362 Self { arr: [
1363 1.0 / software_sqrt(self.arr[0] as f64) as f32,
1364 1.0 / software_sqrt(self.arr[1] as f64) as f32,
1365 1.0 / software_sqrt(self.arr[2] as f64) as f32,
1366 1.0 / software_sqrt(self.arr[3] as f64) as f32,
1367 ]}
1368 }
1369 }
1370 }
1371 #[inline]
1372 #[must_use]
1373 pub fn sqrt(self) -> Self {
1374 pick! {
1375 if #[cfg(target_feature="sse")] {
1376 Self { sse: sqrt_m128(self.sse) }
1377 } else if #[cfg(target_feature="simd128")] {
1378 Self { simd: f32x4_sqrt(self.simd) }
1379 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
1380 unsafe {Self { neon: vsqrtq_f32(self.neon) }}
1381 } else if #[cfg(feature="std")] {
1382 Self { arr: [
1383 self.arr[0].sqrt(),
1384 self.arr[1].sqrt(),
1385 self.arr[2].sqrt(),
1386 self.arr[3].sqrt(),
1387 ]}
1388 } else {
1389 Self { arr: [
1390 software_sqrt(self.arr[0] as f64) as f32,
1391 software_sqrt(self.arr[1] as f64) as f32,
1392 software_sqrt(self.arr[2] as f64) as f32,
1393 software_sqrt(self.arr[3] as f64) as f32,
1394 ]}
1395 }
1396 }
1397 }
1398
1399 #[inline]
1400 #[must_use]
1401 #[doc(alias("movemask", "move_mask"))]
1402 pub fn to_bitmask(self) -> u32 {
1403 pick! {
1404 if #[cfg(target_feature="sse")] {
1405 move_mask_m128(self.sse) as u32
1406 } else if #[cfg(target_feature="simd128")] {
1407 u32x4_bitmask(self.simd) as u32
1408 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
1409 unsafe
1410 {
1411 let masked = vcltq_s32( vreinterpretq_s32_f32(self.neon), vdupq_n_s32(0));
1413
1414 let selectbit : uint32x4_t = core::mem::transmute([1u32, 2, 4, 8]);
1416 let r = vandq_u32(masked, selectbit);
1417
1418 vaddvq_u32(r) as u32
1420 }
1421 } else {
1422 (((self.arr[0].to_bits() as i32) < 0) as u32) << 0 |
1423 (((self.arr[1].to_bits() as i32) < 0) as u32) << 1 |
1424 (((self.arr[2].to_bits() as i32) < 0) as u32) << 2 |
1425 (((self.arr[3].to_bits() as i32) < 0) as u32) << 3
1426 }
1427 }
1428 }
1429 #[inline]
1430 #[must_use]
1431 pub fn any(self) -> bool {
1432 pick! {
1433 if #[cfg(target_feature="simd128")] {
1434 v128_any_true(self.simd)
1435 } else {
1436 self.to_bitmask() != 0
1437 }
1438 }
1439 }
1440 #[inline]
1441 #[must_use]
1442 pub fn all(self) -> bool {
1443 pick! {
1444 if #[cfg(target_feature="simd128")] {
1445 u32x4_all_true(self.simd)
1446 } else {
1447 self.to_bitmask() == 0b1111
1449 }
1450 }
1451 }
1452 #[inline]
1453 #[must_use]
1454 pub fn none(self) -> bool {
1455 !self.any()
1456 }
1457
1458 #[inline]
1459 fn vm_pow2n(self) -> Self {
1460 const_f32_as_f32x4!(pow2_23, 8388608.0);
1461 const_f32_as_f32x4!(bias, 127.0);
1462 let a = self + (bias + pow2_23);
1463 let c = cast::<_, i32x4>(a) << 23;
1464 cast::<_, f32x4>(c)
1465 }
1466
1467 #[inline]
1469 #[must_use]
1470 pub fn exp(self) -> Self {
1471 const_f32_as_f32x4!(P0, 1.0 / 2.0);
1472 const_f32_as_f32x4!(P1, 1.0 / 6.0);
1473 const_f32_as_f32x4!(P2, 1. / 24.);
1474 const_f32_as_f32x4!(P3, 1. / 120.);
1475 const_f32_as_f32x4!(P4, 1. / 720.);
1476 const_f32_as_f32x4!(P5, 1. / 5040.);
1477 const_f32_as_f32x4!(LN2D_HI, 0.693359375);
1478 const_f32_as_f32x4!(LN2D_LO, -2.12194440e-4);
1479 let max_x = f32x4::from(87.3);
1480 let r = (self * Self::LOG2_E).round();
1481 let x = r.mul_neg_add(LN2D_HI, self);
1482 let x = r.mul_neg_add(LN2D_LO, x);
1483 let z = polynomial_5!(x, P0, P1, P2, P3, P4, P5);
1484 let x2 = x * x;
1485 let z = z.mul_add(x2, x);
1486 let n2 = Self::vm_pow2n(r);
1487 let z = (z + Self::ONE) * n2;
1488 let in_range = self.abs().simd_lt(max_x);
1490 let in_range = in_range & self.is_finite();
1491 in_range.blend(z, Self::ZERO)
1492 }
1493
1494 #[inline]
1495 fn exponent(self) -> f32x4 {
1496 const_f32_as_f32x4!(pow2_23, 8388608.0);
1497 const_f32_as_f32x4!(bias, 127.0);
1498 let a = cast::<_, u32x4>(self);
1499 let b = a >> 23;
1500 let c = b | cast::<_, u32x4>(pow2_23);
1501 let d = cast::<_, f32x4>(c);
1502 let e = d - (pow2_23 + bias);
1503 e
1504 }
1505
1506 #[inline]
1507 fn fraction_2(self) -> Self {
1508 let t1 = cast::<_, u32x4>(self);
1509 let t2 = cast::<_, u32x4>(
1510 (t1 & u32x4::from(0x007FFFFF)) | u32x4::from(0x3F000000),
1511 );
1512 cast::<_, f32x4>(t2)
1513 }
1514 #[inline]
1515 fn is_zero_or_subnormal(self) -> Self {
1516 let t = cast::<_, i32x4>(self);
1517 let t = t & i32x4::splat(0x7F800000);
1518 i32x4::round_float(t.simd_eq(i32x4::splat(0)))
1519 }
1520 #[inline]
1521 fn infinity() -> Self {
1522 cast::<_, f32x4>(i32x4::splat(0x7F800000))
1523 }
1524 #[inline]
1525 fn nan_log() -> Self {
1526 cast::<_, f32x4>(i32x4::splat(0x7FC00000 | 0x101 & 0x003FFFFF))
1527 }
1528 #[inline]
1529 fn nan_pow() -> Self {
1530 cast::<_, f32x4>(i32x4::splat(0x7FC00000 | 0x101 & 0x003FFFFF))
1531 }
1532 #[inline]
1533 pub fn sign_bit(self) -> Self {
1534 let t1 = cast::<_, i32x4>(self);
1535 let t2 = t1 >> 31;
1536 !cast::<_, f32x4>(t2).simd_eq(f32x4::ZERO)
1537 }
1538
1539 #[inline]
1541 #[must_use]
1542 pub fn reduce_add(self) -> f32 {
1543 let arr: [f32; 4] = cast(self);
1544 arr.iter().sum()
1545 }
1546
1547 #[inline]
1549 #[must_use]
1550 pub fn ln(self) -> Self {
1551 const_f32_as_f32x4!(HALF, 0.5);
1552 const_f32_as_f32x4!(P0, 3.3333331174E-1);
1553 const_f32_as_f32x4!(P1, -2.4999993993E-1);
1554 const_f32_as_f32x4!(P2, 2.0000714765E-1);
1555 const_f32_as_f32x4!(P3, -1.6668057665E-1);
1556 const_f32_as_f32x4!(P4, 1.4249322787E-1);
1557 const_f32_as_f32x4!(P5, -1.2420140846E-1);
1558 const_f32_as_f32x4!(P6, 1.1676998740E-1);
1559 const_f32_as_f32x4!(P7, -1.1514610310E-1);
1560 const_f32_as_f32x4!(P8, 7.0376836292E-2);
1561 const_f32_as_f32x4!(LN2F_HI, 0.693359375);
1562 const_f32_as_f32x4!(LN2F_LO, -2.12194440e-4);
1563 const_f32_as_f32x4!(VM_SMALLEST_NORMAL, 1.17549435E-38);
1564
1565 let x1 = self;
1566 let x = Self::fraction_2(x1);
1567 let e = Self::exponent(x1);
1568 let mask = x.simd_gt(Self::SQRT_2 * HALF);
1569 let x = (!mask).blend(x + x, x);
1570 let fe = mask.blend(e + Self::ONE, e);
1571 let x = x - Self::ONE;
1572 let res = polynomial_8!(x, P0, P1, P2, P3, P4, P5, P6, P7, P8);
1573 let x2 = x * x;
1574 let res = x2 * x * res;
1575 let res = fe.mul_add(LN2F_LO, res);
1576 let res = res + x2.mul_neg_add(HALF, x);
1577 let res = fe.mul_add(LN2F_HI, res);
1578 let overflow = !self.is_finite();
1579 let underflow = x1.simd_lt(VM_SMALLEST_NORMAL);
1580 let mask = overflow | underflow;
1581 if !mask.any() {
1582 res
1583 } else {
1584 let is_zero = self.is_zero_or_subnormal();
1585 let res = underflow.blend(Self::nan_log(), res);
1586 let res = is_zero.blend(Self::infinity(), res);
1587 let res = overflow.blend(self, res);
1588 res
1589 }
1590 }
1591
1592 #[inline]
1593 #[must_use]
1594 pub fn log2(self) -> Self {
1595 Self::ln(self) * Self::LOG2_E
1596 }
1597 #[inline]
1598 #[must_use]
1599 pub fn log10(self) -> Self {
1600 Self::ln(self) * Self::LOG10_E
1601 }
1602
1603 #[inline]
1604 #[must_use]
1605 pub fn pow_f32x4(self, y: f32x4) -> Self {
1606 const_f32_as_f32x4!(ln2f_hi, 0.693359375);
1607 const_f32_as_f32x4!(ln2f_lo, -2.12194440e-4);
1608 const_f32_as_f32x4!(P0logf, 3.3333331174E-1);
1609 const_f32_as_f32x4!(P1logf, -2.4999993993E-1);
1610 const_f32_as_f32x4!(P2logf, 2.0000714765E-1);
1611 const_f32_as_f32x4!(P3logf, -1.6668057665E-1);
1612 const_f32_as_f32x4!(P4logf, 1.4249322787E-1);
1613 const_f32_as_f32x4!(P5logf, -1.2420140846E-1);
1614 const_f32_as_f32x4!(P6logf, 1.1676998740E-1);
1615 const_f32_as_f32x4!(P7logf, -1.1514610310E-1);
1616 const_f32_as_f32x4!(P8logf, 7.0376836292E-2);
1617
1618 const_f32_as_f32x4!(p2expf, 1.0 / 2.0); const_f32_as_f32x4!(p3expf, 1.0 / 6.0);
1620 const_f32_as_f32x4!(p4expf, 1.0 / 24.0);
1621 const_f32_as_f32x4!(p5expf, 1.0 / 120.0);
1622 const_f32_as_f32x4!(p6expf, 1.0 / 720.0);
1623 const_f32_as_f32x4!(p7expf, 1.0 / 5040.0);
1624
1625 let x1 = self.abs();
1626 let x = x1.fraction_2();
1627
1628 let mask = x.simd_gt(f32x4::SQRT_2 * f32x4::HALF);
1629 let x = (!mask).blend(x + x, x);
1630
1631 let x = x - f32x4::ONE;
1632 let x2 = x * x;
1633 let lg1 = polynomial_8!(
1634 x, P0logf, P1logf, P2logf, P3logf, P4logf, P5logf, P6logf, P7logf, P8logf
1635 );
1636 let lg1 = lg1 * x2 * x;
1637
1638 let ef = x1.exponent();
1639 let ef = mask.blend(ef + f32x4::ONE, ef);
1640
1641 let e1 = (ef * y).round();
1642 let yr = ef.mul_sub(y, e1);
1643
1644 let lg = f32x4::HALF.mul_neg_add(x2, x) + lg1;
1645 let x2_err = (f32x4::HALF * x).mul_sub(x, f32x4::HALF * x2);
1646 let lg_err = f32x4::HALF.mul_add(x2, lg - x) - lg1;
1647
1648 let e2 = (lg * y * f32x4::LOG2_E).round();
1649 let v = lg.mul_sub(y, e2 * ln2f_hi);
1650 let v = e2.mul_neg_add(ln2f_lo, v);
1651 let v = v - (lg_err + x2_err).mul_sub(y, yr * f32x4::LN_2);
1652
1653 let x = v;
1654 let e3 = (x * f32x4::LOG2_E).round();
1655 let x = e3.mul_neg_add(f32x4::LN_2, x);
1656 let x2 = x * x;
1657 let z = x2.mul_add(
1658 polynomial_5!(x, p2expf, p3expf, p4expf, p5expf, p6expf, p7expf),
1659 x + f32x4::ONE,
1660 );
1661
1662 let ee = e1 + e2 + e3;
1663 let ei = cast::<_, i32x4>(ee.round_int());
1664 let ej = cast::<_, i32x4>(ei + (cast::<_, i32x4>(z) >> 23));
1665
1666 let overflow = cast::<_, f32x4>(ej.simd_gt(i32x4::splat(0x0FF)))
1667 | (ee.simd_gt(f32x4::splat(300.0)));
1668 let underflow = cast::<_, f32x4>(ej.simd_lt(i32x4::splat(0x000)))
1669 | (ee.simd_lt(f32x4::splat(-300.0)));
1670
1671 let z = cast::<_, f32x4>(cast::<_, i32x4>(z) + (ei << 23));
1673
1674 let z = if (overflow | underflow).any() {
1676 let z = underflow.blend(f32x4::ZERO, z);
1677 overflow.blend(Self::infinity(), z)
1678 } else {
1679 z
1680 };
1681
1682 let x_zero = self.is_zero_or_subnormal();
1684 let z = x_zero.blend(
1685 y.simd_lt(f32x4::ZERO).blend(
1686 Self::infinity(),
1687 y.simd_eq(f32x4::ZERO).blend(f32x4::ONE, f32x4::ZERO),
1688 ),
1689 z,
1690 );
1691
1692 let x_sign = self.sign_bit();
1693 let z = if x_sign.any() {
1694 let yi = y.simd_eq(y.round());
1696 let y_odd = cast::<_, i32x4>(y.round_int() << 31).round_float();
1698
1699 let z1 =
1700 yi.blend(z | y_odd, self.simd_eq(Self::ZERO).blend(z, Self::nan_pow()));
1701 x_sign.blend(z1, z)
1702 } else {
1703 z
1704 };
1705
1706 let x_finite = self.is_finite();
1707 let y_finite = y.is_finite();
1708 let e_finite = ee.is_finite();
1709 if (x_finite & y_finite & (e_finite | x_zero)).all() {
1710 return z;
1711 }
1712
1713 (self.is_nan() | y.is_nan()).blend(self + y, z)
1714 }
1715
1716 #[inline]
1717 pub fn powf(self, y: f32) -> Self {
1718 Self::pow_f32x4(self, f32x4::splat(y))
1719 }
1720
1721 #[must_use]
1722 #[inline]
1723 pub fn unpack_lo(self, b: Self) -> Self {
1724 pick! {
1725 if #[cfg(target_feature="sse")] {
1726 Self { sse: unpack_low_m128(self.sse, b.sse) }
1727 } else if #[cfg(target_feature="simd128")] {
1728 Self {
1729 simd: u32x4_shuffle::<0, 4, 1, 5>(self.simd, b.simd)
1730 }
1731 } else if #[cfg(all(target_feature="neon", target_arch="aarch64"))]{
1732 unsafe {Self { neon: vzip1q_f32(self.neon, b.neon) }}
1733 } else {
1734 Self { arr: [
1735 self.arr[0],
1736 b.arr[0],
1737 self.arr[1],
1738 b.arr[1],
1739 ]}
1740 }
1741 }
1742 }
1743
1744 #[must_use]
1745 #[inline]
1746 pub fn unpack_hi(self, b: Self) -> Self {
1747 pick! {
1748 if #[cfg(target_feature="sse")] {
1749 Self { sse: unpack_high_m128(self.sse, b.sse) }
1750 } else if #[cfg(target_feature="simd128")] {
1751 Self {
1752 simd: u32x4_shuffle::<2, 6, 3, 7>(self.simd, b.simd)
1753 }
1754 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
1755 unsafe {Self { neon: vzip2q_f32(self.neon, b.neon) }}
1756 } else {
1757 Self { arr: [
1758 self.arr[2],
1759 b.arr[2],
1760 self.arr[3],
1761 b.arr[3],
1762 ]}
1763 }
1764 }
1765 }
1766
1767 #[must_use]
1769 #[inline]
1770 pub fn transpose(data: [f32x4; 4]) -> [f32x4; 4] {
1771 pick! {
1772 if #[cfg(target_feature="sse")] {
1773 let mut e0 = data[0];
1774 let mut e1 = data[1];
1775 let mut e2 = data[2];
1776 let mut e3 = data[3];
1777
1778 transpose_four_m128(&mut e0.sse, &mut e1.sse, &mut e2.sse, &mut e3.sse);
1779
1780 [e0, e1, e2, e3]
1781 } else if #[cfg(any(all(target_feature="neon",target_arch="aarch64"), target_feature="simd128"))] {
1782 let a = data[0].unpack_lo(data[2]);
1783 let b = data[1].unpack_lo(data[3]);
1784 let c = data[0].unpack_hi(data[2]);
1785 let d = data[1].unpack_hi(data[3]);
1786
1787 [
1788 a.unpack_lo(b),
1789 a.unpack_hi(b),
1790 c.unpack_lo(d),
1791 c.unpack_hi(d),
1792 ]
1793 } else {
1794 #[inline(always)]
1795 fn transpose_column(data: &[f32x4; 4], index: usize) -> f32x4 {
1796 f32x4::new([
1797 data[0].as_array()[index],
1798 data[1].as_array()[index],
1799 data[2].as_array()[index],
1800 data[3].as_array()[index],
1801 ])
1802 }
1803
1804 [
1805 transpose_column(&data, 0),
1806 transpose_column(&data, 1),
1807 transpose_column(&data, 2),
1808 transpose_column(&data, 3),
1809 ]
1810 }
1811 }
1812 }
1813
1814 #[inline]
1815 pub fn to_array(self) -> [f32; 4] {
1816 cast(self)
1817 }
1818
1819 #[inline]
1820 pub fn as_array(&self) -> &[f32; 4] {
1821 cast_ref(self)
1822 }
1823
1824 #[inline]
1825 pub fn as_mut_array(&mut self) -> &mut [f32; 4] {
1826 cast_mut(self)
1827 }
1828
1829 #[inline]
1830 pub fn from_i32x4(v: i32x4) -> Self {
1831 pick! {
1832 if #[cfg(target_feature="sse2")] {
1833 Self { sse: convert_to_m128_from_i32_m128i(v.sse) }
1834 } else if #[cfg(target_feature="simd128")] {
1835 Self { simd: f32x4_convert_i32x4(v.simd) }
1836 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
1837 Self { neon: unsafe { vcvtq_f32_s32(v.neon) }}
1838 } else {
1839 Self { arr: [
1840 v.as_array()[0] as f32,
1841 v.as_array()[1] as f32,
1842 v.as_array()[2] as f32,
1843 v.as_array()[3] as f32,
1844 ] }
1845 }
1846 }
1847 }
1848}