wide/
i16x8_.rs

1use super::*;
2
3pick! {
4  if #[cfg(target_feature="sse2")] {
5    #[derive(Default, Clone, Copy, PartialEq, Eq)]
6    #[repr(C, align(16))]
7    pub struct i16x8 { pub(crate) sse: m128i }
8  } else if #[cfg(target_feature="simd128")] {
9    use core::arch::wasm32::*;
10
11    #[derive(Clone, Copy)]
12    #[repr(transparent)]
13    pub struct i16x8 { pub(crate) simd: v128 }
14
15    impl Default for i16x8 {
16      fn default() -> Self {
17        Self::splat(0)
18      }
19    }
20
21    impl PartialEq for i16x8 {
22      fn eq(&self, other: &Self) -> bool {
23        u16x8_all_true(i16x8_eq(self.simd, other.simd))
24      }
25    }
26
27    impl Eq for i16x8 { }
28  } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
29    use core::arch::aarch64::*;
30    #[repr(C)]
31    #[derive(Copy, Clone)]
32    pub struct i16x8 { pub(crate) neon : int16x8_t }
33
34    impl Default for i16x8 {
35      #[inline]
36      fn default() -> Self {
37        Self::splat(0)
38      }
39    }
40
41    impl PartialEq for i16x8 {
42      #[inline]
43      fn eq(&self, other: &Self) -> bool {
44        unsafe { vminvq_u16(vceqq_s16(self.neon, other.neon))==u16::MAX }
45      }
46    }
47
48    impl Eq for i16x8 { }
49  } else {
50    #[derive(Default, Clone, Copy, PartialEq, Eq)]
51    #[repr(C, align(16))]
52    pub struct i16x8 { pub(crate) arr: [i16;8] }
53  }
54}
55
// Invokes the crate's shared constant macro with the lane type, lane count,
// vector type and bit width. NOTE(review): the macro is defined elsewhere —
// confirm exactly which items it generates.
int_uint_consts!(i16, 8, i16x8, 128);

// SAFETY (as relied on here): every backing field used for `i16x8` (`m128i`,
// `v128`, `int16x8_t`, `[i16; 8]`) is 128 bits of plain integer data, so the
// all-zero pattern and any other bit pattern are presumed valid and there is
// no padding — TODO confirm for the wrapper types declared outside this file.
unsafe impl Zeroable for i16x8 {}
unsafe impl Pod for i16x8 {}

// Declares `i16` as the element type for the crate's `AlignTo` trait.
impl AlignTo for i16x8 {
  type Elem = i16;
}
64
65impl Add for i16x8 {
66  type Output = Self;
67  #[inline]
68  fn add(self, rhs: Self) -> Self::Output {
69    pick! {
70      if #[cfg(target_feature="sse2")] {
71        Self { sse: add_i16_m128i(self.sse, rhs.sse) }
72      } else if #[cfg(target_feature="simd128")] {
73        Self { simd: i16x8_add(self.simd, rhs.simd) }
74      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
75        unsafe { Self { neon: vaddq_s16(self.neon, rhs.neon) } }
76      } else {
77        Self { arr: [
78          self.arr[0].wrapping_add(rhs.arr[0]),
79          self.arr[1].wrapping_add(rhs.arr[1]),
80          self.arr[2].wrapping_add(rhs.arr[2]),
81          self.arr[3].wrapping_add(rhs.arr[3]),
82          self.arr[4].wrapping_add(rhs.arr[4]),
83          self.arr[5].wrapping_add(rhs.arr[5]),
84          self.arr[6].wrapping_add(rhs.arr[6]),
85          self.arr[7].wrapping_add(rhs.arr[7]),
86        ]}
87      }
88    }
89  }
90}
91
92impl Sub for i16x8 {
93  type Output = Self;
94  #[inline]
95  fn sub(self, rhs: Self) -> Self::Output {
96    pick! {
97      if #[cfg(target_feature="sse2")] {
98        Self { sse: sub_i16_m128i(self.sse, rhs.sse) }
99      } else if #[cfg(target_feature="simd128")] {
100        Self { simd: i16x8_sub(self.simd, rhs.simd) }
101      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
102        unsafe {Self { neon: vsubq_s16(self.neon, rhs.neon) }}
103      } else {
104        Self { arr: [
105          self.arr[0].wrapping_sub(rhs.arr[0]),
106          self.arr[1].wrapping_sub(rhs.arr[1]),
107          self.arr[2].wrapping_sub(rhs.arr[2]),
108          self.arr[3].wrapping_sub(rhs.arr[3]),
109          self.arr[4].wrapping_sub(rhs.arr[4]),
110          self.arr[5].wrapping_sub(rhs.arr[5]),
111          self.arr[6].wrapping_sub(rhs.arr[6]),
112          self.arr[7].wrapping_sub(rhs.arr[7]),
113        ]}
114      }
115    }
116  }
117}
118
119impl Mul for i16x8 {
120  type Output = Self;
121  #[inline]
122  fn mul(self, rhs: Self) -> Self::Output {
123    pick! {
124      if #[cfg(target_feature="sse2")] {
125        Self { sse: mul_i16_keep_low_m128i(self.sse, rhs.sse) }
126      } else if #[cfg(target_feature="simd128")] {
127        Self { simd: i16x8_mul(self.simd, rhs.simd) }
128      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
129        unsafe {Self { neon: vmulq_s16(self.neon, rhs.neon) }}
130      } else {
131        Self { arr: [
132          self.arr[0].wrapping_mul(rhs.arr[0]),
133          self.arr[1].wrapping_mul(rhs.arr[1]),
134          self.arr[2].wrapping_mul(rhs.arr[2]),
135          self.arr[3].wrapping_mul(rhs.arr[3]),
136          self.arr[4].wrapping_mul(rhs.arr[4]),
137          self.arr[5].wrapping_mul(rhs.arr[5]),
138          self.arr[6].wrapping_mul(rhs.arr[6]),
139          self.arr[7].wrapping_mul(rhs.arr[7]),
140        ]}
141      }
142    }
143  }
144}
145
146impl Add<i16> for i16x8 {
147  type Output = Self;
148  #[inline]
149  fn add(self, rhs: i16) -> Self::Output {
150    self.add(Self::splat(rhs))
151  }
152}
153
154impl Sub<i16> for i16x8 {
155  type Output = Self;
156  #[inline]
157  fn sub(self, rhs: i16) -> Self::Output {
158    self.sub(Self::splat(rhs))
159  }
160}
161
162impl Mul<i16> for i16x8 {
163  type Output = Self;
164  #[inline]
165  fn mul(self, rhs: i16) -> Self::Output {
166    self.mul(Self::splat(rhs))
167  }
168}
169
170impl Add<i16x8> for i16 {
171  type Output = i16x8;
172  #[inline]
173  fn add(self, rhs: i16x8) -> Self::Output {
174    i16x8::splat(self).add(rhs)
175  }
176}
177
178impl Sub<i16x8> for i16 {
179  type Output = i16x8;
180  #[inline]
181  fn sub(self, rhs: i16x8) -> Self::Output {
182    i16x8::splat(self).sub(rhs)
183  }
184}
185
186impl Mul<i16x8> for i16 {
187  type Output = i16x8;
188  #[inline]
189  fn mul(self, rhs: i16x8) -> Self::Output {
190    i16x8::splat(self).mul(rhs)
191  }
192}
193
194impl BitAnd for i16x8 {
195  type Output = Self;
196  #[inline]
197  fn bitand(self, rhs: Self) -> Self::Output {
198    pick! {
199      if #[cfg(target_feature="sse2")] {
200        Self { sse: bitand_m128i(self.sse, rhs.sse) }
201      } else if #[cfg(target_feature="simd128")] {
202        Self { simd: v128_and(self.simd, rhs.simd) }
203      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
204        unsafe {Self { neon: vandq_s16(self.neon, rhs.neon) }}
205      } else {
206        Self { arr: [
207          self.arr[0].bitand(rhs.arr[0]),
208          self.arr[1].bitand(rhs.arr[1]),
209          self.arr[2].bitand(rhs.arr[2]),
210          self.arr[3].bitand(rhs.arr[3]),
211          self.arr[4].bitand(rhs.arr[4]),
212          self.arr[5].bitand(rhs.arr[5]),
213          self.arr[6].bitand(rhs.arr[6]),
214          self.arr[7].bitand(rhs.arr[7]),
215        ]}
216      }
217    }
218  }
219}
220
221impl BitOr for i16x8 {
222  type Output = Self;
223  #[inline]
224  fn bitor(self, rhs: Self) -> Self::Output {
225    pick! {
226      if #[cfg(target_feature="sse2")] {
227        Self { sse: bitor_m128i(self.sse, rhs.sse) }
228      } else if #[cfg(target_feature="simd128")] {
229        Self { simd: v128_or(self.simd, rhs.simd) }
230      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
231        unsafe {Self { neon: vorrq_s16(self.neon, rhs.neon) }}
232      } else {
233        Self { arr: [
234          self.arr[0].bitor(rhs.arr[0]),
235          self.arr[1].bitor(rhs.arr[1]),
236          self.arr[2].bitor(rhs.arr[2]),
237          self.arr[3].bitor(rhs.arr[3]),
238          self.arr[4].bitor(rhs.arr[4]),
239          self.arr[5].bitor(rhs.arr[5]),
240          self.arr[6].bitor(rhs.arr[6]),
241          self.arr[7].bitor(rhs.arr[7]),
242        ]}
243      }
244    }
245  }
246}
247
248impl BitXor for i16x8 {
249  type Output = Self;
250  #[inline]
251  fn bitxor(self, rhs: Self) -> Self::Output {
252    pick! {
253      if #[cfg(target_feature="sse2")] {
254        Self { sse: bitxor_m128i(self.sse, rhs.sse) }
255      } else if #[cfg(target_feature="simd128")] {
256        Self { simd: v128_xor(self.simd, rhs.simd) }
257      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
258        unsafe {Self { neon: veorq_s16(self.neon, rhs.neon) }}
259      } else {
260        Self { arr: [
261          self.arr[0].bitxor(rhs.arr[0]),
262          self.arr[1].bitxor(rhs.arr[1]),
263          self.arr[2].bitxor(rhs.arr[2]),
264          self.arr[3].bitxor(rhs.arr[3]),
265          self.arr[4].bitxor(rhs.arr[4]),
266          self.arr[5].bitxor(rhs.arr[5]),
267          self.arr[6].bitxor(rhs.arr[6]),
268          self.arr[7].bitxor(rhs.arr[7]),
269        ]}
270      }
271    }
272  }
273}
274
275macro_rules! impl_shl_t_for_i16x8 {
276  ($($shift_type:ty),+ $(,)?) => {
277    $(impl Shl<$shift_type> for i16x8 {
278      type Output = Self;
279      /// Shifts all lanes by the value given.
280      #[inline]
281      fn shl(self, rhs: $shift_type) -> Self::Output {
282        pick! {
283          if #[cfg(target_feature="sse2")] {
284            let shift = cast([rhs as u64, 0]);
285            Self { sse: shl_all_u16_m128i(self.sse, shift) }
286          } else if #[cfg(target_feature="simd128")] {
287            Self { simd: i16x8_shl(self.simd, rhs as u32) }
288          } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
289            unsafe {Self { neon: vshlq_s16(self.neon, vmovq_n_s16(rhs as i16)) }}
290          } else {
291            let u = rhs as u32;
292            Self { arr: [
293              self.arr[0].wrapping_shl(u),
294              self.arr[1].wrapping_shl(u),
295              self.arr[2].wrapping_shl(u),
296              self.arr[3].wrapping_shl(u),
297              self.arr[4].wrapping_shl(u),
298              self.arr[5].wrapping_shl(u),
299              self.arr[6].wrapping_shl(u),
300              self.arr[7].wrapping_shl(u),
301            ]}
302          }
303        }
304      }
305    })+
306  };
307}
308impl_shl_t_for_i16x8!(i8, u8, i16, u16, i32, u32, i64, u64, i128, u128);
309
310macro_rules! impl_shr_t_for_i16x8 {
311  ($($shift_type:ty),+ $(,)?) => {
312    $(impl Shr<$shift_type> for i16x8 {
313      type Output = Self;
314      /// Shifts all lanes by the value given.
315      #[inline]
316      fn shr(self, rhs: $shift_type) -> Self::Output {
317        pick! {
318          if #[cfg(target_feature="sse2")] {
319            let shift = cast([rhs as u64, 0]);
320            Self { sse: shr_all_i16_m128i(self.sse, shift) }
321          } else if #[cfg(target_feature="simd128")] {
322            Self { simd: i16x8_shr(self.simd, rhs as u32) }
323          } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
324            unsafe {Self { neon: vshlq_s16(self.neon, vmovq_n_s16( -(rhs as i16))) }}
325          } else {
326            let u = rhs as u32;
327            Self { arr: [
328              self.arr[0].wrapping_shr(u),
329              self.arr[1].wrapping_shr(u),
330              self.arr[2].wrapping_shr(u),
331              self.arr[3].wrapping_shr(u),
332              self.arr[4].wrapping_shr(u),
333              self.arr[5].wrapping_shr(u),
334              self.arr[6].wrapping_shr(u),
335              self.arr[7].wrapping_shr(u),
336            ]}
337          }
338        }
339      }
340    })+
341  };
342}
343impl_shr_t_for_i16x8!(i8, u8, i16, u16, i32, u32, i64, u64, i128, u128);
344
345impl CmpEq for i16x8 {
346  type Output = Self;
347  #[inline]
348  fn simd_eq(self, rhs: Self) -> Self::Output {
349    pick! {
350      if #[cfg(target_feature="sse2")] {
351        Self { sse: cmp_eq_mask_i16_m128i(self.sse, rhs.sse) }
352      } else if #[cfg(target_feature="simd128")] {
353        Self { simd: i16x8_eq(self.simd, rhs.simd) }
354      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
355        unsafe {Self { neon: vreinterpretq_s16_u16(vceqq_s16(self.neon, rhs.neon)) }}
356      } else {
357        Self { arr: [
358          if self.arr[0] == rhs.arr[0] { -1 } else { 0 },
359          if self.arr[1] == rhs.arr[1] { -1 } else { 0 },
360          if self.arr[2] == rhs.arr[2] { -1 } else { 0 },
361          if self.arr[3] == rhs.arr[3] { -1 } else { 0 },
362          if self.arr[4] == rhs.arr[4] { -1 } else { 0 },
363          if self.arr[5] == rhs.arr[5] { -1 } else { 0 },
364          if self.arr[6] == rhs.arr[6] { -1 } else { 0 },
365          if self.arr[7] == rhs.arr[7] { -1 } else { 0 },
366        ]}
367      }
368    }
369  }
370}
371
372impl CmpGt for i16x8 {
373  type Output = Self;
374  #[inline]
375  fn simd_gt(self, rhs: Self) -> Self::Output {
376    pick! {
377      if #[cfg(target_feature="sse2")] {
378        Self { sse: cmp_gt_mask_i16_m128i(self.sse, rhs.sse) }
379      } else if #[cfg(target_feature="simd128")] {
380        Self { simd: i16x8_gt(self.simd, rhs.simd) }
381      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
382        unsafe {Self { neon: vreinterpretq_s16_u16(vcgtq_s16(self.neon, rhs.neon)) }}
383      } else {
384        Self { arr: [
385          if self.arr[0] > rhs.arr[0] { -1 } else { 0 },
386          if self.arr[1] > rhs.arr[1] { -1 } else { 0 },
387          if self.arr[2] > rhs.arr[2] { -1 } else { 0 },
388          if self.arr[3] > rhs.arr[3] { -1 } else { 0 },
389          if self.arr[4] > rhs.arr[4] { -1 } else { 0 },
390          if self.arr[5] > rhs.arr[5] { -1 } else { 0 },
391          if self.arr[6] > rhs.arr[6] { -1 } else { 0 },
392          if self.arr[7] > rhs.arr[7] { -1 } else { 0 },
393        ]}
394      }
395    }
396  }
397}
398
399impl CmpLt for i16x8 {
400  type Output = Self;
401  #[inline]
402  fn simd_lt(self, rhs: Self) -> Self::Output {
403    pick! {
404      if #[cfg(target_feature="sse2")] {
405        Self { sse: cmp_lt_mask_i16_m128i(self.sse, rhs.sse) }
406      } else if #[cfg(target_feature="simd128")] {
407        Self { simd: i16x8_lt(self.simd, rhs.simd) }
408      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
409        unsafe {Self { neon: vreinterpretq_s16_u16(vcltq_s16(self.neon, rhs.neon)) }}
410      } else {
411        Self { arr: [
412          if self.arr[0] < rhs.arr[0] { -1 } else { 0 },
413          if self.arr[1] < rhs.arr[1] { -1 } else { 0 },
414          if self.arr[2] < rhs.arr[2] { -1 } else { 0 },
415          if self.arr[3] < rhs.arr[3] { -1 } else { 0 },
416          if self.arr[4] < rhs.arr[4] { -1 } else { 0 },
417          if self.arr[5] < rhs.arr[5] { -1 } else { 0 },
418          if self.arr[6] < rhs.arr[6] { -1 } else { 0 },
419          if self.arr[7] < rhs.arr[7] { -1 } else { 0 },
420        ]}
421      }
422    }
423  }
424}
425
426impl i16x8 {
427  #[inline]
428  #[must_use]
429  pub const fn new(array: [i16; 8]) -> Self {
430    unsafe { core::mem::transmute(array) }
431  }
432
433  #[inline]
434  #[must_use]
435  #[doc(alias("movemask", "move_mask"))]
436  pub fn to_bitmask(self) -> u32 {
437    pick! {
438      if #[cfg(target_feature="sse2")] {
439        (move_mask_i8_m128i( pack_i16_to_i8_m128i(self.sse,self.sse)) as u32) & 0xff
440      } else if #[cfg(target_feature="simd128")] {
441        i16x8_bitmask(self.simd) as u32
442      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
443        unsafe
444        {
445          // set all to 1 if top bit is set, else 0
446          let masked = vcltq_s16(self.neon, vdupq_n_s16(0));
447
448          // select the right bit out of each lane
449          let selectbit : uint16x8_t = core::mem::transmute([1u16, 2, 4, 8, 16, 32, 64, 128]);
450          let r = vandq_u16(masked, selectbit);
451
452          // horizontally add the 16-bit lanes
453          vaddvq_u16(r) as u32
454         }
455       } else {
456        ((self.arr[0] < 0) as u32) << 0 |
457        ((self.arr[1] < 0) as u32) << 1 |
458        ((self.arr[2] < 0) as u32) << 2 |
459        ((self.arr[3] < 0) as u32) << 3 |
460        ((self.arr[4] < 0) as u32) << 4 |
461        ((self.arr[5] < 0) as u32) << 5 |
462        ((self.arr[6] < 0) as u32) << 6 |
463        ((self.arr[7] < 0) as u32) << 7
464      }
465    }
466  }
467
468  #[inline]
469  #[must_use]
470  pub fn any(self) -> bool {
471    pick! {
472      if #[cfg(target_feature="sse2")] {
473        (move_mask_i8_m128i(self.sse) & 0b1010101010101010) != 0
474      } else if #[cfg(target_feature="simd128")] {
475        u16x8_bitmask(self.simd) != 0
476      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
477        unsafe {
478          vminvq_s16(self.neon) < 0
479        }
480      } else {
481        let v : [u64;2] = cast(self);
482        ((v[0] | v[1]) & 0x8000800080008000) != 0
483      }
484    }
485  }
486
487  #[inline]
488  #[must_use]
489  pub fn all(self) -> bool {
490    pick! {
491      if #[cfg(target_feature="sse2")] {
492        (move_mask_i8_m128i(self.sse) & 0b1010101010101010) == 0b1010101010101010
493      } else if #[cfg(target_feature="simd128")] {
494        u16x8_bitmask(self.simd) == 0b11111111
495      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
496        unsafe {
497          vmaxvq_s16(self.neon) < 0
498        }
499      } else {
500        let v : [u64;2] = cast(self);
501        (v[0] & v[1] & 0x8000800080008000) == 0x8000800080008000
502      }
503    }
504  }
505
506  #[inline]
507  #[must_use]
508  pub fn none(self) -> bool {
509    !self.any()
510  }
511
512  /// Unpack the lower half of the input and expand it to `i16` values.
513  #[inline]
514  #[must_use]
515  pub fn from_u8x16_low(u: u8x16) -> Self {
516    pick! {
517      if #[cfg(target_feature="sse2")] {
518        Self{ sse: unpack_low_i8_m128i(u.sse, m128i::zeroed()) }
519      } else {
520        let u_arr: [u8; 16] = cast(u);
521        cast([
522          u_arr[0] as u16 as i16,
523          u_arr[1] as u16 as i16,
524          u_arr[2] as u16 as i16,
525          u_arr[3] as u16 as i16,
526          u_arr[4] as u16 as i16,
527          u_arr[5] as u16 as i16,
528          u_arr[6] as u16 as i16,
529          u_arr[7] as u16 as i16,
530        ])
531      }
532    }
533  }
534
535  /// Unpack the upper half of the input and expand it to `i16` values.
536  #[inline]
537  #[must_use]
538  pub fn from_u8x16_high(u: u8x16) -> Self {
539    pick! {
540      if #[cfg(target_feature="sse2")] {
541        Self{ sse: unpack_high_i8_m128i(u.sse, m128i::zeroed()) }
542      } else {
543        let u_arr: [u8; 16] = cast(u);
544        cast([
545          u_arr[8] as u16 as i16,
546          u_arr[9] as u16 as i16,
547          u_arr[10] as u16 as i16,
548          u_arr[11] as u16 as i16,
549          u_arr[12] as u16 as i16,
550          u_arr[13] as u16 as i16,
551          u_arr[14] as u16 as i16,
552          u_arr[15] as u16 as i16,
553        ])
554      }
555    }
556  }
557
558  /// returns low `i16` of `i32`, saturating values that are too large
559  #[inline]
560  #[must_use]
561  pub fn from_i32x8_saturate(v: i32x8) -> Self {
562    pick! {
563      if #[cfg(target_feature="avx2")] {
564        i16x8 { sse: pack_i32_to_i16_m128i( extract_m128i_from_m256i::<0>(v.avx2), extract_m128i_from_m256i::<1>(v.avx2))  }
565      } else if #[cfg(target_feature="sse2")] {
566        i16x8 { sse: pack_i32_to_i16_m128i( v.a.sse, v.b.sse ) }
567      } else if #[cfg(target_feature="simd128")] {
568        use core::arch::wasm32::*;
569
570        i16x8 { simd: i16x8_narrow_i32x4(v.a.simd, v.b.simd) }
571      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
572        use core::arch::aarch64::*;
573
574        unsafe {
575          i16x8 { neon: vcombine_s16(vqmovn_s32(v.a.neon), vqmovn_s32(v.b.neon)) }
576        }
577      } else {
578        fn clamp(a : i32) -> i16 {
579            if a < i16::MIN as i32 {
580                i16::MIN
581            }
582            else if a > i16::MAX as i32 {
583                i16::MAX
584            } else {
585                a as i16
586            }
587        }
588
589        i16x8::new([
590          clamp(v.as_array()[0]),
591          clamp(v.as_array()[1]),
592          clamp(v.as_array()[2]),
593          clamp(v.as_array()[3]),
594          clamp(v.as_array()[4]),
595          clamp(v.as_array()[5]),
596          clamp(v.as_array()[6]),
597          clamp(v.as_array()[7]),
598        ])
599      }
600    }
601  }
602
603  /// returns low `i16` of `i32`, truncating the upper bits if they are set
604  #[inline]
605  #[must_use]
606  pub fn from_i32x8_truncate(v: i32x8) -> Self {
607    pick! {
608      if #[cfg(target_feature="avx2")] {
609        let a = v.avx2.bitand(set_splat_i32_m256i(0xffff));
610        i16x8 { sse: pack_i32_to_u16_m128i( extract_m128i_from_m256i::<0>(a), extract_m128i_from_m256i::<1>(a) ) }
611      } else if #[cfg(target_feature="sse2")] {
612        let a = shr_imm_i32_m128i::<16>(shl_imm_u32_m128i::<16>(v.a.sse));
613        let b = shr_imm_i32_m128i::<16>(shl_imm_u32_m128i::<16>(v.b.sse));
614
615        i16x8 { sse: pack_i32_to_i16_m128i( a, b)  }
616      } else {
617      i16x8::new([
618        v.as_array()[0] as i16,
619        v.as_array()[1] as i16,
620        v.as_array()[2] as i16,
621        v.as_array()[3] as i16,
622        v.as_array()[4] as i16,
623        v.as_array()[5] as i16,
624        v.as_array()[6] as i16,
625        v.as_array()[7] as i16,
626      ])
627      }
628    }
629  }
630
631  #[inline]
632  #[must_use]
633  pub fn from_slice_unaligned(input: &[i16]) -> Self {
634    assert!(input.len() >= 8);
635
636    pick! {
637      if #[cfg(target_feature="sse2")] {
638        unsafe { Self { sse: load_unaligned_m128i( &*(input.as_ptr() as * const [u8;16]) ) } }
639      } else if #[cfg(target_feature="simd128")] {
640        unsafe { Self { simd: v128_load(input.as_ptr() as *const v128 ) } }
641      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
642        unsafe { Self { neon: vld1q_s16( input.as_ptr() as *const i16 ) } }
643      } else {
644        // 2018 edition doesn't have try_into
645        unsafe { Self::new( *(input.as_ptr() as * const [i16;8]) ) }
646      }
647    }
648  }
649
650  #[inline]
651  #[must_use]
652  pub fn blend(self, t: Self, f: Self) -> Self {
653    pick! {
654      if #[cfg(target_feature="sse4.1")] {
655        Self { sse: blend_varying_i8_m128i(f.sse, t.sse, self.sse) }
656      } else if #[cfg(target_feature="simd128")] {
657        Self { simd: v128_bitselect(t.simd, f.simd, self.simd) }
658      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
659        unsafe {Self { neon: vbslq_s16(vreinterpretq_u16_s16(self.neon), t.neon, f.neon) }}
660      } else {
661        generic_bit_blend(self, t, f)
662      }
663    }
664  }
665  #[inline]
666  #[must_use]
667  pub fn is_negative(self) -> Self {
668    self.simd_lt(Self::zeroed())
669  }
670
671  /// horizontal add of all the elements of the vector
672  #[inline]
673  #[must_use]
674  pub fn reduce_add(self) -> i16 {
675    pick! {
676      if #[cfg(target_feature="sse2")] {
677        // there is a horizontal add instruction on ssse3, but apparently it is very slow on some AMD CPUs
678        let hi64 = shuffle_ai_f32_all_m128i::<0b01_00_11_10>(self.sse);
679        let sum64 = add_i16_m128i(self.sse, hi64);
680        let hi32 = shuffle_ai_f32_all_m128i::<0b11_10_00_01>(sum64);
681        let sum32 = add_i16_m128i(sum64, hi32);
682        let lo16 = shr_imm_u32_m128i::<16>(sum32);
683        let sum16 = add_i16_m128i(sum32, lo16);
684        extract_i16_as_i32_m128i::<0>(sum16) as i16
685      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
686        unsafe { vaddvq_s16(self.neon) }
687      } else {
688        let arr: [i16; 8] = cast(self);
689
690        // most boring implementation possible so optimizer doesn't overthink this
691        let mut r = arr[0];
692        r = r.wrapping_add(arr[1]);
693        r = r.wrapping_add(arr[2]);
694        r = r.wrapping_add(arr[3]);
695        r = r.wrapping_add(arr[4]);
696        r = r.wrapping_add(arr[5]);
697        r = r.wrapping_add(arr[6]);
698        r.wrapping_add(arr[7])
699      }
700    }
701  }
702
703  /// horizontal min of all the elements of the vector
704  #[inline]
705  #[must_use]
706  pub fn reduce_min(self) -> i16 {
707    pick! {
708        if #[cfg(target_feature="sse2")] {
709          let hi64 = shuffle_ai_f32_all_m128i::<0b01_00_11_10>(self.sse);
710          let sum64 = min_i16_m128i(self.sse, hi64);
711          let hi32 = shuffle_ai_f32_all_m128i::<0b11_10_00_01>(sum64);
712          let sum32 = min_i16_m128i(sum64, hi32);
713          let lo16 = shr_imm_u32_m128i::<16>(sum32);
714          let sum16 = min_i16_m128i(sum32, lo16);
715          extract_i16_as_i32_m128i::<0>(sum16) as i16
716        } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
717          unsafe { vminvq_s16(self.neon) }
718        } else {
719        let arr: [i16; 8] = cast(self);
720
721        // most boring implementation possible so optimizer doesn't overthink this
722        let mut r = arr[0];
723        r = r.min(arr[1]);
724        r = r.min(arr[2]);
725        r = r.min(arr[3]);
726        r = r.min(arr[4]);
727        r = r.min(arr[5]);
728        r = r.min(arr[6]);
729        r.min(arr[7])
730      }
731    }
732  }
733
734  /// horizontal max of all the elements of the vector
735  #[inline]
736  #[must_use]
737  pub fn reduce_max(self) -> i16 {
738    pick! {
739        if #[cfg(target_feature="sse2")] {
740          let hi64 = shuffle_ai_f32_all_m128i::<0b01_00_11_10>(self.sse);
741          let sum64 = max_i16_m128i(self.sse, hi64);
742          let hi32 = shuffle_ai_f32_all_m128i::<0b11_10_00_01>(sum64);
743          let sum32 = max_i16_m128i(sum64, hi32);
744          let lo16 = shr_imm_u32_m128i::<16>(sum32);
745          let sum16 = max_i16_m128i(sum32, lo16);
746          extract_i16_as_i32_m128i::<0>(sum16) as i16
747        } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
748          unsafe { vmaxvq_s16(self.neon) }
749        } else {
750        let arr: [i16; 8] = cast(self);
751
752        // most boring implementation possible so optimizer doesn't overthink this
753        let mut r = arr[0];
754        r = r.max(arr[1]);
755        r = r.max(arr[2]);
756        r = r.max(arr[3]);
757        r = r.max(arr[4]);
758        r = r.max(arr[5]);
759        r = r.max(arr[6]);
760        r.max(arr[7])
761      }
762    }
763  }
764
765  #[inline]
766  #[must_use]
767  pub fn abs(self) -> Self {
768    pick! {
769      if #[cfg(target_feature="sse2")] {
770        let mask = shr_imm_i16_m128i::<15>(self.sse);
771        Self { sse: bitxor_m128i(add_i16_m128i(self.sse, mask), mask) }
772      } else if #[cfg(target_feature="ssse3")] {
773        Self { sse: abs_i16_m128i(self.sse) }
774      } else if #[cfg(target_feature="simd128")] {
775        Self { simd: i16x8_abs(self.simd) }
776      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
777        unsafe {Self { neon: vabsq_s16(self.neon) }}
778      } else {
779        let arr: [i16; 8] = cast(self);
780        cast(
781          [
782            arr[0].wrapping_abs(),
783            arr[1].wrapping_abs(),
784            arr[2].wrapping_abs(),
785            arr[3].wrapping_abs(),
786            arr[4].wrapping_abs(),
787            arr[5].wrapping_abs(),
788            arr[6].wrapping_abs(),
789            arr[7].wrapping_abs(),
790          ])
791      }
792    }
793  }
794
795  #[inline]
796  #[must_use]
797  pub fn unsigned_abs(self) -> u16x8 {
798    pick! {
799      if #[cfg(target_feature="sse2")] {
800        let mask = shr_imm_i16_m128i::<15>(self.sse);
801        u16x8 { sse: bitxor_m128i(add_i16_m128i(self.sse, mask), mask) }
802      } else if #[cfg(target_feature="ssse3")] {
803        u16x8 { sse: abs_i16_m128i(self.sse) }
804      } else if #[cfg(target_feature="simd128")] {
805        u16x8 { simd: i16x8_abs(self.simd) }
806      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
807        unsafe {u16x8 { neon: vreinterpretq_u16_s16(vabsq_s16(self.neon)) }}
808      } else {
809        let arr: [i16; 8] = cast(self);
810        cast(
811          [
812            arr[0].unsigned_abs(),
813            arr[1].unsigned_abs(),
814            arr[2].unsigned_abs(),
815            arr[3].unsigned_abs(),
816            arr[4].unsigned_abs(),
817            arr[5].unsigned_abs(),
818            arr[6].unsigned_abs(),
819            arr[7].unsigned_abs(),
820          ])
821      }
822    }
823  }
824
825  #[inline]
826  #[must_use]
827  pub fn max(self, rhs: Self) -> Self {
828    pick! {
829      if #[cfg(target_feature="sse2")] {
830        Self { sse: max_i16_m128i(self.sse, rhs.sse) }
831      } else if #[cfg(target_feature="simd128")] {
832        Self { simd: i16x8_max(self.simd, rhs.simd) }
833      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
834        unsafe {Self { neon: vmaxq_s16(self.neon, rhs.neon) }}
835      } else {
836        self.simd_lt(rhs).blend(rhs, self)
837      }
838    }
839  }
840  #[inline]
841  #[must_use]
842  pub fn min(self, rhs: Self) -> Self {
843    pick! {
844      if #[cfg(target_feature="sse2")] {
845        Self { sse: min_i16_m128i(self.sse, rhs.sse) }
846      } else if #[cfg(target_feature="simd128")] {
847        Self { simd: i16x8_min(self.simd, rhs.simd) }
848      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
849        unsafe {Self { neon: vminq_s16(self.neon, rhs.neon) }}
850      } else {
851        self.simd_lt(rhs).blend(self, rhs)
852      }
853    }
854  }
855
856  #[inline]
857  #[must_use]
858  pub fn saturating_add(self, rhs: Self) -> Self {
859    pick! {
860      if #[cfg(target_feature="sse2")] {
861        Self { sse: add_saturating_i16_m128i(self.sse, rhs.sse) }
862      } else if #[cfg(target_feature="simd128")] {
863        Self { simd: i16x8_add_sat(self.simd, rhs.simd) }
864      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
865        unsafe {Self { neon: vqaddq_s16(self.neon, rhs.neon) }}
866      } else {
867        Self { arr: [
868          self.arr[0].saturating_add(rhs.arr[0]),
869          self.arr[1].saturating_add(rhs.arr[1]),
870          self.arr[2].saturating_add(rhs.arr[2]),
871          self.arr[3].saturating_add(rhs.arr[3]),
872          self.arr[4].saturating_add(rhs.arr[4]),
873          self.arr[5].saturating_add(rhs.arr[5]),
874          self.arr[6].saturating_add(rhs.arr[6]),
875          self.arr[7].saturating_add(rhs.arr[7]),
876        ]}
877      }
878    }
879  }
880  #[inline]
881  #[must_use]
882  pub fn saturating_sub(self, rhs: Self) -> Self {
883    pick! {
884      if #[cfg(target_feature="sse2")] {
885        Self { sse: sub_saturating_i16_m128i(self.sse, rhs.sse) }
886      } else if #[cfg(target_feature="simd128")] {
887        Self { simd: i16x8_sub_sat(self.simd, rhs.simd) }
888      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
889        unsafe { Self { neon: vqsubq_s16(self.neon, rhs.neon) } }
890      } else {
891        Self { arr: [
892          self.arr[0].saturating_sub(rhs.arr[0]),
893          self.arr[1].saturating_sub(rhs.arr[1]),
894          self.arr[2].saturating_sub(rhs.arr[2]),
895          self.arr[3].saturating_sub(rhs.arr[3]),
896          self.arr[4].saturating_sub(rhs.arr[4]),
897          self.arr[5].saturating_sub(rhs.arr[5]),
898          self.arr[6].saturating_sub(rhs.arr[6]),
899          self.arr[7].saturating_sub(rhs.arr[7]),
900        ]}
901      }
902    }
903  }
904
905  /// Calculates partial dot product.
906  /// Multiplies packed signed 16-bit integers, producing intermediate signed
907  /// 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit
908  /// integers.
909  #[inline]
910  #[must_use]
911  pub fn dot(self, rhs: Self) -> i32x4 {
912    pick! {
913      if #[cfg(target_feature="sse2")] {
914        i32x4 { sse:  mul_i16_horizontal_add_m128i(self.sse, rhs.sse) }
915      } else if #[cfg(target_feature="simd128")] {
916        i32x4 { simd: i32x4_dot_i16x8(self.simd, rhs.simd) }
917      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
918        unsafe {
919          let pl = vmull_s16(vget_low_s16(self.neon),  vget_low_s16(rhs.neon));
920          let ph = vmull_high_s16(self.neon, rhs.neon);
921          i32x4 { neon: vpaddq_s32(pl, ph) }
922        }
923      } else {
924        i32x4 { arr: [
925          (i32::from(self.arr[0]) * i32::from(rhs.arr[0])) + (i32::from(self.arr[1]) * i32::from(rhs.arr[1])),
926          (i32::from(self.arr[2]) * i32::from(rhs.arr[2])) + (i32::from(self.arr[3]) * i32::from(rhs.arr[3])),
927          (i32::from(self.arr[4]) * i32::from(rhs.arr[4])) + (i32::from(self.arr[5]) * i32::from(rhs.arr[5])),
928          (i32::from(self.arr[6]) * i32::from(rhs.arr[6])) + (i32::from(self.arr[7]) * i32::from(rhs.arr[7])),
929        ] }
930      }
931    }
932  }
933
934  /// Multiply and scale equivalent to `((self * rhs) + 0x4000) >> 15` on each
935  /// lane, effectively multiplying by a 16 bit fixed point number between `-1`
936  /// and `1`. This corresponds to the following instructions:
937  /// - `vqrdmulhq_s16` instruction on neon
938  /// - `i16x8_q15mulr_sat` on simd128
939  /// - `_mm_mulhrs_epi16` on ssse3
940  /// - emulated via `mul_i16_*` on sse2
941  #[inline]
942  #[must_use]
943  pub fn mul_scale_round(self, rhs: Self) -> Self {
944    pick! {
945      if #[cfg(target_feature="ssse3")] {
946        Self { sse:  mul_i16_scale_round_m128i(self.sse, rhs.sse) }
947      } else if #[cfg(target_feature="sse2")] {
948        // unfortunately mul_i16_scale_round_m128i only got added in sse3
949        let hi = mul_i16_keep_high_m128i(self.sse, rhs.sse);
950        let lo = mul_i16_keep_low_m128i(self.sse, rhs.sse);
951        let mut v1 = unpack_low_i16_m128i(lo, hi);
952        let mut v2 = unpack_high_i16_m128i(lo, hi);
953        let a = set_splat_i32_m128i(0x4000);
954        v1 = shr_imm_i32_m128i::<15>(add_i32_m128i(v1, a));
955        v2 = shr_imm_i32_m128i::<15>(add_i32_m128i(v2, a));
956        let s = pack_i32_to_i16_m128i(v1, v2);
957        Self { sse: s }
958      } else if #[cfg(target_feature="simd128")] {
959        Self { simd: i16x8_q15mulr_sat(self.simd, rhs.simd) }
960      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
961        unsafe { Self { neon: vqrdmulhq_s16(self.neon, rhs.neon) } }
962      } else {
963        // compiler does a surprisingly good job of vectorizing this
964        Self { arr: [
965          ((i32::from(self.arr[0]) * i32::from(rhs.arr[0]) + 0x4000) >> 15) as i16,
966          ((i32::from(self.arr[1]) * i32::from(rhs.arr[1]) + 0x4000) >> 15) as i16,
967          ((i32::from(self.arr[2]) * i32::from(rhs.arr[2]) + 0x4000) >> 15) as i16,
968          ((i32::from(self.arr[3]) * i32::from(rhs.arr[3]) + 0x4000) >> 15) as i16,
969          ((i32::from(self.arr[4]) * i32::from(rhs.arr[4]) + 0x4000) >> 15) as i16,
970          ((i32::from(self.arr[5]) * i32::from(rhs.arr[5]) + 0x4000) >> 15) as i16,
971          ((i32::from(self.arr[6]) * i32::from(rhs.arr[6]) + 0x4000) >> 15) as i16,
972          ((i32::from(self.arr[7]) * i32::from(rhs.arr[7]) + 0x4000) >> 15) as i16,
973        ]}
974      }
975    }
976  }
977
978  /// Multiples two `i16x8` and return the high part of intermediate `i32x8`
979  #[inline]
980  #[must_use]
981  pub fn mul_keep_high(lhs: Self, rhs: Self) -> Self {
982    pick! {
983      if #[cfg(target_feature="sse2")] {
984        Self { sse: mul_i16_keep_high_m128i(lhs.sse, rhs.sse) }
985      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
986        let lhs_low = unsafe { vget_low_s16(lhs.neon) };
987        let rhs_low = unsafe { vget_low_s16(rhs.neon) };
988
989        let lhs_high = unsafe { vget_high_s16(lhs.neon) };
990        let rhs_high = unsafe { vget_high_s16(rhs.neon) };
991
992        let low = unsafe { vmull_s16(lhs_low, rhs_low) };
993        let high = unsafe { vmull_s16(lhs_high, rhs_high) };
994
995        i16x8 { neon: unsafe { vreinterpretq_s16_u16(vuzpq_u16(vreinterpretq_u16_s32(low), vreinterpretq_u16_s32(high)).1) } }
996      } else if #[cfg(target_feature="simd128")] {
997        let low =  i32x4_extmul_low_i16x8(lhs.simd, rhs.simd);
998        let high = i32x4_extmul_high_i16x8(lhs.simd, rhs.simd);
999
1000        Self { simd: i16x8_shuffle::<1, 3, 5, 7, 9, 11, 13, 15>(low, high) }
1001      } else {
1002        i16x8::new([
1003          ((i32::from(rhs.as_array()[0]) * i32::from(lhs.as_array()[0])) >> 16) as i16,
1004          ((i32::from(rhs.as_array()[1]) * i32::from(lhs.as_array()[1])) >> 16) as i16,
1005          ((i32::from(rhs.as_array()[2]) * i32::from(lhs.as_array()[2])) >> 16) as i16,
1006          ((i32::from(rhs.as_array()[3]) * i32::from(lhs.as_array()[3])) >> 16) as i16,
1007          ((i32::from(rhs.as_array()[4]) * i32::from(lhs.as_array()[4])) >> 16) as i16,
1008          ((i32::from(rhs.as_array()[5]) * i32::from(lhs.as_array()[5])) >> 16) as i16,
1009          ((i32::from(rhs.as_array()[6]) * i32::from(lhs.as_array()[6])) >> 16) as i16,
1010          ((i32::from(rhs.as_array()[7]) * i32::from(lhs.as_array()[7])) >> 16) as i16,
1011        ])
1012      }
1013    }
1014  }
1015
1016  /// multiplies two `i16x8` and returns the result as a widened `i32x8`
1017  #[inline]
1018  #[must_use]
1019  pub fn mul_widen(self, rhs: Self) -> i32x8 {
1020    pick! {
1021      if #[cfg(target_feature="avx2")] {
1022        let a = convert_to_i32_m256i_from_i16_m128i(self.sse);
1023        let b = convert_to_i32_m256i_from_i16_m128i(rhs.sse);
1024        i32x8 { avx2: mul_i32_keep_low_m256i(a,b) }
1025      } else if #[cfg(target_feature="sse2")] {
1026         let low = mul_i16_keep_low_m128i(self.sse, rhs.sse);
1027         let high = mul_i16_keep_high_m128i(self.sse, rhs.sse);
1028         i32x8 {
1029          a: i32x4 { sse:unpack_low_i16_m128i(low, high) },
1030          b: i32x4 { sse:unpack_high_i16_m128i(low, high) }
1031        }
1032      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
1033         let lhs_low = unsafe { vget_low_s16(self.neon) };
1034         let rhs_low = unsafe { vget_low_s16(rhs.neon) };
1035
1036         let lhs_high = unsafe { vget_high_s16(self.neon) };
1037         let rhs_high = unsafe { vget_high_s16(rhs.neon) };
1038
1039         let low = unsafe { vmull_s16(lhs_low, rhs_low) };
1040         let high = unsafe { vmull_s16(lhs_high, rhs_high) };
1041
1042         i32x8 { a: i32x4 { neon: low }, b: i32x4 {neon: high } }
1043       } else {
1044        let a = self.as_array();
1045        let b = rhs.as_array();
1046         i32x8::new([
1047           i32::from(a[0]) * i32::from(b[0]),
1048           i32::from(a[1]) * i32::from(b[1]),
1049           i32::from(a[2]) * i32::from(b[2]),
1050           i32::from(a[3]) * i32::from(b[3]),
1051           i32::from(a[4]) * i32::from(b[4]),
1052           i32::from(a[5]) * i32::from(b[5]),
1053           i32::from(a[6]) * i32::from(b[6]),
1054           i32::from(a[7]) * i32::from(b[7]),
1055         ])
1056       }
1057    }
1058  }
1059
1060  /// transpose matrix of 8x8 i16 matrix
1061  #[must_use]
1062  #[inline]
1063  pub fn transpose(data: [i16x8; 8]) -> [i16x8; 8] {
1064    pick! {
1065      if #[cfg(target_feature="sse2")] {
1066        let a1 = unpack_low_i16_m128i(data[0].sse, data[1].sse);
1067        let a2 = unpack_high_i16_m128i(data[0].sse, data[1].sse);
1068        let a3 = unpack_low_i16_m128i(data[2].sse, data[3].sse);
1069        let a4 = unpack_high_i16_m128i(data[2].sse, data[3].sse);
1070        let a5 = unpack_low_i16_m128i(data[4].sse, data[5].sse);
1071        let a6 = unpack_high_i16_m128i(data[4].sse, data[5].sse);
1072        let a7 = unpack_low_i16_m128i(data[6].sse, data[7].sse);
1073        let a8 = unpack_high_i16_m128i(data[6].sse, data[7].sse);
1074
1075        let b1 = unpack_low_i32_m128i(a1, a3);
1076        let b2 = unpack_high_i32_m128i(a1, a3);
1077        let b3 = unpack_low_i32_m128i(a2, a4);
1078        let b4 = unpack_high_i32_m128i(a2, a4);
1079        let b5 = unpack_low_i32_m128i(a5, a7);
1080        let b6 = unpack_high_i32_m128i(a5, a7);
1081        let b7 = unpack_low_i32_m128i(a6, a8);
1082        let b8 = unpack_high_i32_m128i(a6, a8);
1083
1084        [
1085          i16x8 { sse: unpack_low_i64_m128i(b1, b5) },
1086          i16x8 { sse: unpack_high_i64_m128i(b1, b5) },
1087          i16x8 { sse: unpack_low_i64_m128i(b2, b6) },
1088          i16x8 { sse: unpack_high_i64_m128i(b2, b6) },
1089          i16x8 { sse: unpack_low_i64_m128i(b3, b7) },
1090          i16x8 { sse: unpack_high_i64_m128i(b3, b7) },
1091          i16x8 { sse: unpack_low_i64_m128i(b4, b8) },
1092          i16x8 { sse: unpack_high_i64_m128i(b4, b8) } ,
1093        ]
1094     } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
1095
1096          #[inline] fn vtrq32(a : int16x8_t, b : int16x8_t) -> (int16x8_t, int16x8_t)
1097          {
1098              unsafe {
1099                let r = vtrnq_s32(vreinterpretq_s32_s16(a),vreinterpretq_s32_s16(b));
1100                (vreinterpretq_s16_s32(r.0), vreinterpretq_s16_s32(r.1))
1101              }
1102          }
1103
1104        unsafe {
1105          let (q0,q2) = vtrq32(data[0].neon, data[2].neon);
1106          let (q1,q3) = vtrq32(data[1].neon, data[3].neon);
1107          let (q4,q6) = vtrq32(data[4].neon, data[6].neon);
1108          let (q5,q7) = vtrq32(data[5].neon, data[7].neon);
1109
1110          let b1 = vtrnq_s16(q0, q1);
1111          let b2 = vtrnq_s16(q2, q3);
1112          let b3 = vtrnq_s16(q4, q5);
1113          let b4 = vtrnq_s16(q6, q7);
1114
1115          // There is no vtrnq_s64 unfortunately, so there's this mess
1116          // which does a somewhat reasonable job, but not as good as the
1117          // assembly versions which just swap the 64 bit register aliases.
1118          [
1119            i16x8 { neon: vcombine_s16(vget_low_s16(b1.0), vget_low_s16(b3.0)) },
1120            i16x8 { neon: vcombine_s16(vget_low_s16(b1.1), vget_low_s16(b3.1)) },
1121            i16x8 { neon: vcombine_s16(vget_low_s16(b2.0), vget_low_s16(b4.0)) },
1122            i16x8 { neon: vcombine_s16(vget_low_s16(b2.1), vget_low_s16(b4.1)) },
1123            i16x8 { neon: vcombine_s16(vget_high_s16(b1.0), vget_high_s16(b3.0)) },
1124            i16x8 { neon: vcombine_s16(vget_high_s16(b1.1), vget_high_s16(b3.1)) },
1125            i16x8 { neon: vcombine_s16(vget_high_s16(b2.0), vget_high_s16(b4.0)) },
1126            i16x8 { neon: vcombine_s16(vget_high_s16(b2.1), vget_high_s16(b4.1)) },
1127          ]
1128        }
1129      } else if #[cfg(target_feature="simd128")] {
1130        #[inline] fn lo_i16(a : v128, b : v128) -> v128 { i16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(a,b) }
1131        #[inline] fn hi_i16(a : v128, b : v128) -> v128 { i16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(a,b) }
1132        #[inline] fn lo_i32(a : v128, b : v128) -> v128 { i32x4_shuffle::<0, 4, 1, 5>(a,b) }
1133        #[inline] fn hi_i32(a : v128, b : v128) -> v128 { i32x4_shuffle::<2, 6, 3, 7>(a,b) }
1134        #[inline] fn lo_i64(a : v128, b : v128) -> v128 { i64x2_shuffle::<0, 2>(a,b) }
1135        #[inline] fn hi_i64(a : v128, b : v128) -> v128 { i64x2_shuffle::<1, 3>(a,b) }
1136
1137        let a1 = lo_i16(data[0].simd, data[1].simd);
1138        let a2 = hi_i16(data[0].simd, data[1].simd);
1139        let a3 = lo_i16(data[2].simd, data[3].simd);
1140        let a4 = hi_i16(data[2].simd, data[3].simd);
1141        let a5 = lo_i16(data[4].simd, data[5].simd);
1142        let a6 = hi_i16(data[4].simd, data[5].simd);
1143        let a7 = lo_i16(data[6].simd, data[7].simd);
1144        let a8 = hi_i16(data[6].simd, data[7].simd);
1145
1146        let b1 = lo_i32(a1, a3);
1147        let b2 = hi_i32(a1, a3);
1148        let b3 = lo_i32(a2, a4);
1149        let b4 = hi_i32(a2, a4);
1150        let b5 = lo_i32(a5, a7);
1151        let b6 = hi_i32(a5, a7);
1152        let b7 = lo_i32(a6, a8);
1153        let b8 = hi_i32(a6, a8);
1154
1155        [
1156          i16x8 { simd: lo_i64(b1, b5) },
1157          i16x8 { simd: hi_i64(b1, b5) },
1158          i16x8 { simd: lo_i64(b2, b6) },
1159          i16x8 { simd: hi_i64(b2, b6) },
1160          i16x8 { simd: lo_i64(b3, b7) },
1161          i16x8 { simd: hi_i64(b3, b7) },
1162          i16x8 { simd: lo_i64(b4, b8) },
1163          i16x8 { simd: hi_i64(b4, b8) } ,
1164        ]
1165
1166      } else {
1167        #[inline(always)]
1168        fn transpose_column(data: &[i16x8; 8], index: usize) -> i16x8 {
1169          i16x8::new([
1170            data[0].as_array()[index],
1171            data[1].as_array()[index],
1172            data[2].as_array()[index],
1173            data[3].as_array()[index],
1174            data[4].as_array()[index],
1175            data[5].as_array()[index],
1176            data[6].as_array()[index],
1177            data[7].as_array()[index],
1178          ])
1179        }
1180
1181        [
1182          transpose_column(&data, 0),
1183          transpose_column(&data, 1),
1184          transpose_column(&data, 2),
1185          transpose_column(&data, 3),
1186          transpose_column(&data, 4),
1187          transpose_column(&data, 5),
1188          transpose_column(&data, 6),
1189          transpose_column(&data, 7),
1190        ]
1191      }
1192    }
1193  }
1194
1195  #[inline]
1196  #[must_use]
1197  /// Multiply and scale, equivalent to `((self * rhs) + 0x4000) >> 15` on each
1198  /// lane, effectively multiplying by a 16 bit fixed point number between `-1`
1199  /// and `1`. This corresponds to the following instructions:
1200  /// - `vqrdmulhq_n_s16` instruction on neon
1201  /// - `i16x8_q15mulr_sat` on simd128
1202  /// - `_mm_mulhrs_epi16` on ssse3
1203  /// - emulated via `mul_i16_*` on sse2
1204  pub fn mul_scale_round_n(self, rhs: i16) -> Self {
1205    pick! {
1206      if #[cfg(target_feature="ssse3")] {
1207        Self { sse:  mul_i16_scale_round_m128i(self.sse, set_splat_i16_m128i(rhs)) }
1208      } else if #[cfg(target_feature="sse2")] {
1209        // unfortunately mul_i16_scale_round_m128i only got added in sse3
1210        let r = set_splat_i16_m128i(rhs);
1211        let hi = mul_i16_keep_high_m128i(self.sse, r);
1212        let lo = mul_i16_keep_low_m128i(self.sse, r);
1213        let mut v1 = unpack_low_i16_m128i(lo, hi);
1214        let mut v2 = unpack_high_i16_m128i(lo, hi);
1215        let a = set_splat_i32_m128i(0x4000);
1216        v1 = shr_imm_i32_m128i::<15>(add_i32_m128i(v1, a));
1217        v2 = shr_imm_i32_m128i::<15>(add_i32_m128i(v2, a));
1218        let s = pack_i32_to_i16_m128i(v1, v2);
1219        Self { sse: s }
1220      } else if #[cfg(target_feature="simd128")] {
1221        Self { simd: i16x8_q15mulr_sat(self.simd, i16x8_splat(rhs)) }
1222      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
1223        unsafe { Self { neon: vqrdmulhq_n_s16(self.neon, rhs) } }
1224      } else {
1225        // compiler does a surprisingly good job of vectorizing this
1226        Self { arr: [
1227          ((i32::from(self.arr[0]) * i32::from(rhs) + 0x4000) >> 15) as i16,
1228          ((i32::from(self.arr[1]) * i32::from(rhs) + 0x4000) >> 15) as i16,
1229          ((i32::from(self.arr[2]) * i32::from(rhs) + 0x4000) >> 15) as i16,
1230          ((i32::from(self.arr[3]) * i32::from(rhs) + 0x4000) >> 15) as i16,
1231          ((i32::from(self.arr[4]) * i32::from(rhs) + 0x4000) >> 15) as i16,
1232          ((i32::from(self.arr[5]) * i32::from(rhs) + 0x4000) >> 15) as i16,
1233          ((i32::from(self.arr[6]) * i32::from(rhs) + 0x4000) >> 15) as i16,
1234          ((i32::from(self.arr[7]) * i32::from(rhs) + 0x4000) >> 15) as i16,
1235        ]}
1236      }
1237    }
1238  }
1239
1240  #[inline]
1241  pub fn to_array(self) -> [i16; 8] {
1242    cast(self)
1243  }
1244
1245  #[inline]
1246  pub fn as_array(&self) -> &[i16; 8] {
1247    cast_ref(self)
1248  }
1249
1250  #[inline]
1251  pub fn as_mut_array(&mut self) -> &mut [i16; 8] {
1252    cast_mut(self)
1253  }
1254}