wide/f32x4_.rs

1use super::*;
2
3pick! {
4  if #[cfg(target_feature="sse")] {
5    #[derive(Default, Clone, Copy, PartialEq)]
6    #[repr(C, align(16))]
7    pub struct f32x4 { pub(crate) sse: m128 }
8  } else if #[cfg(target_feature="simd128")] {
9    use core::arch::wasm32::*;
10
11    #[derive(Clone, Copy)]
12    #[repr(transparent)]
13    pub struct f32x4 { pub(crate) simd: v128 }
14
15    impl Default for f32x4 {
16      fn default() -> Self {
17        Self::splat(0.0)
18      }
19    }
20
21    impl PartialEq for f32x4 {
22      fn eq(&self, other: &Self) -> bool {
23        u32x4_all_true(f32x4_eq(self.simd, other.simd))
24      }
25    }
26  } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
27    use core::arch::aarch64::*;
28    #[repr(C)]
29    #[derive(Copy, Clone)]
30    pub struct f32x4 { pub(crate) neon : float32x4_t }
31
32    impl Default for f32x4 {
33      #[inline]
34      #[must_use]
35      fn default() -> Self {
36        unsafe { Self { neon: vdupq_n_f32(0.0)} }
37      }
38    }
39
40    impl PartialEq for f32x4 {
41      #[inline]
42      #[must_use]
43      fn eq(&self, other: &Self) -> bool {
        unsafe { vminvq_u32(vceqq_f32(self.neon, other.neon)) == u32::MAX }
      }
    }
  } else {
49    #[derive(Default, Clone, Copy, PartialEq)]
50    #[repr(C, align(16))]
51    pub struct f32x4 { pub(crate) arr: [f32;4] }
52  }
53}
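// Whichever branch `pick!` selects above, `f32x4` is a 16-byte value holding
// four `f32` lanes; only the name of the inner field (`sse`, `simd`, `neon`,
// or `arr`) differs. A quick layout sanity check (illustrative sketch, not
// part of this file):
//
//   assert_eq!(core::mem::size_of::<f32x4>(), 16);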
54
55macro_rules! const_f32_as_f32x4 {
56  ($i:ident, $f:expr) => {
57    #[allow(non_upper_case_globals)]
58    pub const $i: f32x4 =
59      unsafe { ConstUnionHack128bit { f32a4: [$f; 4] }.f32x4 };
60  };
61}
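// For reference, an invocation like `const_f32_as_f32x4!(ONE, 1.0)` expands
// to roughly:
//
//   pub const ONE: f32x4 =
//     unsafe { ConstUnionHack128bit { f32a4: [1.0; 4] }.f32x4 };
//
// The union hack is what lets a splatted constant be produced in a `const`
// context no matter which backend representation is active.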
62
63impl f32x4 {
64  const_f32_as_f32x4!(ONE, 1.0);
65  const_f32_as_f32x4!(ZERO, 0.0);
66  const_f32_as_f32x4!(HALF, 0.5);
67  const_f32_as_f32x4!(E, core::f32::consts::E);
68  const_f32_as_f32x4!(FRAC_1_PI, core::f32::consts::FRAC_1_PI);
69  const_f32_as_f32x4!(FRAC_2_PI, core::f32::consts::FRAC_2_PI);
70  const_f32_as_f32x4!(FRAC_2_SQRT_PI, core::f32::consts::FRAC_2_SQRT_PI);
71  const_f32_as_f32x4!(FRAC_1_SQRT_2, core::f32::consts::FRAC_1_SQRT_2);
72  const_f32_as_f32x4!(FRAC_PI_2, core::f32::consts::FRAC_PI_2);
73  const_f32_as_f32x4!(FRAC_PI_3, core::f32::consts::FRAC_PI_3);
74  const_f32_as_f32x4!(FRAC_PI_4, core::f32::consts::FRAC_PI_4);
75  const_f32_as_f32x4!(FRAC_PI_6, core::f32::consts::FRAC_PI_6);
76  const_f32_as_f32x4!(FRAC_PI_8, core::f32::consts::FRAC_PI_8);
77  const_f32_as_f32x4!(LN_2, core::f32::consts::LN_2);
78  const_f32_as_f32x4!(LN_10, core::f32::consts::LN_10);
79  const_f32_as_f32x4!(LOG2_E, core::f32::consts::LOG2_E);
80  const_f32_as_f32x4!(LOG10_E, core::f32::consts::LOG10_E);
81  const_f32_as_f32x4!(LOG10_2, core::f32::consts::LOG10_2);
82  const_f32_as_f32x4!(LOG2_10, core::f32::consts::LOG2_10);
83  const_f32_as_f32x4!(PI, core::f32::consts::PI);
84  const_f32_as_f32x4!(SQRT_2, core::f32::consts::SQRT_2);
85  const_f32_as_f32x4!(TAU, core::f32::consts::TAU);
86}
87
88unsafe impl Zeroable for f32x4 {}
89unsafe impl Pod for f32x4 {}
90
91impl Add for f32x4 {
92  type Output = Self;
93  #[inline]
94  #[must_use]
95  fn add(self, rhs: Self) -> Self::Output {
96    pick! {
97      if #[cfg(target_feature="sse")] {
98        Self { sse: add_m128(self.sse, rhs.sse) }
99      } else if #[cfg(target_feature="simd128")] {
100        Self { simd: f32x4_add(self.simd, rhs.simd) }
101      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
102        unsafe { Self { neon: vaddq_f32(self.neon, rhs.neon) } }
103      } else {
104        Self { arr: [
105          self.arr[0] + rhs.arr[0],
106          self.arr[1] + rhs.arr[1],
107          self.arr[2] + rhs.arr[2],
108          self.arr[3] + rhs.arr[3],
109        ]}
110      }
111    }
112  }
113}
114
115impl Sub for f32x4 {
116  type Output = Self;
117  #[inline]
118  #[must_use]
119  fn sub(self, rhs: Self) -> Self::Output {
120    pick! {
121      if #[cfg(target_feature="sse")] {
122        Self { sse: sub_m128(self.sse, rhs.sse) }
123      } else if #[cfg(target_feature="simd128")] {
124        Self { simd: f32x4_sub(self.simd, rhs.simd) }
125      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
126        unsafe {Self { neon: vsubq_f32(self.neon, rhs.neon) }}
127      } else {
128        Self { arr: [
129          self.arr[0] - rhs.arr[0],
130          self.arr[1] - rhs.arr[1],
131          self.arr[2] - rhs.arr[2],
132          self.arr[3] - rhs.arr[3],
133        ]}
134      }
135    }
136  }
137}
138
139impl Mul for f32x4 {
140  type Output = Self;
141  #[inline]
142  #[must_use]
143  fn mul(self, rhs: Self) -> Self::Output {
144    pick! {
145      if #[cfg(target_feature="sse")] {
146        Self { sse: mul_m128(self.sse, rhs.sse) }
147      } else if #[cfg(target_feature="simd128")] {
148        Self { simd: f32x4_mul(self.simd, rhs.simd) }
149      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
150        unsafe {Self { neon: vmulq_f32(self.neon, rhs.neon) }}
151      } else {
152        Self { arr: [
153          self.arr[0] * rhs.arr[0],
154          self.arr[1] * rhs.arr[1],
155          self.arr[2] * rhs.arr[2],
156          self.arr[3] * rhs.arr[3],
157        ]}
158      }
159    }
160  }
161}
162
163impl Div for f32x4 {
164  type Output = Self;
165  #[inline]
166  #[must_use]
167  fn div(self, rhs: Self) -> Self::Output {
168    pick! {
169      if #[cfg(target_feature="sse")] {
170        Self { sse: div_m128(self.sse, rhs.sse) }
171      } else if #[cfg(target_feature="simd128")] {
172        Self { simd: f32x4_div(self.simd, rhs.simd) }
173      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
174        unsafe {Self { neon: vdivq_f32(self.neon, rhs.neon) }}
175      } else {
176        Self { arr: [
177          self.arr[0] / rhs.arr[0],
178          self.arr[1] / rhs.arr[1],
179          self.arr[2] / rhs.arr[2],
180          self.arr[3] / rhs.arr[3],
181        ]}
182      }
183    }
184  }
185}
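// All of the operators above work lanewise, e.g. (illustrative sketch, not
// part of this file's tests):
//
//   let a = f32x4::from([1.0, 2.0, 3.0, 4.0]);
//   let b = f32x4::from([4.0, 3.0, 2.0, 1.0]);
//   assert_eq!((a + b).to_array(), [5.0; 4]);
//   assert_eq!((a * b).to_array(), [4.0, 6.0, 6.0, 4.0]);
//
// The scalar `Add<f32>`/`Sub<f32>`/`Mul<f32>`/`Div<f32>` impls below simply
// splat the scalar and defer to these vector-vector impls.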
186
187impl Add<f32> for f32x4 {
188  type Output = Self;
189  #[inline]
190  #[must_use]
191  fn add(self, rhs: f32) -> Self::Output {
192    self.add(Self::splat(rhs))
193  }
194}
195
196impl Sub<f32> for f32x4 {
197  type Output = Self;
198  #[inline]
199  #[must_use]
200  fn sub(self, rhs: f32) -> Self::Output {
201    self.sub(Self::splat(rhs))
202  }
203}
204
205impl Mul<f32> for f32x4 {
206  type Output = Self;
207  #[inline]
208  #[must_use]
209  fn mul(self, rhs: f32) -> Self::Output {
210    self.mul(Self::splat(rhs))
211  }
212}
213
214impl Div<f32> for f32x4 {
215  type Output = Self;
216  #[inline]
217  #[must_use]
218  fn div(self, rhs: f32) -> Self::Output {
219    self.div(Self::splat(rhs))
220  }
221}
222
223impl Add<f32x4> for f32 {
224  type Output = f32x4;
225  #[inline]
226  #[must_use]
227  fn add(self, rhs: f32x4) -> Self::Output {
228    f32x4::splat(self).add(rhs)
229  }
230}
231
232impl Sub<f32x4> for f32 {
233  type Output = f32x4;
234  #[inline]
235  #[must_use]
236  fn sub(self, rhs: f32x4) -> Self::Output {
237    f32x4::splat(self).sub(rhs)
238  }
239}
240
241impl Mul<f32x4> for f32 {
242  type Output = f32x4;
243  #[inline]
244  #[must_use]
245  fn mul(self, rhs: f32x4) -> Self::Output {
246    f32x4::splat(self).mul(rhs)
247  }
248}
249
250impl Div<f32x4> for f32 {
251  type Output = f32x4;
252  #[inline]
253  #[must_use]
254  fn div(self, rhs: f32x4) -> Self::Output {
255    f32x4::splat(self).div(rhs)
256  }
257}
258
259impl BitAnd for f32x4 {
260  type Output = Self;
261  #[inline]
262  #[must_use]
263  fn bitand(self, rhs: Self) -> Self::Output {
264    pick! {
265      if #[cfg(target_feature="sse")] {
266        Self { sse: bitand_m128(self.sse, rhs.sse) }
267      } else if #[cfg(target_feature="simd128")] {
268        Self { simd: v128_and(self.simd, rhs.simd) }
269      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
270        unsafe {Self { neon: vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(self.neon), vreinterpretq_u32_f32(rhs.neon))) }}
271      } else {
272        Self { arr: [
273          f32::from_bits(self.arr[0].to_bits() & rhs.arr[0].to_bits()),
274          f32::from_bits(self.arr[1].to_bits() & rhs.arr[1].to_bits()),
275          f32::from_bits(self.arr[2].to_bits() & rhs.arr[2].to_bits()),
276          f32::from_bits(self.arr[3].to_bits() & rhs.arr[3].to_bits()),
277        ]}
278      }
279    }
280  }
281}
282
283impl BitOr for f32x4 {
284  type Output = Self;
285  #[inline]
286  #[must_use]
287  fn bitor(self, rhs: Self) -> Self::Output {
288    pick! {
289      if #[cfg(target_feature="sse")] {
290        Self { sse: bitor_m128(self.sse, rhs.sse) }
291      } else if #[cfg(target_feature="simd128")] {
292        Self { simd: v128_or(self.simd, rhs.simd) }
293      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
294        unsafe {Self { neon: vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(self.neon), vreinterpretq_u32_f32(rhs.neon))) }}
295      } else {
296        Self { arr: [
297          f32::from_bits(self.arr[0].to_bits() | rhs.arr[0].to_bits()),
298          f32::from_bits(self.arr[1].to_bits() | rhs.arr[1].to_bits()),
299          f32::from_bits(self.arr[2].to_bits() | rhs.arr[2].to_bits()),
300          f32::from_bits(self.arr[3].to_bits() | rhs.arr[3].to_bits()),
301        ]}
302      }
303    }
304  }
305}
306
307impl BitXor for f32x4 {
308  type Output = Self;
309  #[inline]
310  #[must_use]
311  fn bitxor(self, rhs: Self) -> Self::Output {
312    pick! {
313      if #[cfg(target_feature="sse")] {
314        Self { sse: bitxor_m128(self.sse, rhs.sse) }
315      } else if #[cfg(target_feature="simd128")] {
316        Self { simd: v128_xor(self.simd, rhs.simd) }
317      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
318        unsafe {Self { neon: vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(self.neon), vreinterpretq_u32_f32(rhs.neon))) }}
319      } else {
320        Self { arr: [
321          f32::from_bits(self.arr[0].to_bits() ^ rhs.arr[0].to_bits()),
322          f32::from_bits(self.arr[1].to_bits() ^ rhs.arr[1].to_bits()),
323          f32::from_bits(self.arr[2].to_bits() ^ rhs.arr[2].to_bits()),
324          f32::from_bits(self.arr[3].to_bits() ^ rhs.arr[3].to_bits()),
325        ]}
326      }
327    }
328  }
329}
330
331impl CmpEq for f32x4 {
332  type Output = Self;
333  #[inline]
334  #[must_use]
335  fn cmp_eq(self, rhs: Self) -> Self::Output {
336    pick! {
337      if #[cfg(target_feature="sse")] {
338        Self { sse: cmp_eq_mask_m128(self.sse, rhs.sse) }
339      } else if #[cfg(target_feature="simd128")] {
340        Self { simd: f32x4_eq(self.simd, rhs.simd) }
341      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
342        unsafe {Self { neon: vreinterpretq_f32_u32(vceqq_f32(self.neon, rhs.neon)) }}
343      } else {
344        Self { arr: [
345          if self.arr[0] == rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
346          if self.arr[1] == rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
347          if self.arr[2] == rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
348          if self.arr[3] == rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
349        ]}
350      }
351    }
352  }
353}
354
355impl CmpGe for f32x4 {
356  type Output = Self;
357  #[inline]
358  #[must_use]
359  fn cmp_ge(self, rhs: Self) -> Self::Output {
360    pick! {
361      if #[cfg(target_feature="sse")] {
362        Self { sse: cmp_ge_mask_m128(self.sse, rhs.sse) }
363      } else if #[cfg(target_feature="simd128")] {
364        Self { simd: f32x4_ge(self.simd, rhs.simd) }
365      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
366        unsafe {Self { neon: vreinterpretq_f32_u32(vcgeq_f32(self.neon, rhs.neon)) }}
367      } else {
368        Self { arr: [
369          if self.arr[0] >= rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
370          if self.arr[1] >= rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
371          if self.arr[2] >= rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
372          if self.arr[3] >= rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
373        ]}
374      }
375    }
376  }
377}
378
379impl CmpGt for f32x4 {
380  type Output = Self;
381  #[inline]
382  #[must_use]
383  fn cmp_gt(self, rhs: Self) -> Self::Output {
384    pick! {
385      if #[cfg(target_feature="sse")] {
386        Self { sse: cmp_gt_mask_m128(self.sse, rhs.sse) }
387      } else if #[cfg(target_feature="simd128")] {
388        Self { simd: f32x4_gt(self.simd, rhs.simd) }
389      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
390        unsafe {Self { neon: vreinterpretq_f32_u32(vcgtq_f32(self.neon, rhs.neon)) }}
391      } else {
392        Self { arr: [
393          if self.arr[0] > rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
394          if self.arr[1] > rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
395          if self.arr[2] > rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
396          if self.arr[3] > rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
397        ]}
398      }
399    }
400  }
401}
402
403impl CmpNe for f32x4 {
404  type Output = Self;
405  #[inline]
406  #[must_use]
407  fn cmp_ne(self, rhs: Self) -> Self::Output {
408    pick! {
409      if #[cfg(target_feature="sse")] {
410        Self { sse: cmp_neq_mask_m128(self.sse, rhs.sse) }
411      } else if #[cfg(target_feature="simd128")] {
412        Self { simd: f32x4_ne(self.simd, rhs.simd) }
413      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
414        unsafe {Self { neon: vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(self.neon, rhs.neon))) }}
415      } else {
416        Self { arr: [
417          if self.arr[0] != rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
418          if self.arr[1] != rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
419          if self.arr[2] != rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
420          if self.arr[3] != rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
421        ]}
422      }
423    }
424  }
425}
426
427impl CmpLe for f32x4 {
428  type Output = Self;
429  #[inline]
430  #[must_use]
431  fn cmp_le(self, rhs: Self) -> Self::Output {
432    pick! {
433      if #[cfg(target_feature="sse")] {
434        Self { sse: cmp_le_mask_m128(self.sse, rhs.sse) }
435      } else if #[cfg(target_feature="simd128")] {
436        Self { simd: f32x4_le(self.simd, rhs.simd) }
437      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
438        unsafe {Self { neon: vreinterpretq_f32_u32(vcleq_f32(self.neon, rhs.neon)) }}
439      } else {
440        Self { arr: [
441          if self.arr[0] <= rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
442          if self.arr[1] <= rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
443          if self.arr[2] <= rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
444          if self.arr[3] <= rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
445        ]}
446      }
447    }
448  }
449}
450
451impl CmpLt for f32x4 {
452  type Output = Self;
453  #[inline]
454  #[must_use]
455  fn cmp_lt(self, rhs: Self) -> Self::Output {
456    pick! {
457      if #[cfg(target_feature="sse")] {
458        Self { sse: cmp_lt_mask_m128(self.sse, rhs.sse) }
459      } else if #[cfg(target_feature="simd128")] {
460        Self { simd: f32x4_lt(self.simd, rhs.simd) }
461      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
462        unsafe {Self { neon: vreinterpretq_f32_u32(vcltq_f32(self.neon, rhs.neon)) }}
463      } else {
464        Self { arr: [
465          if self.arr[0] < rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
466          if self.arr[1] < rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
467          if self.arr[2] < rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
468          if self.arr[3] < rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
469        ]}
470      }
471    }
472  }
473}
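// None of the comparisons above return `bool`s. Instead, each output lane is
// all ones when the comparison holds for that lane (which reinterprets as a
// NaN `f32`) and all zeros when it doesn't. Those masks are what `blend`,
// `move_mask`, `any`, and `all` below are designed to consume. Rough sketch
// of the pattern (illustrative, assuming the crate's comparison traits are in
// scope via `use wide::*;`):
//
//   let m = f32x4::from([1.0, 5.0, 3.0, 7.0]).cmp_lt(f32x4::splat(4.0));
//   assert_eq!(m.move_mask(), 0b0101); // lanes 0 and 2 are less than 4.0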
474
475impl f32x4 {
476  #[inline]
477  #[must_use]
478  pub fn new(array: [f32; 4]) -> Self {
479    Self::from(array)
480  }
481
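  /// Lanewise select: where a lane of the mask `self` is all ones, the lane
  /// of `t` is taken; where it is all zeros, the lane of `f` is taken. Meant
  /// for the all-ones/all-zeros masks produced by the comparison methods and
  /// by `is_nan`/`is_finite`; other mask values give backend-dependent
  /// results. Illustrative:
  ///
  /// ```
  /// use wide::f32x4; // assumes the crate-root re-export
  /// let m = f32x4::from([f32::NAN, 1.0, f32::NAN, 2.0]).is_nan();
  /// let picked = m.blend(f32x4::splat(1.0), f32x4::splat(-1.0));
  /// assert_eq!(picked.to_array(), [1.0, -1.0, 1.0, -1.0]);
  /// ```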
482  #[inline]
483  #[must_use]
484  pub fn blend(self, t: Self, f: Self) -> Self {
485    pick! {
486      if #[cfg(target_feature="sse4.1")] {
487        Self { sse: blend_varying_m128(f.sse, t.sse, self.sse) }
488      } else if #[cfg(target_feature="simd128")] {
489        Self { simd: v128_bitselect(t.simd, f.simd, self.simd) }
490      } else {
491        generic_bit_blend(self, t, f)
492      }
493    }
494  }
495  #[inline]
496  #[must_use]
497  pub fn abs(self) -> Self {
498    pick! {
499      if #[cfg(target_feature="simd128")] {
500        Self { simd: f32x4_abs(self.simd) }
501      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
502        unsafe {Self { neon: vabsq_f32(self.neon) }}
503      } else {
504        let non_sign_bits = f32x4::from(f32::from_bits(i32::MAX as u32));
505        self & non_sign_bits
506      }
507    }
508  }
509
510  /// Calculates the lanewise maximum of both vectors. This is a faster
511  /// implementation than `max`, but it doesn't specify any behavior if NaNs are
512  /// involved.
513  #[inline]
514  #[must_use]
515  pub fn fast_max(self, rhs: Self) -> Self {
516    pick! {
517      if #[cfg(target_feature="sse")] {
518        Self { sse: max_m128(self.sse, rhs.sse) }
519      } else if #[cfg(target_feature="simd128")] {
520        Self {
521          simd: f32x4_pmax(self.simd, rhs.simd),
522        }
523      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
524        unsafe {Self { neon: vmaxq_f32(self.neon, rhs.neon) }}
525      } else {
526        Self { arr: [
527          if self.arr[0] < rhs.arr[0] { rhs.arr[0] } else { self.arr[0] },
528          if self.arr[1] < rhs.arr[1] { rhs.arr[1] } else { self.arr[1] },
529          if self.arr[2] < rhs.arr[2] { rhs.arr[2] } else { self.arr[2] },
530          if self.arr[3] < rhs.arr[3] { rhs.arr[3] } else { self.arr[3] },
531        ]}
532      }
533    }
534  }
535
  /// Calculates the lanewise maximum of both vectors. If one of the lanes is
  /// NaN, the other lane's value is chosen. Use `fast_max` for a faster
  /// implementation that doesn't handle NaNs.
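  ///
  /// For example (illustrative; the `From<[f32; 4]>` conversion comes from
  /// elsewhere in this crate):
  ///
  /// ```
  /// use wide::f32x4; // assumes the crate-root re-export
  /// let a = f32x4::from([1.0, f32::NAN, 3.0, 4.0]);
  /// let b = f32x4::from([2.0, 2.0, f32::NAN, 1.0]);
  /// assert_eq!(a.max(b).to_array(), [2.0, 2.0, 3.0, 4.0]);
  /// ```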
539  #[inline]
540  #[must_use]
541  pub fn max(self, rhs: Self) -> Self {
542    pick! {
543      if #[cfg(target_feature="sse")] {
544        // max_m128 seems to do rhs < self ? self : rhs. So if there's any NaN
545        // involved, it chooses rhs, so we need to specifically check rhs for
546        // NaN.
547        rhs.is_nan().blend(self, Self { sse: max_m128(self.sse, rhs.sse) })
548      } else if #[cfg(target_feature="simd128")] {
549        // WASM has two max intrinsics:
550        // - max: This propagates NaN, that's the opposite of what we need.
551        // - pmax: This is defined as self < rhs ? rhs : self, which basically
552        //   chooses self if either is NaN.
553        //
554        // pmax is what we want, but we need to specifically check self for NaN.
555        Self {
556          simd: v128_bitselect(
557            rhs.simd,
558            f32x4_pmax(self.simd, rhs.simd),
559            f32x4_ne(self.simd, self.simd), // NaN check
560          )
561        }
562      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
563        unsafe {Self { neon: vmaxnmq_f32(self.neon, rhs.neon) }}
564      } else {
565        Self { arr: [
566          self.arr[0].max(rhs.arr[0]),
567          self.arr[1].max(rhs.arr[1]),
568          self.arr[2].max(rhs.arr[2]),
569          self.arr[3].max(rhs.arr[3]),
570        ]}
571      }
572    }
573  }
574
575  /// Calculates the lanewise minimum of both vectors. This is a faster
576  /// implementation than `min`, but it doesn't specify any behavior if NaNs are
577  /// involved.
578  #[inline]
579  #[must_use]
580  pub fn fast_min(self, rhs: Self) -> Self {
581    pick! {
582      if #[cfg(target_feature="sse")] {
583        Self { sse: min_m128(self.sse, rhs.sse) }
584      } else if #[cfg(target_feature="simd128")] {
585        Self {
586          simd: f32x4_pmin(self.simd, rhs.simd),
587        }
588      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
589        unsafe {Self { neon: vminq_f32(self.neon, rhs.neon) }}
590      } else {
591        Self { arr: [
592          if self.arr[0] < rhs.arr[0] { self.arr[0] } else { rhs.arr[0] },
593          if self.arr[1] < rhs.arr[1] { self.arr[1] } else { rhs.arr[1] },
594          if self.arr[2] < rhs.arr[2] { self.arr[2] } else { rhs.arr[2] },
595          if self.arr[3] < rhs.arr[3] { self.arr[3] } else { rhs.arr[3] },
596        ]}
597      }
598    }
599  }
600
  /// Calculates the lanewise minimum of both vectors. If one of the lanes is
  /// NaN, the other lane's value is chosen. Use `fast_min` for a faster
  /// implementation that doesn't handle NaNs.
604  #[inline]
605  #[must_use]
606  pub fn min(self, rhs: Self) -> Self {
607    pick! {
608      if #[cfg(target_feature="sse")] {
609        // min_m128 seems to do self < rhs ? self : rhs. So if there's any NaN
610        // involved, it chooses rhs, so we need to specifically check rhs for
611        // NaN.
612        rhs.is_nan().blend(self, Self { sse: min_m128(self.sse, rhs.sse) })
613      } else if #[cfg(target_feature="simd128")] {
614        // WASM has two min intrinsics:
615        // - min: This propagates NaN, that's the opposite of what we need.
616        // - pmin: This is defined as rhs < self ? rhs : self, which basically
617        //   chooses self if either is NaN.
618        //
619        // pmin is what we want, but we need to specifically check self for NaN.
620        Self {
621          simd: v128_bitselect(
622            rhs.simd,
623            f32x4_pmin(self.simd, rhs.simd),
624            f32x4_ne(self.simd, self.simd), // NaN check
625          )
626        }
627      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
628        unsafe {Self { neon: vminnmq_f32(self.neon, rhs.neon) }}
629      } else {
630        Self { arr: [
631          self.arr[0].min(rhs.arr[0]),
632          self.arr[1].min(rhs.arr[1]),
633          self.arr[2].min(rhs.arr[2]),
634          self.arr[3].min(rhs.arr[3]),
635        ]}
636      }
637    }
638  }
639  #[inline]
640  #[must_use]
641  pub fn is_nan(self) -> Self {
642    pick! {
643      if #[cfg(target_feature="sse")] {
644        Self { sse: cmp_unord_mask_m128(self.sse, self.sse) }
645      } else if #[cfg(target_feature="simd128")] {
646        Self { simd: f32x4_ne(self.simd, self.simd) }
647      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
648        unsafe {Self { neon: vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(self.neon, self.neon))) }}
649      } else {
650        Self { arr: [
651          if self.arr[0].is_nan() { f32::from_bits(u32::MAX) } else { 0.0 },
652          if self.arr[1].is_nan() { f32::from_bits(u32::MAX) } else { 0.0 },
653          if self.arr[2].is_nan() { f32::from_bits(u32::MAX) } else { 0.0 },
654          if self.arr[3].is_nan() { f32::from_bits(u32::MAX) } else { 0.0 },
655        ]}
656      }
657    }
658  }
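  /// All-ones lanes where the lane is finite (neither infinite nor NaN),
  /// all-zeros lanes otherwise. Both this and `is_inf` below use the same
  /// trick: shifting the bits left by one drops the sign bit and leaves the
  /// exponent field in the top 8 bits, and an all-ones exponent means the
  /// value is an infinity or NaN. Illustrative:
  ///
  /// ```
  /// use wide::f32x4; // assumes the crate-root re-export
  /// let v = f32x4::from([1.0, f32::INFINITY, f32::NAN, -0.0]);
  /// assert_eq!(v.is_finite().move_mask(), 0b1001);
  /// assert_eq!(v.is_inf().move_mask(), 0b0010);
  /// ```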
659  #[inline]
660  #[must_use]
661  pub fn is_finite(self) -> Self {
662    let shifted_exp_mask = u32x4::from(0xFF000000);
663    let u: u32x4 = cast(self);
664    let shift_u = u << 1_u64;
665    let out = !(shift_u & shifted_exp_mask).cmp_eq(shifted_exp_mask);
666    cast(out)
667  }
668  #[inline]
669  #[must_use]
670  pub fn is_inf(self) -> Self {
671    let shifted_inf = u32x4::from(0xFF000000);
672    let u: u32x4 = cast(self);
673    let shift_u = u << 1_u64;
674    let out = (shift_u).cmp_eq(shifted_inf);
675    cast(out)
676  }
677
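  /// Rounds each lane to the nearest whole number (still as `f32`). Ties may
  /// round to even depending on the backend, so the example below avoids
  /// exact `.5` inputs (illustrative):
  ///
  /// ```
  /// use wide::f32x4; // assumes the crate-root re-export
  /// let x = f32x4::from([1.1, 2.6, -0.4, -3.7]);
  /// assert_eq!(x.round().to_array(), [1.0, 3.0, 0.0, -4.0]);
  /// ```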
678  #[inline]
679  #[must_use]
680  pub fn round(self) -> Self {
681    pick! {
682      if #[cfg(target_feature="sse4.1")] {
683        Self { sse: round_m128::<{round_op!(Nearest)}>(self.sse) }
684      } else if #[cfg(target_feature="sse2")] {
685        let mi: m128i = convert_to_i32_m128i_from_m128(self.sse);
686        let f: f32x4 = f32x4 { sse: convert_to_m128_from_i32_m128i(mi) };
687        let i: i32x4 = cast(mi);
688        let mask: f32x4 = cast(i.cmp_eq(i32x4::from(0x80000000_u32 as i32)));
689        mask.blend(self, f)
690      } else if #[cfg(target_feature="simd128")] {
691        Self { simd: f32x4_nearest(self.simd) }
692      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
693        unsafe {Self { neon: vrndnq_f32(self.neon) }}
694      } else {
        // Note(Lokathor): This software fallback is probably very slow compared
        // to having a hardware option available; even just the sse2 version is
        // better than this. Oh well.
698        let to_int = f32x4::from(1.0 / f32::EPSILON);
699        let u: u32x4 = cast(self);
700        let e: i32x4 = cast((u >> 23) & u32x4::from(0xff));
701        let mut y: f32x4;
702
703        let no_op_magic = i32x4::from(0x7f + 23);
704        let no_op_mask: f32x4 = cast(e.cmp_gt(no_op_magic) | e.cmp_eq(no_op_magic));
705        let no_op_val: f32x4 = self;
706
707        let zero_magic = i32x4::from(0x7f - 1);
708        let zero_mask: f32x4 = cast(e.cmp_lt(zero_magic));
709        let zero_val: f32x4 = self * f32x4::from(0.0);
710
711        let neg_bit: f32x4 = cast(cast::<u32x4, i32x4>(u).cmp_lt(i32x4::default()));
712        let x: f32x4 = neg_bit.blend(-self, self);
713        y = x + to_int - to_int - x;
714        y = y.cmp_gt(f32x4::from(0.5)).blend(
715          y + x - f32x4::from(-1.0),
716          y.cmp_lt(f32x4::from(-0.5)).blend(y + x + f32x4::from(1.0), y + x),
717        );
718        y = neg_bit.blend(-y, y);
719
720        no_op_mask.blend(no_op_val, zero_mask.blend(zero_val, y))
721      }
722    }
723  }
724
  /// Rounds each lane into an integer. This is a faster implementation than
  /// `round_int`, but it doesn't handle out-of-range values or NaNs. For those
  /// values you get implementation-defined behavior.
728  #[inline]
729  #[must_use]
730  pub fn fast_round_int(self) -> i32x4 {
731    pick! {
732      if #[cfg(target_feature="sse2")] {
733        cast(convert_to_i32_m128i_from_m128(self.sse))
734      } else {
735        self.round_int()
736      }
737    }
738  }
739
  /// Rounds each lane into an integer. This saturates out-of-range values and
  /// turns NaNs into 0. Use `fast_round_int` for a faster implementation that
  /// doesn't handle out-of-range values or NaNs.
743  #[inline]
744  #[must_use]
745  pub fn round_int(self) -> i32x4 {
746    pick! {
747      if #[cfg(target_feature="sse2")] {
748        // Based on: https://github.com/v8/v8/blob/210987a552a2bf2a854b0baa9588a5959ff3979d/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h#L489-L504
749        let non_nan_mask = self.cmp_eq(self);
750        let non_nan = self & non_nan_mask;
751        let flip_to_max: i32x4 = cast(self.cmp_ge(Self::splat(2147483648.0)));
752        let cast: i32x4 = cast(convert_to_i32_m128i_from_m128(non_nan.sse));
753        flip_to_max ^ cast
754      } else if #[cfg(target_feature="simd128")] {
755        cast(Self { simd: i32x4_trunc_sat_f32x4(f32x4_nearest(self.simd)) })
756      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
757        cast(unsafe {Self { neon: vreinterpretq_f32_s32(vcvtnq_s32_f32(self.neon)) }})
758      } else {
759        let rounded: [f32; 4] = cast(self.round());
760        cast([
761          rounded[0] as i32,
762          rounded[1] as i32,
763          rounded[2] as i32,
764          rounded[3] as i32,
765        ])
766      }
767    }
768  }
769
  /// Truncates each lane into an integer. This is a faster implementation than
  /// `trunc_int`, but it doesn't handle out-of-range values or NaNs. For those
  /// values you get implementation-defined behavior.
773  #[inline]
774  #[must_use]
775  pub fn fast_trunc_int(self) -> i32x4 {
776    pick! {
777      if #[cfg(target_feature="sse2")] {
778        cast(truncate_m128_to_m128i(self.sse))
779      } else {
780        self.trunc_int()
781      }
782    }
783  }
784
  /// Truncates each lane into an integer. This saturates out-of-range values
  /// and turns NaNs into 0. Use `fast_trunc_int` for a faster implementation
  /// that doesn't handle out-of-range values or NaNs.
788  #[inline]
789  #[must_use]
790  pub fn trunc_int(self) -> i32x4 {
791    pick! {
792      if #[cfg(target_feature="sse2")] {
793        // Based on: https://github.com/v8/v8/blob/210987a552a2bf2a854b0baa9588a5959ff3979d/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h#L489-L504
794        let non_nan_mask = self.cmp_eq(self);
795        let non_nan = self & non_nan_mask;
796        let flip_to_max: i32x4 = cast(self.cmp_ge(Self::splat(2147483648.0)));
797        let cast: i32x4 = cast(truncate_m128_to_m128i(non_nan.sse));
798        flip_to_max ^ cast
799      } else if #[cfg(target_feature="simd128")] {
800        cast(Self { simd: i32x4_trunc_sat_f32x4(self.simd) })
801      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
802        cast(unsafe {Self { neon: vreinterpretq_f32_s32(vcvtq_s32_f32(self.neon)) }})
803      } else {
804        let n: [f32;4] = cast(self);
805        cast([
806          n[0] as i32,
807          n[1] as i32,
808          n[2] as i32,
809          n[3] as i32,
810        ])
811      }
812    }
813  }
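  /// Lanewise fused multiply-add: `(self * m) + a`. When the target has FMA
  /// support this is a single fused operation, otherwise it falls back to a
  /// separate multiply and add, so the last bit of the result can differ
  /// between targets. Illustrative example with exactly representable values:
  ///
  /// ```
  /// use wide::f32x4; // assumes the crate-root re-export
  /// let x = f32x4::from([2.0, 3.0, 4.0, 5.0]);
  /// let r = x.mul_add(f32x4::splat(10.0), f32x4::splat(1.0));
  /// assert_eq!(r.to_array(), [21.0, 31.0, 41.0, 51.0]);
  /// ```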
814  #[inline]
815  #[must_use]
816  pub fn mul_add(self, m: Self, a: Self) -> Self {
817    pick! {
818      if #[cfg(all(target_feature="sse2",target_feature="fma"))] {
819        Self { sse: fused_mul_add_m128(self.sse, m.sse, a.sse) }
820      } else {
821        (self * m) + a
822      }
823    }
824  }
825
826  #[inline]
827  #[must_use]
828  pub fn mul_sub(self, m: Self, s: Self) -> Self {
829    pick! {
830      if #[cfg(all(target_feature="sse2",target_feature="fma"))] {
831        Self { sse: fused_mul_sub_m128(self.sse, m.sse, s.sse) }
832      } else {
833        (self * m) - s
834      }
835    }
836  }
837
838  #[inline]
839  #[must_use]
840  pub fn mul_neg_add(self, m: Self, a: Self) -> Self {
841    pick! {
842      if #[cfg(all(target_feature="sse2",target_feature="fma"))] {
843        Self { sse: fused_mul_neg_add_m128(self.sse, m.sse, a.sse) }
844      } else {
845        a - (self * m)
846      }
847    }
848  }
849
850  #[inline]
851  #[must_use]
852  pub fn mul_neg_sub(self, m: Self, a: Self) -> Self {
853    pick! {
854      if #[cfg(all(target_feature="sse2",target_feature="fma"))] {
855        Self { sse: fused_mul_neg_sub_m128(self.sse, m.sse, a.sse) }
856      } else {
857        -(self * m) - a
858      }
859    }
860  }
861
862  #[inline]
863  #[must_use]
864  pub fn flip_signs(self, signs: Self) -> Self {
865    self ^ (signs & Self::from(-0.0))
866  }
867
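  /// Lanewise equivalent of `f32::copysign`: keeps the magnitude of `self`
  /// and takes the sign bit from `sign`. Illustrative:
  ///
  /// ```
  /// use wide::f32x4; // assumes the crate-root re-export
  /// let v = f32x4::from([1.0, -2.0, 3.0, -4.0]);
  /// let s = f32x4::from([-1.0, 1.0, 1.0, -1.0]);
  /// assert_eq!(v.copysign(s).to_array(), [-1.0, 2.0, 3.0, -4.0]);
  /// ```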
868  #[inline]
869  #[must_use]
870  pub fn copysign(self, sign: Self) -> Self {
871    let magnitude_mask = Self::from(f32::from_bits(u32::MAX >> 1));
872    (self & magnitude_mask) | (sign & Self::from(-0.0))
873  }
874
875  #[inline]
876  pub fn asin_acos(self) -> (Self, Self) {
877    // Based on the Agner Fog "vector class library":
878    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
879    const_f32_as_f32x4!(P4asinf, 4.2163199048E-2);
880    const_f32_as_f32x4!(P3asinf, 2.4181311049E-2);
881    const_f32_as_f32x4!(P2asinf, 4.5470025998E-2);
882    const_f32_as_f32x4!(P1asinf, 7.4953002686E-2);
883    const_f32_as_f32x4!(P0asinf, 1.6666752422E-1);
884
885    let xa = self.abs();
886    let big = xa.cmp_ge(f32x4::splat(0.5));
887
888    let x1 = f32x4::splat(0.5) * (f32x4::ONE - xa);
889    let x2 = xa * xa;
890    let x3 = big.blend(x1, x2);
891
892    let xb = x1.sqrt();
893
894    let x4 = big.blend(xb, xa);
895
896    let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
897    let z = z.mul_add(x3 * x4, x4);
898
899    let z1 = z + z;
900
901    // acos
902    let z3 = self.cmp_lt(f32x4::ZERO).blend(f32x4::PI - z1, z1);
903    let z4 = f32x4::FRAC_PI_2 - z.flip_signs(self);
904    let acos = big.blend(z3, z4);
905
906    // asin
907    let z3 = f32x4::FRAC_PI_2 - z1;
908    let asin = big.blend(z3, z);
909    let asin = asin.flip_signs(self);
910
911    (asin, acos)
912  }
913
914  #[inline]
915  pub fn asin(self) -> Self {
916    // Based on the Agner Fog "vector class library":
917    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
918    const_f32_as_f32x4!(P4asinf, 4.2163199048E-2);
919    const_f32_as_f32x4!(P3asinf, 2.4181311049E-2);
920    const_f32_as_f32x4!(P2asinf, 4.5470025998E-2);
921    const_f32_as_f32x4!(P1asinf, 7.4953002686E-2);
922    const_f32_as_f32x4!(P0asinf, 1.6666752422E-1);
923
924    let xa = self.abs();
925    let big = xa.cmp_ge(f32x4::splat(0.5));
926
927    let x1 = f32x4::splat(0.5) * (f32x4::ONE - xa);
928    let x2 = xa * xa;
929    let x3 = big.blend(x1, x2);
930
931    let xb = x1.sqrt();
932
933    let x4 = big.blend(xb, xa);
934
935    let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
936    let z = z.mul_add(x3 * x4, x4);
937
938    let z1 = z + z;
939
940    // asin
941    let z3 = f32x4::FRAC_PI_2 - z1;
942    let asin = big.blend(z3, z);
943    let asin = asin.flip_signs(self);
944
945    asin
946  }
947
948  #[inline]
949  #[must_use]
950  pub fn acos(self) -> Self {
951    // Based on the Agner Fog "vector class library":
952    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
953    const_f32_as_f32x4!(P4asinf, 4.2163199048E-2);
954    const_f32_as_f32x4!(P3asinf, 2.4181311049E-2);
955    const_f32_as_f32x4!(P2asinf, 4.5470025998E-2);
956    const_f32_as_f32x4!(P1asinf, 7.4953002686E-2);
957    const_f32_as_f32x4!(P0asinf, 1.6666752422E-1);
958
959    let xa = self.abs();
960    let big = xa.cmp_ge(f32x4::splat(0.5));
961
962    let x1 = f32x4::splat(0.5) * (f32x4::ONE - xa);
963    let x2 = xa * xa;
964    let x3 = big.blend(x1, x2);
965
966    let xb = x1.sqrt();
967
968    let x4 = big.blend(xb, xa);
969
970    let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
971    let z = z.mul_add(x3 * x4, x4);
972
973    let z1 = z + z;
974
975    // acos
976    let z3 = self.cmp_lt(f32x4::ZERO).blend(f32x4::PI - z1, z1);
977    let z4 = f32x4::FRAC_PI_2 - z.flip_signs(self);
978    let acos = big.blend(z3, z4);
979
980    acos
981  }
982
983  #[inline]
984  pub fn atan(self) -> Self {
985    // Based on the Agner Fog "vector class library":
986    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
987    const_f32_as_f32x4!(P3atanf, 8.05374449538E-2);
988    const_f32_as_f32x4!(P2atanf, -1.38776856032E-1);
989    const_f32_as_f32x4!(P1atanf, 1.99777106478E-1);
990    const_f32_as_f32x4!(P0atanf, -3.33329491539E-1);
991
992    let t = self.abs();
993
994    // small:  z = t / 1.0;
995    // medium: z = (t-1.0) / (t+1.0);
996    // big:    z = -1.0 / t;
997    let notsmal = t.cmp_ge(Self::SQRT_2 - Self::ONE);
998    let notbig = t.cmp_le(Self::SQRT_2 + Self::ONE);
999
1000    let mut s = notbig.blend(Self::FRAC_PI_4, Self::FRAC_PI_2);
1001    s = notsmal & s;
1002
1003    let mut a = notbig & t;
1004    a = notsmal.blend(a - Self::ONE, a);
1005    let mut b = notbig & Self::ONE;
1006    b = notsmal.blend(b + t, b);
1007    let z = a / b;
1008
1009    let zz = z * z;
1010
1011    // Taylor expansion
1012    let mut re = polynomial_3!(zz, P0atanf, P1atanf, P2atanf, P3atanf);
1013    re = re.mul_add(zz * z, z) + s;
1014
1015    // get sign bit
1016    re = (self.sign_bit()).blend(-re, re);
1017
1018    re
1019  }
1020
1021  #[inline]
1022  pub fn atan2(self, x: Self) -> Self {
1023    // Based on the Agner Fog "vector class library":
1024    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
1025    const_f32_as_f32x4!(P3atanf, 8.05374449538E-2);
1026    const_f32_as_f32x4!(P2atanf, -1.38776856032E-1);
1027    const_f32_as_f32x4!(P1atanf, 1.99777106478E-1);
1028    const_f32_as_f32x4!(P0atanf, -3.33329491539E-1);
1029
1030    let y = self;
1031
1032    // move in first octant
1033    let x1 = x.abs();
1034    let y1 = y.abs();
1035    let swapxy = y1.cmp_gt(x1);
1036    // swap x and y if y1 > x1
1037    let mut x2 = swapxy.blend(y1, x1);
1038    let mut y2 = swapxy.blend(x1, y1);
1039
1040    // check for special case: x and y are both +/- INF
1041    let both_infinite = x.is_inf() & y.is_inf();
1042    if both_infinite.any() {
1043      let minus_one = -Self::ONE;
1044      x2 = both_infinite.blend(x2 & minus_one, x2);
1045      y2 = both_infinite.blend(y2 & minus_one, y2);
1046    }
1047
1048    // x = y = 0 will produce NAN. No problem, fixed below
1049    let t = y2 / x2;
1050
1051    // small:  z = t / 1.0;
1052    // medium: z = (t-1.0) / (t+1.0);
1053    let notsmal = t.cmp_ge(Self::SQRT_2 - Self::ONE);
1054
1055    let a = notsmal.blend(t - Self::ONE, t);
1056    let b = notsmal.blend(t + Self::ONE, Self::ONE);
1057    let s = notsmal & Self::FRAC_PI_4;
1058    let z = a / b;
1059
1060    let zz = z * z;
1061
1062    // Taylor expansion
1063    let mut re = polynomial_3!(zz, P0atanf, P1atanf, P2atanf, P3atanf);
1064    re = re.mul_add(zz * z, z) + s;
1065
1066    // move back in place
1067    re = swapxy.blend(Self::FRAC_PI_2 - re, re);
1068    re = ((x | y).cmp_eq(Self::ZERO)).blend(Self::ZERO, re);
1069    re = (x.sign_bit()).blend(Self::PI - re, re);
1070
1071    // get sign bit
1072    re = (y.sign_bit()).blend(-re, re);
1073
1074    re
1075  }
1076
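  /// Computes the sine and cosine of each lane (in radians) in one call,
  /// sharing the argument reduction between the two. The results come from a
  /// polynomial approximation, so compare against a tolerance rather than
  /// exactly (illustrative):
  ///
  /// ```
  /// use wide::f32x4; // assumes the crate-root re-export
  /// let (s, c) = f32x4::splat(core::f32::consts::FRAC_PI_2).sin_cos();
  /// assert!((s.to_array()[0] - 1.0).abs() < 1e-6);
  /// assert!(c.to_array()[0].abs() < 1e-6);
  /// ```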
1077  #[inline]
1078  #[must_use]
1079  pub fn sin_cos(self) -> (Self, Self) {
1080    // Based on the Agner Fog "vector class library":
1081    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
1082
1083    const_f32_as_f32x4!(DP1F, 0.78515625_f32 * 2.0);
1084    const_f32_as_f32x4!(DP2F, 2.4187564849853515625E-4_f32 * 2.0);
1085    const_f32_as_f32x4!(DP3F, 3.77489497744594108E-8_f32 * 2.0);
1086
1087    const_f32_as_f32x4!(P0sinf, -1.6666654611E-1);
1088    const_f32_as_f32x4!(P1sinf, 8.3321608736E-3);
1089    const_f32_as_f32x4!(P2sinf, -1.9515295891E-4);
1090
1091    const_f32_as_f32x4!(P0cosf, 4.166664568298827E-2);
1092    const_f32_as_f32x4!(P1cosf, -1.388731625493765E-3);
1093    const_f32_as_f32x4!(P2cosf, 2.443315711809948E-5);
1094
1095    const_f32_as_f32x4!(TWO_OVER_PI, 2.0 / core::f32::consts::PI);
1096
1097    let xa = self.abs();
1098
1099    // Find quadrant
1100    let y = (xa * TWO_OVER_PI).round();
1101    let q: i32x4 = y.round_int();
1102
1103    let x = y.mul_neg_add(DP3F, y.mul_neg_add(DP2F, y.mul_neg_add(DP1F, xa)));
1104
1105    let x2 = x * x;
1106    let mut s = polynomial_2!(x2, P0sinf, P1sinf, P2sinf) * (x * x2) + x;
1107    let mut c = polynomial_2!(x2, P0cosf, P1cosf, P2cosf) * (x2 * x2)
1108      + f32x4::from(0.5).mul_neg_add(x2, f32x4::from(1.0));
1109
1110    let swap = !(q & i32x4::from(1)).cmp_eq(i32x4::from(0));
1111
1112    let mut overflow: f32x4 = cast(q.cmp_gt(i32x4::from(0x2000000)));
1113    overflow &= xa.is_finite();
1114    s = overflow.blend(f32x4::from(0.0), s);
1115    c = overflow.blend(f32x4::from(1.0), c);
1116
1117    // calc sin
1118    let mut sin1 = cast::<_, f32x4>(swap).blend(c, s);
1119    let sign_sin: i32x4 = (q << 30) ^ cast::<_, i32x4>(self);
1120    sin1 = sin1.flip_signs(cast(sign_sin));
1121
1122    // calc cos
1123    let mut cos1 = cast::<_, f32x4>(swap).blend(s, c);
1124    let sign_cos: i32x4 = ((q + i32x4::from(1)) & i32x4::from(2)) << 30;
1125    cos1 ^= cast::<_, f32x4>(sign_cos);
1126
1127    (sin1, cos1)
1128  }
1129
1130  #[inline]
1131  #[must_use]
1132  pub fn sin(self) -> Self {
1133    let (s, _) = self.sin_cos();
1134    s
1135  }
1136  #[inline]
1137  #[must_use]
1138  pub fn cos(self) -> Self {
1139    let (_, c) = self.sin_cos();
1140    c
1141  }
1142  #[inline]
1143  #[must_use]
1144  pub fn tan(self) -> Self {
1145    let (s, c) = self.sin_cos();
1146    s / c
1147  }
1148  #[inline]
1149  #[must_use]
1150  pub fn to_degrees(self) -> Self {
1151    const_f32_as_f32x4!(RAD_TO_DEG_RATIO, 180.0_f32 / core::f32::consts::PI);
1152    self * RAD_TO_DEG_RATIO
1153  }
1154  #[inline]
1155  #[must_use]
1156  pub fn to_radians(self) -> Self {
1157    const_f32_as_f32x4!(DEG_TO_RAD_RATIO, core::f32::consts::PI / 180.0_f32);
1158    self * DEG_TO_RAD_RATIO
1159  }
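  /// Lanewise reciprocal, `1.0 / self`. On x86 with SSE this uses the CPU's
  /// fast approximate-reciprocal instruction, which only gives roughly 12
  /// bits of precision, so treat the result as an approximation
  /// (illustrative):
  ///
  /// ```
  /// use wide::f32x4; // assumes the crate-root re-export
  /// let r = f32x4::from([1.0, 2.0, 4.0, 8.0]).recip().to_array();
  /// let want = [1.0, 0.5, 0.25, 0.125];
  /// for i in 0..4 {
  ///   assert!((r[i] - want[i]).abs() < 1e-3);
  /// }
  /// ```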
1160  #[inline]
1161  #[must_use]
1162  pub fn recip(self) -> Self {
1163    pick! {
1164      if #[cfg(target_feature="sse")] {
1165        Self { sse: reciprocal_m128(self.sse) }
1166      } else if #[cfg(target_feature="simd128")] {
1167        Self { simd: f32x4_div(f32x4_splat(1.0), self.simd) }
1168      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
1169        unsafe {Self { neon: vdivq_f32(vdupq_n_f32(1.0), self.neon) }}
1170      } else {
1171        Self { arr: [
1172          1.0 / self.arr[0],
1173          1.0 / self.arr[1],
1174          1.0 / self.arr[2],
1175          1.0 / self.arr[3],
1176        ]}
1177      }
1178    }
1179  }
1180  #[inline]
1181  #[must_use]
1182  pub fn recip_sqrt(self) -> Self {
1183    pick! {
1184      if #[cfg(target_feature="sse")] {
1185        Self { sse: reciprocal_sqrt_m128(self.sse) }
1186      } else if #[cfg(target_feature="simd128")] {
1187        Self { simd: f32x4_div(f32x4_splat(1.0), f32x4_sqrt(self.simd)) }
1188      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
1189        unsafe {Self { neon: vdivq_f32(vdupq_n_f32(1.0), vsqrtq_f32(self.neon)) }}
1190      } else if #[cfg(feature="std")] {
1191        Self { arr: [
1192          1.0 / self.arr[0].sqrt(),
1193          1.0 / self.arr[1].sqrt(),
1194          1.0 / self.arr[2].sqrt(),
1195          1.0 / self.arr[3].sqrt(),
1196        ]}
1197      } else {
1198        Self { arr: [
1199          1.0 / software_sqrt(self.arr[0] as f64) as f32,
1200          1.0 / software_sqrt(self.arr[1] as f64) as f32,
1201          1.0 / software_sqrt(self.arr[2] as f64) as f32,
1202          1.0 / software_sqrt(self.arr[3] as f64) as f32,
1203        ]}
1204      }
1205    }
1206  }
1207  #[inline]
1208  #[must_use]
1209  pub fn sqrt(self) -> Self {
1210    pick! {
1211      if #[cfg(target_feature="sse")] {
1212        Self { sse: sqrt_m128(self.sse) }
1213      } else if #[cfg(target_feature="simd128")] {
1214        Self { simd: f32x4_sqrt(self.simd) }
1215      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
1216        unsafe {Self { neon: vsqrtq_f32(self.neon) }}
1217      } else if #[cfg(feature="std")] {
1218        Self { arr: [
1219          self.arr[0].sqrt(),
1220          self.arr[1].sqrt(),
1221          self.arr[2].sqrt(),
1222          self.arr[3].sqrt(),
1223        ]}
1224      } else {
1225        Self { arr: [
1226          software_sqrt(self.arr[0] as f64) as f32,
1227          software_sqrt(self.arr[1] as f64) as f32,
1228          software_sqrt(self.arr[2] as f64) as f32,
1229          software_sqrt(self.arr[3] as f64) as f32,
1230        ]}
1231      }
1232    }
1233  }
1234
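  /// Packs the sign bit (high bit) of each lane into the low 4 bits of an
  /// `i32`, lane 0 in bit 0. Combined with the all-ones/all-zeros masks from
  /// the comparison methods this gives a cheap "which lanes matched?" summary
  /// (illustrative):
  ///
  /// ```
  /// use wide::f32x4; // assumes the crate-root re-export
  /// let m = f32x4::from([1.0, -2.0, 3.0, -4.0]).move_mask();
  /// assert_eq!(m, 0b1010);
  /// ```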
1235  #[inline]
1236  #[must_use]
1237  pub fn move_mask(self) -> i32 {
1238    pick! {
1239      if #[cfg(target_feature="sse")] {
1240        move_mask_m128(self.sse)
1241      } else if #[cfg(target_feature="simd128")] {
1242        u32x4_bitmask(self.simd) as i32
1243      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
1244        unsafe
1245        {
1246          // set all to 1 if top bit is set, else 0
1247          let masked = vcltq_s32( vreinterpretq_s32_f32(self.neon), vdupq_n_s32(0));
1248
1249          // select the right bit out of each lane
          let selectbit: uint32x4_t = core::mem::transmute([1u32, 2, 4, 8]);
          let r = vandq_u32(masked, selectbit);

          // horizontally add the 32-bit lanes
1254          vaddvq_u32(r) as i32
1255        }
1256      } else {
1257        (((self.arr[0].to_bits() as i32) < 0) as i32) << 0 |
1258        (((self.arr[1].to_bits() as i32) < 0) as i32) << 1 |
1259        (((self.arr[2].to_bits() as i32) < 0) as i32) << 2 |
1260        (((self.arr[3].to_bits() as i32) < 0) as i32) << 3
1261      }
1262    }
1263  }
1264  #[inline]
1265  #[must_use]
1266  pub fn any(self) -> bool {
1267    pick! {
1268      if #[cfg(target_feature="simd128")] {
1269        v128_any_true(self.simd)
1270      } else {
1271        self.move_mask() != 0
1272      }
1273    }
1274  }
1275  #[inline]
1276  #[must_use]
1277  pub fn all(self) -> bool {
1278    pick! {
1279      if #[cfg(target_feature="simd128")] {
1280        u32x4_all_true(self.simd)
1281      } else {
1282        // four lanes
1283        self.move_mask() == 0b1111
1284      }
1285    }
1286  }
1287  #[inline]
1288  #[must_use]
1289  pub fn none(self) -> bool {
1290    !self.any()
1291  }
1292
1293  #[inline]
1294  fn vm_pow2n(self) -> Self {
1295    const_f32_as_f32x4!(pow2_23, 8388608.0);
1296    const_f32_as_f32x4!(bias, 127.0);
1297    let a = self + (bias + pow2_23);
1298    let c = cast::<_, i32x4>(a) << 23;
1299    cast::<_, f32x4>(c)
1300  }
1301
  /// Calculates `e^x` (the exponential function) for each lane of the `f32x4`.
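  ///
  /// The result is a polynomial approximation with roughly single-precision
  /// accuracy, so compare against a tolerance (illustrative):
  ///
  /// ```
  /// use wide::f32x4; // assumes the crate-root re-export
  /// let y = f32x4::from([0.0, 1.0, -1.0, 2.0]).exp().to_array();
  /// let e = core::f32::consts::E;
  /// let want = [1.0, e, 1.0 / e, e * e];
  /// for i in 0..4 {
  ///   assert!((y[i] - want[i]).abs() < 1e-4);
  /// }
  /// ```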
1303  #[inline]
1304  #[must_use]
1305  pub fn exp(self) -> Self {
1306    const_f32_as_f32x4!(P0, 1.0 / 2.0);
1307    const_f32_as_f32x4!(P1, 1.0 / 6.0);
1308    const_f32_as_f32x4!(P2, 1. / 24.);
1309    const_f32_as_f32x4!(P3, 1. / 120.);
1310    const_f32_as_f32x4!(P4, 1. / 720.);
1311    const_f32_as_f32x4!(P5, 1. / 5040.);
1312    const_f32_as_f32x4!(LN2D_HI, 0.693359375);
1313    const_f32_as_f32x4!(LN2D_LO, -2.12194440e-4);
1314    let max_x = f32x4::from(87.3);
1315    let r = (self * Self::LOG2_E).round();
1316    let x = r.mul_neg_add(LN2D_HI, self);
1317    let x = r.mul_neg_add(LN2D_LO, x);
1318    let z = polynomial_5!(x, P0, P1, P2, P3, P4, P5);
1319    let x2 = x * x;
1320    let z = z.mul_add(x2, x);
1321    let n2 = Self::vm_pow2n(r);
1322    let z = (z + Self::ONE) * n2;
1323    // check for overflow
1324    let in_range = self.abs().cmp_lt(max_x);
1325    let in_range = in_range & self.is_finite();
1326    in_range.blend(z, Self::ZERO)
1327  }
1328
1329  #[inline]
1330  fn exponent(self) -> f32x4 {
1331    const_f32_as_f32x4!(pow2_23, 8388608.0);
1332    const_f32_as_f32x4!(bias, 127.0);
1333    let a = cast::<_, u32x4>(self);
1334    let b = a >> 23;
1335    let c = b | cast::<_, u32x4>(pow2_23);
1336    let d = cast::<_, f32x4>(c);
1337    let e = d - (pow2_23 + bias);
1338    e
1339  }
1340
1341  #[inline]
1342  fn fraction_2(self) -> Self {
1343    let t1 = cast::<_, u32x4>(self);
1344    let t2 = cast::<_, u32x4>(
1345      (t1 & u32x4::from(0x007FFFFF)) | u32x4::from(0x3F000000),
1346    );
1347    cast::<_, f32x4>(t2)
1348  }
1349  #[inline]
1350  fn is_zero_or_subnormal(self) -> Self {
1351    let t = cast::<_, i32x4>(self);
1352    let t = t & i32x4::splat(0x7F800000);
1353    i32x4::round_float(t.cmp_eq(i32x4::splat(0)))
1354  }
1355  #[inline]
1356  fn infinity() -> Self {
1357    cast::<_, f32x4>(i32x4::splat(0x7F800000))
1358  }
1359  #[inline]
1360  fn nan_log() -> Self {
1361    cast::<_, f32x4>(i32x4::splat(0x7FC00000 | 0x101 & 0x003FFFFF))
1362  }
1363  #[inline]
1364  fn nan_pow() -> Self {
1365    cast::<_, f32x4>(i32x4::splat(0x7FC00000 | 0x101 & 0x003FFFFF))
1366  }
1367  #[inline]
1368  pub fn sign_bit(self) -> Self {
1369    let t1 = cast::<_, i32x4>(self);
1370    let t2 = t1 >> 31;
1371    !cast::<_, f32x4>(t2).cmp_eq(f32x4::ZERO)
1372  }
1373
  /// Horizontal add of all the lanes of the vector.
1375  #[inline]
1376  #[must_use]
1377  pub fn reduce_add(self) -> f32 {
1378    let arr: [f32; 4] = cast(self);
1379    arr.iter().sum()
1380  }
1381
1382  /// Natural log (ln(x))
1383  #[inline]
1384  #[must_use]
1385  pub fn ln(self) -> Self {
1386    const_f32_as_f32x4!(HALF, 0.5);
1387    const_f32_as_f32x4!(P0, 3.3333331174E-1);
1388    const_f32_as_f32x4!(P1, -2.4999993993E-1);
1389    const_f32_as_f32x4!(P2, 2.0000714765E-1);
1390    const_f32_as_f32x4!(P3, -1.6668057665E-1);
1391    const_f32_as_f32x4!(P4, 1.4249322787E-1);
1392    const_f32_as_f32x4!(P5, -1.2420140846E-1);
1393    const_f32_as_f32x4!(P6, 1.1676998740E-1);
1394    const_f32_as_f32x4!(P7, -1.1514610310E-1);
1395    const_f32_as_f32x4!(P8, 7.0376836292E-2);
1396    const_f32_as_f32x4!(LN2F_HI, 0.693359375);
1397    const_f32_as_f32x4!(LN2F_LO, -2.12194440e-4);
1398    const_f32_as_f32x4!(VM_SMALLEST_NORMAL, 1.17549435E-38);
1399
1400    let x1 = self;
1401    let x = Self::fraction_2(x1);
1402    let e = Self::exponent(x1);
1403    let mask = x.cmp_gt(Self::SQRT_2 * HALF);
1404    let x = (!mask).blend(x + x, x);
1405    let fe = mask.blend(e + Self::ONE, e);
1406    let x = x - Self::ONE;
1407    let res = polynomial_8!(x, P0, P1, P2, P3, P4, P5, P6, P7, P8);
1408    let x2 = x * x;
1409    let res = x2 * x * res;
1410    let res = fe.mul_add(LN2F_LO, res);
1411    let res = res + x2.mul_neg_add(HALF, x);
1412    let res = fe.mul_add(LN2F_HI, res);
1413    let overflow = !self.is_finite();
1414    let underflow = x1.cmp_lt(VM_SMALLEST_NORMAL);
1415    let mask = overflow | underflow;
1416    if !mask.any() {
1417      res
1418    } else {
1419      let is_zero = self.is_zero_or_subnormal();
1420      let res = underflow.blend(Self::nan_log(), res);
1421      let res = is_zero.blend(Self::infinity(), res);
1422      let res = overflow.blend(self, res);
1423      res
1424    }
1425  }
1426
1427  #[inline]
1428  #[must_use]
1429  pub fn log2(self) -> Self {
1430    Self::ln(self) * Self::LOG2_E
1431  }
1432  #[inline]
1433  #[must_use]
1434  pub fn log10(self) -> Self {
1435    Self::ln(self) * Self::LOG10_E
1436  }
1437
1438  #[inline]
1439  #[must_use]
1440  pub fn pow_f32x4(self, y: f32x4) -> Self {
1441    const_f32_as_f32x4!(ln2f_hi, 0.693359375);
1442    const_f32_as_f32x4!(ln2f_lo, -2.12194440e-4);
1443    const_f32_as_f32x4!(P0logf, 3.3333331174E-1);
1444    const_f32_as_f32x4!(P1logf, -2.4999993993E-1);
1445    const_f32_as_f32x4!(P2logf, 2.0000714765E-1);
1446    const_f32_as_f32x4!(P3logf, -1.6668057665E-1);
1447    const_f32_as_f32x4!(P4logf, 1.4249322787E-1);
1448    const_f32_as_f32x4!(P5logf, -1.2420140846E-1);
1449    const_f32_as_f32x4!(P6logf, 1.1676998740E-1);
1450    const_f32_as_f32x4!(P7logf, -1.1514610310E-1);
1451    const_f32_as_f32x4!(P8logf, 7.0376836292E-2);
1452
1453    const_f32_as_f32x4!(p2expf, 1.0 / 2.0); // coefficients for Taylor expansion of exp
1454    const_f32_as_f32x4!(p3expf, 1.0 / 6.0);
1455    const_f32_as_f32x4!(p4expf, 1.0 / 24.0);
1456    const_f32_as_f32x4!(p5expf, 1.0 / 120.0);
1457    const_f32_as_f32x4!(p6expf, 1.0 / 720.0);
1458    const_f32_as_f32x4!(p7expf, 1.0 / 5040.0);
1459
1460    let x1 = self.abs();
1461    let x = x1.fraction_2();
1462
1463    let mask = x.cmp_gt(f32x4::SQRT_2 * f32x4::HALF);
1464    let x = (!mask).blend(x + x, x);
1465
1466    let x = x - f32x4::ONE;
1467    let x2 = x * x;
1468    let lg1 = polynomial_8!(
1469      x, P0logf, P1logf, P2logf, P3logf, P4logf, P5logf, P6logf, P7logf, P8logf
1470    );
1471    let lg1 = lg1 * x2 * x;
1472
1473    let ef = x1.exponent();
1474    let ef = mask.blend(ef + f32x4::ONE, ef);
1475
1476    let e1 = (ef * y).round();
1477    let yr = ef.mul_sub(y, e1);
1478
1479    let lg = f32x4::HALF.mul_neg_add(x2, x) + lg1;
1480    let x2_err = (f32x4::HALF * x).mul_sub(x, f32x4::HALF * x2);
1481    let lg_err = f32x4::HALF.mul_add(x2, lg - x) - lg1;
1482
1483    let e2 = (lg * y * f32x4::LOG2_E).round();
1484    let v = lg.mul_sub(y, e2 * ln2f_hi);
1485    let v = e2.mul_neg_add(ln2f_lo, v);
1486    let v = v - (lg_err + x2_err).mul_sub(y, yr * f32x4::LN_2);
1487
1488    let x = v;
1489    let e3 = (x * f32x4::LOG2_E).round();
1490    let x = e3.mul_neg_add(f32x4::LN_2, x);
1491    let x2 = x * x;
1492    let z = x2.mul_add(
1493      polynomial_5!(x, p2expf, p3expf, p4expf, p5expf, p6expf, p7expf),
1494      x + f32x4::ONE,
1495    );
1496
1497    let ee = e1 + e2 + e3;
1498    let ei = cast::<_, i32x4>(ee.round_int());
1499    let ej = cast::<_, i32x4>(ei + (cast::<_, i32x4>(z) >> 23));
1500
1501    let overflow = cast::<_, f32x4>(ej.cmp_gt(i32x4::splat(0x0FF)))
1502      | (ee.cmp_gt(f32x4::splat(300.0)));
1503    let underflow = cast::<_, f32x4>(ej.cmp_lt(i32x4::splat(0x000)))
1504      | (ee.cmp_lt(f32x4::splat(-300.0)));
1505
1506    // Add exponent by integer addition
1507    let z = cast::<_, f32x4>(cast::<_, i32x4>(z) + (ei << 23));
1508
1509    // Check for overflow/underflow
1510    let z = if (overflow | underflow).any() {
1511      let z = underflow.blend(f32x4::ZERO, z);
1512      overflow.blend(Self::infinity(), z)
1513    } else {
1514      z
1515    };
1516
1517    // Check for self == 0
1518    let x_zero = self.is_zero_or_subnormal();
1519    let z = x_zero.blend(
1520      y.cmp_lt(f32x4::ZERO).blend(
1521        Self::infinity(),
1522        y.cmp_eq(f32x4::ZERO).blend(f32x4::ONE, f32x4::ZERO),
1523      ),
1524      z,
1525    );
1526
1527    let x_sign = self.sign_bit();
1528    let z = if x_sign.any() {
      // is y an integer?
1530      let yi = y.cmp_eq(y.round());
1531      // Is y odd?
1532      let y_odd = cast::<_, i32x4>(y.round_int() << 31).round_float();
1533
1534      let z1 =
1535        yi.blend(z | y_odd, self.cmp_eq(Self::ZERO).blend(z, Self::nan_pow()));
1536      x_sign.blend(z1, z)
1537    } else {
1538      z
1539    };
1540
1541    let x_finite = self.is_finite();
1542    let y_finite = y.is_finite();
1543    let e_finite = ee.is_finite();
1544    if (x_finite & y_finite & (e_finite | x_zero)).all() {
1545      return z;
1546    }
1547
1548    (self.is_nan() | y.is_nan()).blend(self + y, z)
1549  }
1550
1551  #[inline]
1552  pub fn powf(self, y: f32) -> Self {
1553    Self::pow_f32x4(self, f32x4::splat(y))
1554  }
1555
1556  #[inline]
1557  pub fn to_array(self) -> [f32; 4] {
1558    cast(self)
1559  }
1560
1561  #[inline]
1562  pub fn as_array_ref(&self) -> &[f32; 4] {
1563    cast_ref(self)
1564  }
1565
1566  #[inline]
1567  pub fn as_array_mut(&mut self) -> &mut [f32; 4] {
1568    cast_mut(self)
1569  }
1570
1571  #[inline]
1572  pub fn from_i32x4(v: i32x4) -> Self {
1573    pick! {
1574      if #[cfg(target_feature="sse2")] {
1575        Self { sse: convert_to_m128_from_i32_m128i(v.sse) }
1576      } else if #[cfg(target_feature="simd128")] {
1577        Self { simd: f32x4_convert_i32x4(v.simd) }
1578      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
1579        Self { neon: unsafe { vcvtq_f32_s32(v.neon) }}
1580      } else {
1581        Self { arr: [
1582            v.as_array_ref()[0] as f32,
1583            v.as_array_ref()[1] as f32,
1584            v.as_array_ref()[2] as f32,
1585            v.as_array_ref()[3] as f32,
1586          ] }
1587      }
1588    }
1589  }
1590}