// wide/f32x8_.rs

use super::*;

pick! {
  if #[cfg(target_feature="avx")] {
    #[derive(Default, Clone, Copy, PartialEq)]
    #[repr(C, align(32))]
    pub struct f32x8 { avx: m256 }
  } else {
    #[derive(Default, Clone, Copy, PartialEq)]
    #[repr(C, align(32))]
    pub struct f32x8 { a : f32x4, b : f32x4 }
  }
}

macro_rules! const_f32_as_f32x8 {
  ($i:ident, $f:expr) => {
    #[allow(non_upper_case_globals)]
    pub const $i: f32x8 =
      unsafe { ConstUnionHack256bit { f32a8: [$f; 8] }.f32x8 };
  };
}

impl f32x8 {
  const_f32_as_f32x8!(ONE, 1.0);
  const_f32_as_f32x8!(HALF, 0.5);
  const_f32_as_f32x8!(ZERO, 0.0);
  const_f32_as_f32x8!(E, core::f32::consts::E);
  const_f32_as_f32x8!(FRAC_1_PI, core::f32::consts::FRAC_1_PI);
  const_f32_as_f32x8!(FRAC_2_PI, core::f32::consts::FRAC_2_PI);
  const_f32_as_f32x8!(FRAC_2_SQRT_PI, core::f32::consts::FRAC_2_SQRT_PI);
  const_f32_as_f32x8!(FRAC_1_SQRT_2, core::f32::consts::FRAC_1_SQRT_2);
  const_f32_as_f32x8!(FRAC_PI_2, core::f32::consts::FRAC_PI_2);
  const_f32_as_f32x8!(FRAC_PI_3, core::f32::consts::FRAC_PI_3);
  const_f32_as_f32x8!(FRAC_PI_4, core::f32::consts::FRAC_PI_4);
  const_f32_as_f32x8!(FRAC_PI_6, core::f32::consts::FRAC_PI_6);
  const_f32_as_f32x8!(FRAC_PI_8, core::f32::consts::FRAC_PI_8);
  const_f32_as_f32x8!(LN_2, core::f32::consts::LN_2);
  const_f32_as_f32x8!(LN_10, core::f32::consts::LN_10);
  const_f32_as_f32x8!(LOG2_E, core::f32::consts::LOG2_E);
  const_f32_as_f32x8!(LOG10_E, core::f32::consts::LOG10_E);
  const_f32_as_f32x8!(LOG10_2, core::f32::consts::LOG10_2);
  const_f32_as_f32x8!(LOG2_10, core::f32::consts::LOG2_10);
  const_f32_as_f32x8!(PI, core::f32::consts::PI);
  const_f32_as_f32x8!(SQRT_2, core::f32::consts::SQRT_2);
  const_f32_as_f32x8!(TAU, core::f32::consts::TAU);
}

unsafe impl Zeroable for f32x8 {}
unsafe impl Pod for f32x8 {}

impl Add for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn add(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: add_m256(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.add(rhs.a),
          b : self.b.add(rhs.b),
        }
      }
    }
  }
}

impl Sub for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn sub(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: sub_m256(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.sub(rhs.a),
          b : self.b.sub(rhs.b),
        }
      }
    }
  }
}

impl Mul for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn mul(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: mul_m256(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.mul(rhs.a),
          b : self.b.mul(rhs.b),
        }
      }
    }
  }
}

impl Div for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn div(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: div_m256(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.div(rhs.a),
          b : self.b.div(rhs.b),
        }
      }
    }
  }
}

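/// Scalar `f32` operands broadcast across all eight lanes (here and in the
/// `Sub`/`Mul`/`Div` impls below). A quick usage sketch, assuming the crate
/// root re-exports `f32x8` (as the `wide` crate does):
///
/// ```
/// # use wide::*;
/// // splat-based broadcast: every lane gets 1.0 + 2.0
/// let v = f32x8::splat(1.0) + 2.0;
/// assert_eq!(v.to_array(), [3.0; 8]);
/// ```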
impl Add<f32> for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn add(self, rhs: f32) -> Self::Output {
    self.add(Self::splat(rhs))
  }
}

impl Sub<f32> for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn sub(self, rhs: f32) -> Self::Output {
    self.sub(Self::splat(rhs))
  }
}

impl Mul<f32> for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn mul(self, rhs: f32) -> Self::Output {
    self.mul(Self::splat(rhs))
  }
}

impl Div<f32> for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn div(self, rhs: f32) -> Self::Output {
    self.div(Self::splat(rhs))
  }
}

impl Add<f32x8> for f32 {
  type Output = f32x8;
  #[inline]
  #[must_use]
  fn add(self, rhs: f32x8) -> Self::Output {
    f32x8::splat(self).add(rhs)
  }
}

impl Sub<f32x8> for f32 {
  type Output = f32x8;
  #[inline]
  #[must_use]
  fn sub(self, rhs: f32x8) -> Self::Output {
    f32x8::splat(self).sub(rhs)
  }
}

impl Mul<f32x8> for f32 {
  type Output = f32x8;
  #[inline]
  #[must_use]
  fn mul(self, rhs: f32x8) -> Self::Output {
    f32x8::splat(self).mul(rhs)
  }
}

impl Div<f32x8> for f32 {
  type Output = f32x8;
  #[inline]
  #[must_use]
  fn div(self, rhs: f32x8) -> Self::Output {
    f32x8::splat(self).div(rhs)
  }
}

impl BitAnd for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn bitand(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: bitand_m256(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.bitand(rhs.a),
          b : self.b.bitand(rhs.b),
        }
      }
    }
  }
}

impl BitOr for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn bitor(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: bitor_m256(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.bitor(rhs.a),
          b : self.b.bitor(rhs.b),
        }
      }
    }
  }
}

impl BitXor for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn bitxor(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: bitxor_m256(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.bitxor(rhs.a),
          b : self.b.bitxor(rhs.b),
        }
      }
    }
  }
}

impl CmpEq for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn cmp_eq(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: cmp_op_mask_m256::<{cmp_op!(EqualOrdered)}>(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.cmp_eq(rhs.a),
          b : self.b.cmp_eq(rhs.b),
        }
      }
    }
  }
}

impl CmpGe for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn cmp_ge(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: cmp_op_mask_m256::<{cmp_op!(GreaterEqualOrdered)}>(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.cmp_ge(rhs.a),
          b : self.b.cmp_ge(rhs.b),
        }
      }
    }
  }
}

impl CmpGt for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn cmp_gt(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: cmp_op_mask_m256::<{cmp_op!(GreaterThanOrdered)}>(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.cmp_gt(rhs.a),
          b : self.b.cmp_gt(rhs.b),
        }
      }
    }
  }
}

impl CmpNe for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn cmp_ne(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: cmp_op_mask_m256::<{cmp_op!(NotEqualOrdered)}>(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.cmp_ne(rhs.a),
          b : self.b.cmp_ne(rhs.b),
        }
      }
    }
  }
}

impl CmpLe for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn cmp_le(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: cmp_op_mask_m256::<{cmp_op!(LessEqualOrdered)}>(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.cmp_le(rhs.a),
          b : self.b.cmp_le(rhs.b),
        }
      }
    }
  }
}

impl CmpLt for f32x8 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn cmp_lt(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: cmp_op_mask_m256::<{cmp_op!(LessThanOrdered)}>(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.cmp_lt(rhs.a),
          b : self.b.cmp_lt(rhs.b),
        }
      }
    }
  }
}

impl f32x8 {
  #[inline]
  #[must_use]
  pub fn new(array: [f32; 8]) -> Self {
    Self::from(array)
  }
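  /// Lanewise select: for each lane, where the mask lane of `self` is all
  /// ones the lane comes from `t`, otherwise from `f`. A small usage sketch,
  /// assuming the crate root re-exports `f32x8`:
  ///
  /// ```
  /// # use wide::*;
  /// let mask = f32x8::splat(1.0).cmp_lt(f32x8::splat(2.0)); // all lanes true
  /// let picked = mask.blend(f32x8::splat(10.0), f32x8::splat(20.0));
  /// assert_eq!(picked.to_array(), [10.0; 8]);
  /// ```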
  #[inline]
  #[must_use]
  pub fn blend(self, t: Self, f: Self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: blend_varying_m256(f.avx, t.avx, self.avx) }
      } else {
        Self {
          a : self.a.blend(t.a, f.a),
          b : self.b.blend(t.b, f.b),
        }
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn abs(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        let non_sign_bits = f32x8::from(f32::from_bits(i32::MAX as u32));
        self & non_sign_bits
      } else {
        Self {
          a : self.a.abs(),
          b : self.b.abs(),
        }
      }
    }
  }

  /// Calculates the lanewise maximum of both vectors. This is a faster
  /// implementation than `max`, but the result is unspecified if NaNs are
  /// involved.
  #[inline]
  #[must_use]
  pub fn fast_max(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: max_m256(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.fast_max(rhs.a),
          b : self.b.fast_max(rhs.b),
        }
      }
    }
  }

  /// Calculates the lanewise maximum of both vectors. If either lane is NaN,
  /// the other lane gets chosen. Use `fast_max` for a faster implementation
  /// that doesn't handle NaNs.
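  ///
  /// A hedged sanity check of the NaN rule (assuming the crate root
  /// re-exports `f32x8`):
  ///
  /// ```
  /// # use wide::*;
  /// let a = f32x8::from([1.0, f32::NAN, 5.0, 0.0, 0.0, 0.0, 0.0, 0.0]);
  /// let b = f32x8::from([2.0, 3.0, f32::NAN, 0.0, 0.0, 0.0, 0.0, 0.0]);
  /// // the non-NaN lane wins whenever only one side is NaN
  /// assert_eq!(a.max(b).to_array(), [2.0, 3.0, 5.0, 0.0, 0.0, 0.0, 0.0, 0.0]);
  /// ```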
  #[inline]
  #[must_use]
  pub fn max(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        // max_m256 seems to do rhs < self ? self : rhs. So if there's any NaN
        // involved, it chooses rhs, so we need to specifically check rhs for
        // NaN.
        rhs.is_nan().blend(self, Self { avx: max_m256(self.avx, rhs.avx) })
      } else {
        Self {
          a : self.a.max(rhs.a),
          b : self.b.max(rhs.b),
        }
      }
    }
  }

  /// Calculates the lanewise minimum of both vectors. This is a faster
  /// implementation than `min`, but the result is unspecified if NaNs are
  /// involved.
  #[inline]
  #[must_use]
  pub fn fast_min(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: min_m256(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.fast_min(rhs.a),
          b : self.b.fast_min(rhs.b),
        }
      }
    }
  }

  /// Calculates the lanewise minimum of both vectors. If either lane is NaN,
  /// the other lane gets chosen. Use `fast_min` for a faster implementation
  /// that doesn't handle NaNs.
  #[inline]
  #[must_use]
  pub fn min(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        // min_m256 seems to do rhs > self ? self : rhs. So if there's any NaN
        // involved, it chooses rhs, so we need to specifically check rhs for
        // NaN.
        rhs.is_nan().blend(self, Self { avx: min_m256(self.avx, rhs.avx) })
      } else {
        Self {
          a : self.a.min(rhs.a),
          b : self.b.min(rhs.b),
        }
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn is_nan(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: cmp_op_mask_m256::<{cmp_op!(Unordered)}>(self.avx, self.avx) }
      } else {
        Self {
          a : self.a.is_nan(),
          b : self.b.is_nan(),
        }
      }
    }
  }
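  /// Lanewise mask of which lanes are finite (neither infinite nor NaN).
  /// An illustrative sketch, assuming the crate root re-exports `f32x8`:
  ///
  /// ```
  /// # use wide::*;
  /// let v = f32x8::from([1.0, f32::INFINITY, f32::NAN, 0.0, 0.0, 0.0, 0.0, 0.0]);
  /// // lanes 1 and 2 are not finite, so bits 1 and 2 are clear
  /// assert_eq!(v.is_finite().move_mask(), 0b11111001);
  /// ```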
  #[inline]
  #[must_use]
  pub fn is_finite(self) -> Self {
    let shifted_exp_mask = u32x8::from(0xFF000000);
    let u: u32x8 = cast(self);
    let shift_u = u << 1_u64;
    let out = !(shift_u & shifted_exp_mask).cmp_eq(shifted_exp_mask);
    cast(out)
  }
  #[inline]
  #[must_use]
  pub fn is_inf(self) -> Self {
    let shifted_inf = u32x8::from(0xFF000000);
    let u: u32x8 = cast(self);
    let shift_u = u << 1_u64;
    let out = (shift_u).cmp_eq(shifted_inf);
    cast(out)
  }

  #[inline]
  #[must_use]
  pub fn round(self) -> Self {
    pick! {
      // NOTE: Is there an SSE2 version of this? The f32x4 version probably
      // translates, but I've not had time to figure it out.
      if #[cfg(target_feature="avx")] {
        Self { avx: round_m256::<{round_op!(Nearest)}>(self.avx) }
      } else {
        Self {
          a : self.a.round(),
          b : self.b.round(),
        }
      }
    }
  }

  /// Rounds each lane into an integer. This is a faster implementation than
  /// `round_int`, but it doesn't handle out of range values or NaNs. For those
  /// values you get implementation defined behavior.
  #[inline]
  #[must_use]
  pub fn fast_round_int(self) -> i32x8 {
    pick! {
      if #[cfg(target_feature="avx")] {
        cast(convert_to_i32_m256i_from_m256(self.avx))
      } else {
        cast([
          self.a.fast_round_int(),
          self.b.fast_round_int(),
        ])
      }
    }
  }

  /// Rounds each lane into an integer. This saturates out of range values and
  /// turns NaNs into 0. Use `fast_round_int` for a faster implementation that
  /// doesn't handle out of range values or NaNs.
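  ///
  /// A hedged sketch of the saturating behavior, assuming the crate root
  /// re-exports `f32x8` (rounding is to nearest even):
  ///
  /// ```
  /// # use wide::*;
  /// let v = f32x8::from([1.5, 2.5, -2.5, f32::NAN, 3.0e9, -3.0e9, 0.0, 0.0]);
  /// assert_eq!(
  ///   v.round_int().to_array(),
  ///   [2, 2, -2, 0, i32::MAX, i32::MIN, 0, 0]
  /// );
  /// ```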
  #[inline]
  #[must_use]
  pub fn round_int(self) -> i32x8 {
    pick! {
      if #[cfg(target_feature="avx")] {
        // Based on: https://github.com/v8/v8/blob/210987a552a2bf2a854b0baa9588a5959ff3979d/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h#L489-L504
        let non_nan_mask = self.cmp_eq(self);
        let non_nan = self & non_nan_mask;
        let flip_to_max: i32x8 = cast(self.cmp_ge(Self::splat(2147483648.0)));
        let cast: i32x8 = cast(convert_to_i32_m256i_from_m256(non_nan.avx));
        flip_to_max ^ cast
      } else {
        cast([
          self.a.round_int(),
          self.b.round_int(),
        ])
      }
    }
  }

  /// Truncates each lane into an integer. This is a faster implementation than
  /// `trunc_int`, but it doesn't handle out of range values or NaNs. For those
  /// values you get implementation defined behavior.
  #[inline]
  #[must_use]
  pub fn fast_trunc_int(self) -> i32x8 {
    pick! {
      if #[cfg(target_feature="avx")] {
        cast(convert_truncate_to_i32_m256i_from_m256(self.avx))
      } else {
        cast([
          self.a.fast_trunc_int(),
          self.b.fast_trunc_int(),
        ])
      }
    }
  }

  /// Truncates each lane into an integer. This saturates out of range values
  /// and turns NaNs into 0. Use `fast_trunc_int` for a faster implementation
  /// that doesn't handle out of range values or NaNs.
  #[inline]
  #[must_use]
  pub fn trunc_int(self) -> i32x8 {
    pick! {
      if #[cfg(target_feature="avx")] {
        // Based on: https://github.com/v8/v8/blob/210987a552a2bf2a854b0baa9588a5959ff3979d/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h#L489-L504
        let non_nan_mask = self.cmp_eq(self);
        let non_nan = self & non_nan_mask;
        let flip_to_max: i32x8 = cast(self.cmp_ge(Self::splat(2147483648.0)));
        let cast: i32x8 = cast(convert_truncate_to_i32_m256i_from_m256(non_nan.avx));
        flip_to_max ^ cast
      } else {
        cast([
          self.a.trunc_int(),
          self.b.trunc_int(),
        ])
      }
    }
  }
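  /// Fused multiply-add, `(self * m) + a`, with a single rounding when an FMA
  /// unit is available, otherwise computed as a separate multiply and add. A
  /// quick sketch, assuming the crate root re-exports `f32x8`:
  ///
  /// ```
  /// # use wide::*;
  /// let v = f32x8::splat(2.0).mul_add(f32x8::splat(3.0), f32x8::splat(4.0));
  /// assert_eq!(v.to_array(), [10.0; 8]);
  /// ```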
  #[inline]
  #[must_use]
  pub fn mul_add(self, m: Self, a: Self) -> Self {
    pick! {
      if #[cfg(all(target_feature="avx",target_feature="fma"))] {
        Self { avx: fused_mul_add_m256(self.avx, m.avx, a.avx) }
      } else if #[cfg(target_feature="avx")] {
        // still want to use 256 bit ops
        (self * m) + a
      } else {
        Self {
          a : self.a.mul_add(m.a, a.a),
          b : self.b.mul_add(m.b, a.b),
        }
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn mul_sub(self, m: Self, a: Self) -> Self {
    pick! {
      if #[cfg(all(target_feature="avx",target_feature="fma"))] {
        Self { avx: fused_mul_sub_m256(self.avx, m.avx, a.avx) }
      } else if #[cfg(target_feature="avx")] {
        // still want to use 256 bit ops
        (self * m) - a
      } else {
        Self {
          a : self.a.mul_sub(m.a, a.a),
          b : self.b.mul_sub(m.b, a.b),
        }
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn mul_neg_add(self, m: Self, a: Self) -> Self {
    pick! {
      if #[cfg(all(target_feature="avx",target_feature="fma"))] {
        Self { avx: fused_mul_neg_add_m256(self.avx, m.avx, a.avx) }
      } else if #[cfg(target_feature="avx")] {
        // still want to use 256 bit ops
        a - (self * m)
      } else {
        Self {
          a : self.a.mul_neg_add(m.a, a.a),
          b : self.b.mul_neg_add(m.b, a.b),
        }
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn mul_neg_sub(self, m: Self, a: Self) -> Self {
    pick! {
      if #[cfg(all(target_feature="avx",target_feature="fma"))] {
        Self { avx: fused_mul_neg_sub_m256(self.avx, m.avx, a.avx) }
      } else if #[cfg(target_feature="avx")] {
        // still want to use 256 bit ops
        -(self * m) - a
      } else {
        Self {
          a : self.a.mul_neg_sub(m.a, a.a),
          b : self.b.mul_neg_sub(m.b, a.b),
        }
      }
    }
  }

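  /// Flips the sign of each lane of `self` wherever the matching lane of
  /// `signs` is negative. An illustrative sketch, assuming the crate root
  /// re-exports `f32x8`:
  ///
  /// ```
  /// # use wide::*;
  /// let v = f32x8::splat(1.0).flip_signs(f32x8::splat(-2.0));
  /// assert_eq!(v.to_array(), [-1.0; 8]);
  /// ```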
  #[inline]
  #[must_use]
  pub fn flip_signs(self, signs: Self) -> Self {
    self ^ (signs & Self::from(-0.0))
  }

  #[inline]
  #[must_use]
  pub fn copysign(self, sign: Self) -> Self {
    let magnitude_mask = Self::from(f32::from_bits(u32::MAX >> 1));
    (self & magnitude_mask) | (sign & Self::from(-0.0))
  }

  #[inline]
  pub fn asin_acos(self) -> (Self, Self) {
    // Based on the Agner Fog "vector class library":
    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
    const_f32_as_f32x8!(P4asinf, 4.2163199048E-2);
    const_f32_as_f32x8!(P3asinf, 2.4181311049E-2);
    const_f32_as_f32x8!(P2asinf, 4.5470025998E-2);
    const_f32_as_f32x8!(P1asinf, 7.4953002686E-2);
    const_f32_as_f32x8!(P0asinf, 1.6666752422E-1);

    let xa = self.abs();
    let big = xa.cmp_ge(f32x8::splat(0.5));

    let x1 = f32x8::splat(0.5) * (f32x8::ONE - xa);
    let x2 = xa * xa;
    let x3 = big.blend(x1, x2);

    let xb = x1.sqrt();

    let x4 = big.blend(xb, xa);

    let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
    let z = z.mul_add(x3 * x4, x4);

    let z1 = z + z;

    // acos
    let z3 = self.cmp_lt(f32x8::ZERO).blend(f32x8::PI - z1, z1);
    let z4 = f32x8::FRAC_PI_2 - z.flip_signs(self);
    let acos = big.blend(z3, z4);

    // asin
    let z3 = f32x8::FRAC_PI_2 - z1;
    let asin = big.blend(z3, z);
    let asin = asin.flip_signs(self);

    (asin, acos)
  }

  #[inline]
  #[must_use]
  pub fn asin(self) -> Self {
    // Based on the Agner Fog "vector class library":
    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
    const_f32_as_f32x8!(P4asinf, 4.2163199048E-2);
    const_f32_as_f32x8!(P3asinf, 2.4181311049E-2);
    const_f32_as_f32x8!(P2asinf, 4.5470025998E-2);
    const_f32_as_f32x8!(P1asinf, 7.4953002686E-2);
    const_f32_as_f32x8!(P0asinf, 1.6666752422E-1);

    let xa = self.abs();
    let big = xa.cmp_ge(f32x8::splat(0.5));

    let x1 = f32x8::splat(0.5) * (f32x8::ONE - xa);
    let x2 = xa * xa;
    let x3 = big.blend(x1, x2);

    let xb = x1.sqrt();

    let x4 = big.blend(xb, xa);

    let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
    let z = z.mul_add(x3 * x4, x4);

    let z1 = z + z;

    // asin
    let z3 = f32x8::FRAC_PI_2 - z1;
    let asin = big.blend(z3, z);
    let asin = asin.flip_signs(self);

    asin
  }

  #[inline]
  #[must_use]
  pub fn acos(self) -> Self {
    // Based on the Agner Fog "vector class library":
    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
    const_f32_as_f32x8!(P4asinf, 4.2163199048E-2);
    const_f32_as_f32x8!(P3asinf, 2.4181311049E-2);
    const_f32_as_f32x8!(P2asinf, 4.5470025998E-2);
    const_f32_as_f32x8!(P1asinf, 7.4953002686E-2);
    const_f32_as_f32x8!(P0asinf, 1.6666752422E-1);

    let xa = self.abs();
    let big = xa.cmp_ge(f32x8::splat(0.5));

    let x1 = f32x8::splat(0.5) * (f32x8::ONE - xa);
    let x2 = xa * xa;
    let x3 = big.blend(x1, x2);

    let xb = x1.sqrt();

    let x4 = big.blend(xb, xa);

    let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
    let z = z.mul_add(x3 * x4, x4);

    let z1 = z + z;

    // acos
    let z3 = self.cmp_lt(f32x8::ZERO).blend(f32x8::PI - z1, z1);
    let z4 = f32x8::FRAC_PI_2 - z.flip_signs(self);
    let acos = big.blend(z3, z4);

    acos
  }

  #[inline]
  pub fn atan(self) -> Self {
    // Based on the Agner Fog "vector class library":
    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
    const_f32_as_f32x8!(P3atanf, 8.05374449538E-2);
    const_f32_as_f32x8!(P2atanf, -1.38776856032E-1);
    const_f32_as_f32x8!(P1atanf, 1.99777106478E-1);
    const_f32_as_f32x8!(P0atanf, -3.33329491539E-1);

    let t = self.abs();

    // small:  z = t / 1.0;
    // medium: z = (t-1.0) / (t+1.0);
    // big:    z = -1.0 / t;
    let notsmal = t.cmp_ge(Self::SQRT_2 - Self::ONE);
    let notbig = t.cmp_le(Self::SQRT_2 + Self::ONE);

    let mut s = notbig.blend(Self::FRAC_PI_4, Self::FRAC_PI_2);
    s = notsmal & s;

    let mut a = notbig & t;
    a = notsmal.blend(a - Self::ONE, a);
    let mut b = notbig & Self::ONE;
    b = notsmal.blend(b + t, b);
    let z = a / b;

    let zz = z * z;

    // Taylor expansion
    let mut re = polynomial_3!(zz, P0atanf, P1atanf, P2atanf, P3atanf);
    re = re.mul_add(zz * z, z) + s;

    // get sign bit
    re = (self.sign_bit()).blend(-re, re);

    re
  }

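  /// Four-quadrant arctangent: treats `self` as the y coordinates and `x` as
  /// the x coordinates, lanewise. A hedged accuracy sketch, assuming the
  /// crate root re-exports `f32x8`:
  ///
  /// ```
  /// # use wide::*;
  /// let r = f32x8::splat(1.0).atan2(f32x8::splat(1.0));
  /// assert!((r.to_array()[0] - core::f32::consts::FRAC_PI_4).abs() < 1e-6);
  /// ```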
  #[inline]
  pub fn atan2(self, x: Self) -> Self {
    // Based on the Agner Fog "vector class library":
    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
    const_f32_as_f32x8!(P3atanf, 8.05374449538E-2);
    const_f32_as_f32x8!(P2atanf, -1.38776856032E-1);
    const_f32_as_f32x8!(P1atanf, 1.99777106478E-1);
    const_f32_as_f32x8!(P0atanf, -3.33329491539E-1);

    let y = self;

    // move in first octant
    let x1 = x.abs();
    let y1 = y.abs();
    let swapxy = y1.cmp_gt(x1);
    // swap x and y if y1 > x1
    let mut x2 = swapxy.blend(y1, x1);
    let mut y2 = swapxy.blend(x1, y1);

    // check for special case: x and y are both +/- INF
    let both_infinite = x.is_inf() & y.is_inf();
    if both_infinite.any() {
      let minus_one = -Self::ONE;
      x2 = both_infinite.blend(x2 & minus_one, x2);
      y2 = both_infinite.blend(y2 & minus_one, y2);
    }

    // x = y = 0 will produce NAN. No problem, fixed below
    let t = y2 / x2;

    // small:  z = t / 1.0;
    // medium: z = (t-1.0) / (t+1.0);
    let notsmal = t.cmp_ge(Self::SQRT_2 - Self::ONE);

    let a = notsmal.blend(t - Self::ONE, t);
    let b = notsmal.blend(t + Self::ONE, Self::ONE);
    let s = notsmal & Self::FRAC_PI_4;
    let z = a / b;

    let zz = z * z;

    // Taylor expansion
    let mut re = polynomial_3!(zz, P0atanf, P1atanf, P2atanf, P3atanf);
    re = re.mul_add(zz * z, z) + s;

    // move back in place
    re = swapxy.blend(Self::FRAC_PI_2 - re, re);
    re = ((x | y).cmp_eq(Self::ZERO)).blend(Self::ZERO, re);
    re = (x.sign_bit()).blend(Self::PI - re, re);

    // get sign bit
    re = (y.sign_bit()).blend(-re, re);

    re
  }

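  /// Computes both the sine and cosine of each lane in one pass. A minimal
  /// sketch of an exact case, assuming the crate root re-exports `f32x8`:
  ///
  /// ```
  /// # use wide::*;
  /// let (s, c) = f32x8::ZERO.sin_cos();
  /// assert_eq!(s.to_array(), [0.0; 8]);
  /// assert_eq!(c.to_array(), [1.0; 8]);
  /// ```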
  #[inline]
  #[must_use]
  pub fn sin_cos(self) -> (Self, Self) {
    // Based on the Agner Fog "vector class library":
    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h

    const_f32_as_f32x8!(DP1F, 0.78515625_f32 * 2.0);
    const_f32_as_f32x8!(DP2F, 2.4187564849853515625E-4_f32 * 2.0);
    const_f32_as_f32x8!(DP3F, 3.77489497744594108E-8_f32 * 2.0);

    const_f32_as_f32x8!(P0sinf, -1.6666654611E-1);
    const_f32_as_f32x8!(P1sinf, 8.3321608736E-3);
    const_f32_as_f32x8!(P2sinf, -1.9515295891E-4);

    const_f32_as_f32x8!(P0cosf, 4.166664568298827E-2);
    const_f32_as_f32x8!(P1cosf, -1.388731625493765E-3);
    const_f32_as_f32x8!(P2cosf, 2.443315711809948E-5);

    const_f32_as_f32x8!(TWO_OVER_PI, 2.0 / core::f32::consts::PI);

    let xa = self.abs();

    // Find quadrant
    let y = (xa * TWO_OVER_PI).round();
    let q: i32x8 = y.round_int();

    let x = y.mul_neg_add(DP3F, y.mul_neg_add(DP2F, y.mul_neg_add(DP1F, xa)));

    let x2 = x * x;
    let mut s = polynomial_2!(x2, P0sinf, P1sinf, P2sinf) * (x * x2) + x;
    let mut c = polynomial_2!(x2, P0cosf, P1cosf, P2cosf) * (x2 * x2)
      + f32x8::from(0.5).mul_neg_add(x2, f32x8::from(1.0));

    let swap = !(q & i32x8::from(1)).cmp_eq(i32x8::from(0));

    let mut overflow: f32x8 = cast(q.cmp_gt(i32x8::from(0x2000000)));
    overflow &= xa.is_finite();
    s = overflow.blend(f32x8::from(0.0), s);
    c = overflow.blend(f32x8::from(1.0), c);

    // calc sin
    let mut sin1 = cast::<_, f32x8>(swap).blend(c, s);
    let sign_sin: i32x8 = (q << 30) ^ cast::<_, i32x8>(self);
    sin1 = sin1.flip_signs(cast(sign_sin));

    // calc cos
    let mut cos1 = cast::<_, f32x8>(swap).blend(s, c);
    let sign_cos: i32x8 = ((q + i32x8::from(1)) & i32x8::from(2)) << 30;
    cos1 ^= cast::<_, f32x8>(sign_cos);

    (sin1, cos1)
  }
  #[inline]
  #[must_use]
  pub fn sin(self) -> Self {
    let (s, _) = self.sin_cos();
    s
  }
  #[inline]
  #[must_use]
  pub fn cos(self) -> Self {
    let (_, c) = self.sin_cos();
    c
  }
  #[inline]
  #[must_use]
  pub fn tan(self) -> Self {
    let (s, c) = self.sin_cos();
    s / c
  }
  #[inline]
  #[must_use]
  pub fn to_degrees(self) -> Self {
    const_f32_as_f32x8!(RAD_TO_DEG_RATIO, 180.0_f32 / core::f32::consts::PI);
    self * RAD_TO_DEG_RATIO
  }
  #[inline]
  #[must_use]
  pub fn to_radians(self) -> Self {
    const_f32_as_f32x8!(DEG_TO_RAD_RATIO, core::f32::consts::PI / 180.0_f32);
    self * DEG_TO_RAD_RATIO
  }
  #[inline]
  #[must_use]
  pub fn recip(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: reciprocal_m256(self.avx) }
      } else {
        Self {
          a : self.a.recip(),
          b : self.b.recip(),
        }
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn recip_sqrt(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: reciprocal_sqrt_m256(self.avx) }
      } else {
        Self {
          a : self.a.recip_sqrt(),
          b : self.b.recip_sqrt(),
        }
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn sqrt(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: sqrt_m256(self.avx) }
      } else {
        Self {
          a : self.a.sqrt(),
          b : self.b.sqrt(),
        }
      }
    }
  }
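  /// Packs the sign bit of each lane into the low 8 bits of an `i32`, lane 0
  /// in bit 0. An illustrative sketch, assuming the crate root re-exports
  /// `f32x8`:
  ///
  /// ```
  /// # use wide::*;
  /// let v = f32x8::from([-1.0, 1.0, -2.0, 2.0, -3.0, 3.0, -4.0, 4.0]);
  /// // negative lanes 0, 2, 4, 6 set bits 0, 2, 4, 6
  /// assert_eq!(v.move_mask(), 0b01010101);
  /// ```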
  #[inline]
  #[must_use]
  pub fn move_mask(self) -> i32 {
    pick! {
      if #[cfg(target_feature="avx")] {
        move_mask_m256(self.avx)
      } else {
        (self.b.move_mask() << 4) | self.a.move_mask()
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn any(self) -> bool {
    pick! {
      if #[cfg(target_feature="avx")] {
        move_mask_m256(self.avx) != 0
      } else {
        self.a.any() || self.b.any()
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn all(self) -> bool {
    pick! {
      if #[cfg(target_feature="avx")] {
        move_mask_m256(self.avx) == 0b11111111
      } else {
        self.a.all() && self.b.all()
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn none(self) -> bool {
    !self.any()
  }

  // Computes 2^n for integer-valued lanes: adding bias + 2^23 leaves n + 127
  // in the low mantissa bits, which the shift then moves into the exponent
  // field.
  #[inline]
  fn vm_pow2n(self) -> Self {
    const_f32_as_f32x8!(pow2_23, 8388608.0);
    const_f32_as_f32x8!(bias, 127.0);
    let a = self + (bias + pow2_23);
    let c = cast::<_, i32x8>(a) << 23;
    cast::<_, f32x8>(c)
  }

  /// Calculates `e^x` (the exponential function) for each lane of a packed
  /// `f32x8`.
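  ///
  /// A minimal sketch of an exact case, assuming the crate root re-exports
  /// `f32x8` (general inputs are approximate):
  ///
  /// ```
  /// # use wide::*;
  /// assert_eq!(f32x8::ZERO.exp().to_array(), [1.0; 8]);
  /// ```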
  #[inline]
  #[must_use]
  pub fn exp(self) -> Self {
    const_f32_as_f32x8!(P0, 1.0 / 2.0);
    const_f32_as_f32x8!(P1, 1.0 / 6.0);
    const_f32_as_f32x8!(P2, 1.0 / 24.0);
    const_f32_as_f32x8!(P3, 1.0 / 120.0);
    const_f32_as_f32x8!(P4, 1.0 / 720.0);
    const_f32_as_f32x8!(P5, 1.0 / 5040.0);
    const_f32_as_f32x8!(LN2D_HI, 0.693359375);
    const_f32_as_f32x8!(LN2D_LO, -2.12194440e-4);
    let max_x = f32x8::from(87.3);
    let r = (self * Self::LOG2_E).round();
    let x = r.mul_neg_add(LN2D_HI, self);
    let x = r.mul_neg_add(LN2D_LO, x);
    let z = polynomial_5!(x, P0, P1, P2, P3, P4, P5);
    let x2 = x * x;
    let z = z.mul_add(x2, x);
    let n2 = Self::vm_pow2n(r);
    let z = (z + Self::ONE) * n2;
    // check for overflow
    let in_range = self.abs().cmp_lt(max_x);
    let in_range = in_range & self.is_finite();
    in_range.blend(z, Self::ZERO)
  }

  // Extracts the unbiased exponent of each lane as an f32.
  #[inline]
  fn exponent(self) -> f32x8 {
    const_f32_as_f32x8!(pow2_23, 8388608.0);
    const_f32_as_f32x8!(bias, 127.0);
    let a = cast::<_, u32x8>(self);
    let b = a >> 23;
    let c = b | cast::<_, u32x8>(pow2_23);
    let d = cast::<_, f32x8>(c);
    let e = d - (pow2_23 + bias);
    e
  }

  // Returns the mantissa of each lane rescaled into [0.5, 1.0).
  #[inline]
  fn fraction_2(self) -> Self {
    let t1 = cast::<_, u32x8>(self);
    let t2 = cast::<_, u32x8>(
      (t1 & u32x8::from(0x007FFFFF)) | u32x8::from(0x3F000000),
    );
    cast::<_, f32x8>(t2)
  }
  #[inline]
  fn is_zero_or_subnormal(self) -> Self {
    let t = cast::<_, i32x8>(self);
    let t = t & i32x8::splat(0x7F800000);
    i32x8::round_float(t.cmp_eq(i32x8::splat(0)))
  }
  #[inline]
  fn infinity() -> Self {
    cast::<_, f32x8>(i32x8::splat(0x7F800000))
  }
  #[inline]
  fn nan_log() -> Self {
    cast::<_, f32x8>(i32x8::splat(0x7FC00000 | (0x101 & 0x003FFFFF)))
  }
  #[inline]
  fn nan_pow() -> Self {
    cast::<_, f32x8>(i32x8::splat(0x7FC00000 | (0x101 & 0x003FFFFF)))
  }
  #[inline]
  pub fn sign_bit(self) -> Self {
    let t1 = cast::<_, i32x8>(self);
    let t2 = t1 >> 31;
    !cast::<_, f32x8>(t2).cmp_eq(f32x8::ZERO)
  }

  /// Horizontal add of all the lanes of the vector.
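  ///
  /// A quick sketch, assuming the crate root re-exports `f32x8`:
  ///
  /// ```
  /// # use wide::*;
  /// let v = f32x8::from([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]);
  /// assert_eq!(v.reduce_add(), 36.0);
  /// ```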
  #[inline]
  #[must_use]
  pub fn reduce_add(self) -> f32 {
    pick! {
      // From https://stackoverflow.com/questions/13219146/how-to-sum-m256-horizontally
      if #[cfg(target_feature="avx")] {
        let hi_quad = extract_m128_from_m256::<1>(self.avx);
        let lo_quad = cast_to_m128_from_m256(self.avx);
        let sum_quad = add_m128(lo_quad, hi_quad);
        let lo_dual = sum_quad;
        let hi_dual = move_high_low_m128(sum_quad, sum_quad);
        let sum_dual = add_m128(lo_dual, hi_dual);
        let lo = sum_dual;
        let hi = shuffle_abi_f32_all_m128::<0b_01>(sum_dual, sum_dual);
        let sum = add_m128_s(lo, hi);
        get_f32_from_m128_s(sum)
      } else {
        self.a.reduce_add() + self.b.reduce_add()
      }
    }
  }

  /// Natural log (`ln(x)`) of each lane.
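  ///
  /// A minimal sketch of an exact case, assuming the crate root re-exports
  /// `f32x8` (general inputs are approximate):
  ///
  /// ```
  /// # use wide::*;
  /// assert_eq!(f32x8::ONE.ln().to_array(), [0.0; 8]);
  /// ```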
  #[inline]
  #[must_use]
  pub fn ln(self) -> Self {
    const_f32_as_f32x8!(HALF, 0.5);
    const_f32_as_f32x8!(P0, 3.3333331174E-1);
    const_f32_as_f32x8!(P1, -2.4999993993E-1);
    const_f32_as_f32x8!(P2, 2.0000714765E-1);
    const_f32_as_f32x8!(P3, -1.6668057665E-1);
    const_f32_as_f32x8!(P4, 1.4249322787E-1);
    const_f32_as_f32x8!(P5, -1.2420140846E-1);
    const_f32_as_f32x8!(P6, 1.1676998740E-1);
    const_f32_as_f32x8!(P7, -1.1514610310E-1);
    const_f32_as_f32x8!(P8, 7.0376836292E-2);
    const_f32_as_f32x8!(LN2F_HI, 0.693359375);
    const_f32_as_f32x8!(LN2F_LO, -2.12194440e-4);
    const_f32_as_f32x8!(VM_SMALLEST_NORMAL, 1.17549435E-38);

    let x1 = self;
    let x = Self::fraction_2(x1);
    let e = Self::exponent(x1);
    let mask = x.cmp_gt(Self::SQRT_2 * HALF);
    let x = (!mask).blend(x + x, x);
    let fe = mask.blend(e + Self::ONE, e);
    let x = x - Self::ONE;
    let res = polynomial_8!(x, P0, P1, P2, P3, P4, P5, P6, P7, P8);
    let x2 = x * x;
    let res = x2 * x * res;
    let res = fe.mul_add(LN2F_LO, res);
    let res = res + x2.mul_neg_add(HALF, x);
    let res = fe.mul_add(LN2F_HI, res);
    let overflow = !self.is_finite();
    let underflow = x1.cmp_lt(VM_SMALLEST_NORMAL);
    let mask = overflow | underflow;
    if !mask.any() {
      res
    } else {
      let is_zero = self.is_zero_or_subnormal();
      let res = underflow.blend(Self::nan_log(), res);
      let res = is_zero.blend(Self::infinity(), res);
      let res = overflow.blend(self, res);
      res
    }
  }

  #[inline]
  #[must_use]
  pub fn log2(self) -> Self {
    Self::ln(self) * Self::LOG2_E
  }
  #[inline]
  #[must_use]
  pub fn log10(self) -> Self {
    Self::ln(self) * Self::LOG10_E
  }

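  /// Raises each lane of `self` to the power of the matching lane of `y`.
  /// A hedged accuracy sketch, assuming the crate root re-exports `f32x8`
  /// (results are approximate):
  ///
  /// ```
  /// # use wide::*;
  /// let p = f32x8::splat(2.0).pow_f32x8(f32x8::splat(3.0));
  /// assert!((p.to_array()[0] - 8.0).abs() < 1e-4);
  /// ```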
  #[inline]
  #[must_use]
  pub fn pow_f32x8(self, y: Self) -> Self {
    const_f32_as_f32x8!(ln2f_hi, 0.693359375);
    const_f32_as_f32x8!(ln2f_lo, -2.12194440e-4);
    const_f32_as_f32x8!(P0logf, 3.3333331174E-1);
    const_f32_as_f32x8!(P1logf, -2.4999993993E-1);
    const_f32_as_f32x8!(P2logf, 2.0000714765E-1);
    const_f32_as_f32x8!(P3logf, -1.6668057665E-1);
    const_f32_as_f32x8!(P4logf, 1.4249322787E-1);
    const_f32_as_f32x8!(P5logf, -1.2420140846E-1);
    const_f32_as_f32x8!(P6logf, 1.1676998740E-1);
    const_f32_as_f32x8!(P7logf, -1.1514610310E-1);
    const_f32_as_f32x8!(P8logf, 7.0376836292E-2);

    const_f32_as_f32x8!(p2expf, 1.0 / 2.0); // coefficients for Taylor expansion of exp
    const_f32_as_f32x8!(p3expf, 1.0 / 6.0);
    const_f32_as_f32x8!(p4expf, 1.0 / 24.0);
    const_f32_as_f32x8!(p5expf, 1.0 / 120.0);
    const_f32_as_f32x8!(p6expf, 1.0 / 720.0);
    const_f32_as_f32x8!(p7expf, 1.0 / 5040.0);

    let x1 = self.abs();
    let x = x1.fraction_2();
    let mask = x.cmp_gt(f32x8::SQRT_2 * f32x8::HALF);
    let x = (!mask).blend(x + x, x);

    let x = x - f32x8::ONE;
    let x2 = x * x;
    let lg1 = polynomial_8!(
      x, P0logf, P1logf, P2logf, P3logf, P4logf, P5logf, P6logf, P7logf, P8logf
    );
    let lg1 = lg1 * x2 * x;

    let ef = x1.exponent();
    let ef = mask.blend(ef + f32x8::ONE, ef);
    let e1 = (ef * y).round();
    let yr = ef.mul_sub(y, e1);

    let lg = f32x8::HALF.mul_neg_add(x2, x) + lg1;
    let x2_err = (f32x8::HALF * x).mul_sub(x, f32x8::HALF * x2);
    let lg_err = f32x8::HALF.mul_add(x2, lg - x) - lg1;

    let e2 = (lg * y * f32x8::LOG2_E).round();
    let v = lg.mul_sub(y, e2 * ln2f_hi);
    let v = e2.mul_neg_add(ln2f_lo, v);
    let v = v - (lg_err + x2_err).mul_sub(y, yr * f32x8::LN_2);

    let x = v;
    let e3 = (x * f32x8::LOG2_E).round();
    let x = e3.mul_neg_add(f32x8::LN_2, x);
    let x2 = x * x;
    let z = x2.mul_add(
      polynomial_5!(x, p2expf, p3expf, p4expf, p5expf, p6expf, p7expf),
      x + f32x8::ONE,
    );

    let ee = e1 + e2 + e3;
    let ei = cast::<_, i32x8>(ee.round_int());
    let ej = cast::<_, i32x8>(ei + (cast::<_, i32x8>(z) >> 23));

    let overflow = cast::<_, f32x8>(ej.cmp_gt(i32x8::splat(0x0FF)))
      | (ee.cmp_gt(f32x8::splat(300.0)));
    let underflow = cast::<_, f32x8>(ej.cmp_lt(i32x8::splat(0x000)))
      | (ee.cmp_lt(f32x8::splat(-300.0)));

    // Add exponent by integer addition
    let z = cast::<_, f32x8>(cast::<_, i32x8>(z) + (ei << 23));
    // Check for overflow/underflow
    let z = underflow.blend(f32x8::ZERO, z);
    let z = overflow.blend(Self::infinity(), z);

    // Check for self == 0
    let x_zero = self.is_zero_or_subnormal();
    let z = x_zero.blend(
      y.cmp_lt(f32x8::ZERO).blend(
        Self::infinity(),
        y.cmp_eq(f32x8::ZERO).blend(f32x8::ONE, f32x8::ZERO),
      ),
      z,
    );

    let x_sign = self.sign_bit();
    let z = if x_sign.any() {
      // Is y an integer?
      let yi = y.cmp_eq(y.round());

      // Is y odd?
      let y_odd = cast::<_, i32x8>(y.round_int() << 31).round_float();

      let z1 =
        yi.blend(z | y_odd, self.cmp_eq(Self::ZERO).blend(z, Self::nan_pow()));

      x_sign.blend(z1, z)
    } else {
      z
    };

    let x_finite = self.is_finite();
    let y_finite = y.is_finite();
    let e_finite = ee.is_finite();
    if (x_finite & y_finite & (e_finite | x_zero)).all() {
      return z;
    }

    (self.is_nan() | y.is_nan()).blend(self + y, z)
  }
  #[inline]
  pub fn powf(self, y: f32) -> Self {
    Self::pow_f32x8(self, f32x8::splat(y))
  }

  /// Transposes an 8x8 matrix of `f32` lanes. Currently only accelerated on
  /// AVX.
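  ///
  /// A small sketch, assuming the crate root re-exports `f32x8`:
  ///
  /// ```
  /// # use wide::*;
  /// // every row is [0, 1, ..., 7], so column k is all k's
  /// let rows = [f32x8::from([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]); 8];
  /// let cols = f32x8::transpose(rows);
  /// assert_eq!(cols[3].to_array(), [3.0; 8]);
  /// ```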
  #[must_use]
  #[inline]
  pub fn transpose(data: [f32x8; 8]) -> [f32x8; 8] {
    pick! {
      if #[cfg(target_feature="avx")] {
        let a0 = unpack_lo_m256(data[0].avx, data[1].avx);
        let a1 = unpack_hi_m256(data[0].avx, data[1].avx);
        let a2 = unpack_lo_m256(data[2].avx, data[3].avx);
        let a3 = unpack_hi_m256(data[2].avx, data[3].avx);
        let a4 = unpack_lo_m256(data[4].avx, data[5].avx);
        let a5 = unpack_hi_m256(data[4].avx, data[5].avx);
        let a6 = unpack_lo_m256(data[6].avx, data[7].avx);
        let a7 = unpack_hi_m256(data[6].avx, data[7].avx);

        pub const fn mm_shuffle(z: i32, y: i32, x: i32, w: i32) -> i32 {
          (z << 6) | (y << 4) | (x << 2) | w
        }

        const SHUFF_LO : i32 = mm_shuffle(1,0,1,0);
        const SHUFF_HI : i32 = mm_shuffle(3,2,3,2);

        // possible todo: intel performance manual suggests alternative with blend to avoid port 5 pressure
        // (since blend runs on a different port than shuffle)
        let b0 = shuffle_m256::<SHUFF_LO>(a0,a2);
        let b1 = shuffle_m256::<SHUFF_HI>(a0,a2);
        let b2 = shuffle_m256::<SHUFF_LO>(a1,a3);
        let b3 = shuffle_m256::<SHUFF_HI>(a1,a3);
        let b4 = shuffle_m256::<SHUFF_LO>(a4,a6);
        let b5 = shuffle_m256::<SHUFF_HI>(a4,a6);
        let b6 = shuffle_m256::<SHUFF_LO>(a5,a7);
        let b7 = shuffle_m256::<SHUFF_HI>(a5,a7);

        [
          f32x8 { avx: permute2z_m256::<0x20>(b0, b4) },
          f32x8 { avx: permute2z_m256::<0x20>(b1, b5) },
          f32x8 { avx: permute2z_m256::<0x20>(b2, b6) },
          f32x8 { avx: permute2z_m256::<0x20>(b3, b7) },
          f32x8 { avx: permute2z_m256::<0x31>(b0, b4) },
          f32x8 { avx: permute2z_m256::<0x31>(b1, b5) },
          f32x8 { avx: permute2z_m256::<0x31>(b2, b6) },
          f32x8 { avx: permute2z_m256::<0x31>(b3, b7) }
        ]
      } else {
        // possible todo: not sure that 128-bit SIMD gives us a lot of speedup here

        #[inline(always)]
        fn transpose_column(data: &[f32x8; 8], index: usize) -> f32x8 {
          f32x8::new([
            data[0].as_array_ref()[index],
            data[1].as_array_ref()[index],
            data[2].as_array_ref()[index],
            data[3].as_array_ref()[index],
            data[4].as_array_ref()[index],
            data[5].as_array_ref()[index],
            data[6].as_array_ref()[index],
            data[7].as_array_ref()[index],
          ])
        }

        [
          transpose_column(&data, 0),
          transpose_column(&data, 1),
          transpose_column(&data, 2),
          transpose_column(&data, 3),
          transpose_column(&data, 4),
          transpose_column(&data, 5),
          transpose_column(&data, 6),
          transpose_column(&data, 7),
        ]
      }
    }
  }

  #[inline]
  pub fn to_array(self) -> [f32; 8] {
    cast(self)
  }

  #[inline]
  pub fn as_array_ref(&self) -> &[f32; 8] {
    cast_ref(self)
  }

  #[inline]
  pub fn as_array_mut(&mut self) -> &mut [f32; 8] {
    cast_mut(self)
  }

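  /// Converts each `i32` lane into an `f32` lane. An illustrative sketch,
  /// assuming the crate root re-exports `f32x8` and `i32x8`:
  ///
  /// ```
  /// # use wide::*;
  /// let v = f32x8::from_i32x8(i32x8::from([1, 2, 3, 4, 5, 6, 7, 8]));
  /// assert_eq!(v.to_array(), [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]);
  /// ```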
  #[inline]
  pub fn from_i32x8(v: i32x8) -> Self {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx: convert_to_m256_from_i32_m256i(v.avx2) }
      } else {
        Self::new([
            v.as_array_ref()[0] as f32,
            v.as_array_ref()[1] as f32,
            v.as_array_ref()[2] as f32,
            v.as_array_ref()[3] as f32,
            v.as_array_ref()[4] as f32,
            v.as_array_ref()[5] as f32,
            v.as_array_ref()[6] as f32,
            v.as_array_ref()[7] as f32,
          ])
      }
    }
  }
}

impl Not for f32x8 {
  type Output = Self;
  #[inline]
  fn not(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: self.avx.not() }
      } else {
        Self {
          a : self.a.not(),
          b : self.b.not(),
        }
      }
    }
  }
}