use super::*;

pick! {
  if #[cfg(target_feature="sse")] {
    #[derive(Default, Clone, Copy, PartialEq)]
    #[repr(C, align(16))]
    pub struct f32x4 { pub(crate) sse: m128 }
  } else if #[cfg(target_feature="simd128")] {
    use core::arch::wasm32::*;

    #[derive(Clone, Copy)]
    #[repr(transparent)]
    pub struct f32x4 { pub(crate) simd: v128 }

    impl Default for f32x4 {
      fn default() -> Self {
        Self::splat(0.0)
      }
    }

    impl PartialEq for f32x4 {
      fn eq(&self, other: &Self) -> bool {
        u32x4_all_true(f32x4_eq(self.simd, other.simd))
      }
    }
  } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
    use core::arch::aarch64::*;

    #[repr(C)]
    #[derive(Copy, Clone)]
    pub struct f32x4 { pub(crate) neon: float32x4_t }

    impl Default for f32x4 {
      #[inline]
      #[must_use]
      fn default() -> Self {
        unsafe { Self { neon: vdupq_n_f32(0.0) } }
      }
    }

    impl PartialEq for f32x4 {
      #[inline]
      #[must_use]
      fn eq(&self, other: &Self) -> bool {
        unsafe { vminvq_u32(vceqq_f32(self.neon, other.neon)) == u32::MAX }
      }
    }
  } else {
    #[derive(Default, Clone, Copy, PartialEq)]
    #[repr(C, align(16))]
    pub struct f32x4 { pub(crate) arr: [f32;4] }
  }
}

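// Declares a `pub const` of type `f32x4` with every lane set to the same
// literal. Going through the `ConstUnionHack128bit` union is what lets the
// conversion happen in a `const` initializer, where an ordinary constructor
// call is not available.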
macro_rules! const_f32_as_f32x4 {
  ($i:ident, $f:expr) => {
    #[allow(non_upper_case_globals)]
    pub const $i: f32x4 =
      unsafe { ConstUnionHack128bit { f32a4: [$f; 4] }.f32x4 };
  };
}

impl f32x4 {
  const_f32_as_f32x4!(ONE, 1.0);
  const_f32_as_f32x4!(ZERO, 0.0);
  const_f32_as_f32x4!(HALF, 0.5);
  const_f32_as_f32x4!(E, core::f32::consts::E);
  const_f32_as_f32x4!(FRAC_1_PI, core::f32::consts::FRAC_1_PI);
  const_f32_as_f32x4!(FRAC_2_PI, core::f32::consts::FRAC_2_PI);
  const_f32_as_f32x4!(FRAC_2_SQRT_PI, core::f32::consts::FRAC_2_SQRT_PI);
  const_f32_as_f32x4!(FRAC_1_SQRT_2, core::f32::consts::FRAC_1_SQRT_2);
  const_f32_as_f32x4!(FRAC_PI_2, core::f32::consts::FRAC_PI_2);
  const_f32_as_f32x4!(FRAC_PI_3, core::f32::consts::FRAC_PI_3);
  const_f32_as_f32x4!(FRAC_PI_4, core::f32::consts::FRAC_PI_4);
  const_f32_as_f32x4!(FRAC_PI_6, core::f32::consts::FRAC_PI_6);
  const_f32_as_f32x4!(FRAC_PI_8, core::f32::consts::FRAC_PI_8);
  const_f32_as_f32x4!(LN_2, core::f32::consts::LN_2);
  const_f32_as_f32x4!(LN_10, core::f32::consts::LN_10);
  const_f32_as_f32x4!(LOG2_E, core::f32::consts::LOG2_E);
  const_f32_as_f32x4!(LOG10_E, core::f32::consts::LOG10_E);
  const_f32_as_f32x4!(LOG10_2, core::f32::consts::LOG10_2);
  const_f32_as_f32x4!(LOG2_10, core::f32::consts::LOG2_10);
  const_f32_as_f32x4!(PI, core::f32::consts::PI);
  const_f32_as_f32x4!(SQRT_2, core::f32::consts::SQRT_2);
  const_f32_as_f32x4!(TAU, core::f32::consts::TAU);
}

unsafe impl Zeroable for f32x4 {}
unsafe impl Pod for f32x4 {}

impl Add for f32x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn add(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse")] {
        Self { sse: add_m128(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: f32x4_add(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
        unsafe { Self { neon: vaddq_f32(self.neon, rhs.neon) } }
      } else {
        Self { arr: [
          self.arr[0] + rhs.arr[0],
          self.arr[1] + rhs.arr[1],
          self.arr[2] + rhs.arr[2],
          self.arr[3] + rhs.arr[3],
        ]}
      }
    }
  }
}

impl Sub for f32x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn sub(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse")] {
        Self { sse: sub_m128(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: f32x4_sub(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
        unsafe { Self { neon: vsubq_f32(self.neon, rhs.neon) } }
      } else {
        Self { arr: [
          self.arr[0] - rhs.arr[0],
          self.arr[1] - rhs.arr[1],
          self.arr[2] - rhs.arr[2],
          self.arr[3] - rhs.arr[3],
        ]}
      }
    }
  }
}

impl Mul for f32x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn mul(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse")] {
        Self { sse: mul_m128(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: f32x4_mul(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
        unsafe { Self { neon: vmulq_f32(self.neon, rhs.neon) } }
      } else {
        Self { arr: [
          self.arr[0] * rhs.arr[0],
          self.arr[1] * rhs.arr[1],
          self.arr[2] * rhs.arr[2],
          self.arr[3] * rhs.arr[3],
        ]}
      }
    }
  }
}

impl Div for f32x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn div(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse")] {
        Self { sse: div_m128(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: f32x4_div(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
        unsafe { Self { neon: vdivq_f32(self.neon, rhs.neon) } }
      } else {
        Self { arr: [
          self.arr[0] / rhs.arr[0],
          self.arr[1] / rhs.arr[1],
          self.arr[2] / rhs.arr[2],
          self.arr[3] / rhs.arr[3],
        ]}
      }
    }
  }
}

impl Add<f32> for f32x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn add(self, rhs: f32) -> Self::Output {
    self.add(Self::splat(rhs))
  }
}

impl Sub<f32> for f32x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn sub(self, rhs: f32) -> Self::Output {
    self.sub(Self::splat(rhs))
  }
}

impl Mul<f32> for f32x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn mul(self, rhs: f32) -> Self::Output {
    self.mul(Self::splat(rhs))
  }
}

impl Div<f32> for f32x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn div(self, rhs: f32) -> Self::Output {
    self.div(Self::splat(rhs))
  }
}

impl Add<f32x4> for f32 {
  type Output = f32x4;
  #[inline]
  #[must_use]
  fn add(self, rhs: f32x4) -> Self::Output {
    f32x4::splat(self).add(rhs)
  }
}

impl Sub<f32x4> for f32 {
  type Output = f32x4;
  #[inline]
  #[must_use]
  fn sub(self, rhs: f32x4) -> Self::Output {
    f32x4::splat(self).sub(rhs)
  }
}

impl Mul<f32x4> for f32 {
  type Output = f32x4;
  #[inline]
  #[must_use]
  fn mul(self, rhs: f32x4) -> Self::Output {
    f32x4::splat(self).mul(rhs)
  }
}

impl Div<f32x4> for f32 {
  type Output = f32x4;
  #[inline]
  #[must_use]
  fn div(self, rhs: f32x4) -> Self::Output {
    f32x4::splat(self).div(rhs)
  }
}

impl BitAnd for f32x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn bitand(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse")] {
        Self { sse: bitand_m128(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: v128_and(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
        unsafe { Self { neon: vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(self.neon), vreinterpretq_u32_f32(rhs.neon))) } }
      } else {
        Self { arr: [
          f32::from_bits(self.arr[0].to_bits() & rhs.arr[0].to_bits()),
          f32::from_bits(self.arr[1].to_bits() & rhs.arr[1].to_bits()),
          f32::from_bits(self.arr[2].to_bits() & rhs.arr[2].to_bits()),
          f32::from_bits(self.arr[3].to_bits() & rhs.arr[3].to_bits()),
        ]}
      }
    }
  }
}

impl BitOr for f32x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn bitor(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse")] {
        Self { sse: bitor_m128(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: v128_or(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
        unsafe { Self { neon: vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(self.neon), vreinterpretq_u32_f32(rhs.neon))) } }
      } else {
        Self { arr: [
          f32::from_bits(self.arr[0].to_bits() | rhs.arr[0].to_bits()),
          f32::from_bits(self.arr[1].to_bits() | rhs.arr[1].to_bits()),
          f32::from_bits(self.arr[2].to_bits() | rhs.arr[2].to_bits()),
          f32::from_bits(self.arr[3].to_bits() | rhs.arr[3].to_bits()),
        ]}
      }
    }
  }
}

impl BitXor for f32x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn bitxor(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse")] {
        Self { sse: bitxor_m128(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: v128_xor(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
        unsafe { Self { neon: vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(self.neon), vreinterpretq_u32_f32(rhs.neon))) } }
      } else {
        Self { arr: [
          f32::from_bits(self.arr[0].to_bits() ^ rhs.arr[0].to_bits()),
          f32::from_bits(self.arr[1].to_bits() ^ rhs.arr[1].to_bits()),
          f32::from_bits(self.arr[2].to_bits() ^ rhs.arr[2].to_bits()),
          f32::from_bits(self.arr[3].to_bits() ^ rhs.arr[3].to_bits()),
        ]}
      }
    }
  }
}

impl CmpEq for f32x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn cmp_eq(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse")] {
        Self { sse: cmp_eq_mask_m128(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: f32x4_eq(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
        unsafe { Self { neon: vreinterpretq_f32_u32(vceqq_f32(self.neon, rhs.neon)) } }
      } else {
        Self { arr: [
          if self.arr[0] == rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[1] == rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[2] == rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[3] == rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
        ]}
      }
    }
  }
}

impl CmpGe for f32x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn cmp_ge(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse")] {
        Self { sse: cmp_ge_mask_m128(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: f32x4_ge(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
        unsafe { Self { neon: vreinterpretq_f32_u32(vcgeq_f32(self.neon, rhs.neon)) } }
      } else {
        Self { arr: [
          if self.arr[0] >= rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[1] >= rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[2] >= rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[3] >= rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
        ]}
      }
    }
  }
}

impl CmpGt for f32x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn cmp_gt(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse")] {
        Self { sse: cmp_gt_mask_m128(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: f32x4_gt(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
        unsafe { Self { neon: vreinterpretq_f32_u32(vcgtq_f32(self.neon, rhs.neon)) } }
      } else {
        Self { arr: [
          if self.arr[0] > rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[1] > rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[2] > rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[3] > rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
        ]}
      }
    }
  }
}

impl CmpNe for f32x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn cmp_ne(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse")] {
        Self { sse: cmp_neq_mask_m128(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: f32x4_ne(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
        unsafe { Self { neon: vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(self.neon, rhs.neon))) } }
      } else {
        Self { arr: [
          if self.arr[0] != rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[1] != rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[2] != rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[3] != rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
        ]}
      }
    }
  }
}

impl CmpLe for f32x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn cmp_le(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse")] {
        Self { sse: cmp_le_mask_m128(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: f32x4_le(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
        unsafe { Self { neon: vreinterpretq_f32_u32(vcleq_f32(self.neon, rhs.neon)) } }
      } else {
        Self { arr: [
          if self.arr[0] <= rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[1] <= rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[2] <= rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[3] <= rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
        ]}
      }
    }
  }
}

impl CmpLt for f32x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn cmp_lt(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse")] {
        Self { sse: cmp_lt_mask_m128(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: f32x4_lt(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
        unsafe { Self { neon: vreinterpretq_f32_u32(vcltq_f32(self.neon, rhs.neon)) } }
      } else {
        Self { arr: [
          if self.arr[0] < rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[1] < rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[2] < rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[3] < rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
        ]}
      }
    }
  }
}

impl f32x4 {
  #[inline]
  #[must_use]
  pub fn new(array: [f32; 4]) -> Self {
    Self::from(array)
  }

  #[inline]
  #[must_use]
  pub fn blend(self, t: Self, f: Self) -> Self {
    pick! {
      if #[cfg(target_feature="sse4.1")] {
        Self { sse: blend_varying_m128(f.sse, t.sse, self.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: v128_bitselect(t.simd, f.simd, self.simd) }
      } else {
        generic_bit_blend(self, t, f)
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn abs(self) -> Self {
    pick! {
      if #[cfg(target_feature="simd128")] {
        Self { simd: f32x4_abs(self.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
        unsafe { Self { neon: vabsq_f32(self.neon) } }
      } else {
        let non_sign_bits = f32x4::from(f32::from_bits(i32::MAX as u32));
        self & non_sign_bits
      }
    }
  }

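  /// Lanewise maximum. Faster than `max`, but the result is unspecified
  /// for any lane where either input is NaN.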
  #[inline]
  #[must_use]
  pub fn fast_max(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="sse")] {
        Self { sse: max_m128(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self {
          simd: f32x4_pmax(self.simd, rhs.simd),
        }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
        unsafe { Self { neon: vmaxq_f32(self.neon, rhs.neon) } }
      } else {
        Self { arr: [
          if self.arr[0] < rhs.arr[0] { rhs.arr[0] } else { self.arr[0] },
          if self.arr[1] < rhs.arr[1] { rhs.arr[1] } else { self.arr[1] },
          if self.arr[2] < rhs.arr[2] { rhs.arr[2] } else { self.arr[2] },
          if self.arr[3] < rhs.arr[3] { rhs.arr[3] } else { self.arr[3] },
        ]}
      }
    }
  }

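  /// Lanewise maximum that handles NaN: if one lane is NaN, the other
  /// lane is chosen. Use `fast_max` when NaN handling is not needed.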
  #[inline]
  #[must_use]
  pub fn max(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="sse")] {
        // max_m128 returns the second operand when either operand is NaN,
        // so we check rhs for NaN and keep self in that case.
        rhs.is_nan().blend(self, Self { sse: max_m128(self.sse, rhs.sse) })
      } else if #[cfg(target_feature="simd128")] {
        // pmax is `self < rhs ? rhs : self`, which picks self when either
        // operand is NaN, so we select rhs wherever self is NaN.
        Self {
          simd: v128_bitselect(
            rhs.simd,
            f32x4_pmax(self.simd, rhs.simd),
            f32x4_ne(self.simd, self.simd), // NaN present in self
          )
        }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
        unsafe { Self { neon: vmaxnmq_f32(self.neon, rhs.neon) } }
      } else {
        Self { arr: [
          self.arr[0].max(rhs.arr[0]),
          self.arr[1].max(rhs.arr[1]),
          self.arr[2].max(rhs.arr[2]),
          self.arr[3].max(rhs.arr[3]),
        ]}
      }
    }
  }

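  /// Lanewise minimum. Faster than `min`, but the result is unspecified
  /// for any lane where either input is NaN.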
  #[inline]
  #[must_use]
  pub fn fast_min(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="sse")] {
        Self { sse: min_m128(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self {
          simd: f32x4_pmin(self.simd, rhs.simd),
        }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
        unsafe { Self { neon: vminq_f32(self.neon, rhs.neon) } }
      } else {
        Self { arr: [
          if self.arr[0] < rhs.arr[0] { self.arr[0] } else { rhs.arr[0] },
          if self.arr[1] < rhs.arr[1] { self.arr[1] } else { rhs.arr[1] },
          if self.arr[2] < rhs.arr[2] { self.arr[2] } else { rhs.arr[2] },
          if self.arr[3] < rhs.arr[3] { self.arr[3] } else { rhs.arr[3] },
        ]}
      }
    }
  }

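  /// Lanewise minimum that handles NaN: if one lane is NaN, the other
  /// lane is chosen. Use `fast_min` when NaN handling is not needed.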
  #[inline]
  #[must_use]
  pub fn min(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="sse")] {
        // min_m128 returns the second operand when either operand is NaN,
        // so we check rhs for NaN and keep self in that case.
        rhs.is_nan().blend(self, Self { sse: min_m128(self.sse, rhs.sse) })
      } else if #[cfg(target_feature="simd128")] {
        // pmin picks self when either operand is NaN, so we select rhs
        // wherever self is NaN.
        Self {
          simd: v128_bitselect(
            rhs.simd,
            f32x4_pmin(self.simd, rhs.simd),
            f32x4_ne(self.simd, self.simd), // NaN present in self
          )
        }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
        unsafe { Self { neon: vminnmq_f32(self.neon, rhs.neon) } }
      } else {
        Self { arr: [
          self.arr[0].min(rhs.arr[0]),
          self.arr[1].min(rhs.arr[1]),
          self.arr[2].min(rhs.arr[2]),
          self.arr[3].min(rhs.arr[3]),
        ]}
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn is_nan(self) -> Self {
    pick! {
      if #[cfg(target_feature="sse")] {
        Self { sse: cmp_unord_mask_m128(self.sse, self.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: f32x4_ne(self.simd, self.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
        unsafe { Self { neon: vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(self.neon, self.neon))) } }
      } else {
        Self { arr: [
          if self.arr[0].is_nan() { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[1].is_nan() { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[2].is_nan() { f32::from_bits(u32::MAX) } else { 0.0 },
          if self.arr[3].is_nan() { f32::from_bits(u32::MAX) } else { 0.0 },
        ]}
      }
    }
  }
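
  // `is_finite` and `is_inf` share a bit trick: shifting the bits left by
  // one discards the sign, leaving the 8 exponent bits at the top of the
  // word. An all-ones exponent (`0xFF000000` after the shift) means Inf or
  // NaN; `is_inf` additionally requires the mantissa bits to be zero.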
  #[inline]
  #[must_use]
  pub fn is_finite(self) -> Self {
    let shifted_exp_mask = u32x4::from(0xFF000000);
    let u: u32x4 = cast(self);
    let shift_u = u << 1_u64;
    let out = !(shift_u & shifted_exp_mask).cmp_eq(shifted_exp_mask);
    cast(out)
  }

  #[inline]
  #[must_use]
  pub fn is_inf(self) -> Self {
    let shifted_inf = u32x4::from(0xFF000000);
    let u: u32x4 = cast(self);
    let shift_u = u << 1_u64;
    let out = (shift_u).cmp_eq(shifted_inf);
    cast(out)
  }

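  /// Rounds each lane to the nearest whole number.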
  #[inline]
  #[must_use]
  pub fn round(self) -> Self {
    pick! {
      if #[cfg(target_feature="sse4.1")] {
        Self { sse: round_m128::<{round_op!(Nearest)}>(self.sse) }
      } else if #[cfg(target_feature="sse2")] {
        // Round via the int conversion. Lanes that don't fit in an i32
        // convert to 0x80000000, and for those we keep the input instead.
        let mi: m128i = convert_to_i32_m128i_from_m128(self.sse);
        let f: f32x4 = f32x4 { sse: convert_to_m128_from_i32_m128i(mi) };
        let i: i32x4 = cast(mi);
        let mask: f32x4 = cast(i.cmp_eq(i32x4::from(0x80000000_u32 as i32)));
        mask.blend(self, f)
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: f32x4_nearest(self.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
        unsafe { Self { neon: vrndnq_f32(self.neon) } }
      } else {
        // Software fallback: `to_int` is 2^23, so `x + to_int - to_int`
        // rounds away the fraction bits of any |x| < 2^23. The exponent
        // checks handle inputs that are already integral (exponent >= 23)
        // and inputs that round to +/- 0.
        let to_int = f32x4::from(1.0 / f32::EPSILON);
        let u: u32x4 = cast(self);
        let e: i32x4 = cast((u >> 23) & u32x4::from(0xff));
        let mut y: f32x4;

        let no_op_magic = i32x4::from(0x7f + 23);
        let no_op_mask: f32x4 = cast(e.cmp_gt(no_op_magic) | e.cmp_eq(no_op_magic));
        let no_op_val: f32x4 = self;

        let zero_magic = i32x4::from(0x7f - 1);
        let zero_mask: f32x4 = cast(e.cmp_lt(zero_magic));
        let zero_val: f32x4 = self * f32x4::from(0.0);

        let neg_bit: f32x4 = cast(cast::<u32x4, i32x4>(u).cmp_lt(i32x4::default()));
        let x: f32x4 = neg_bit.blend(-self, self);
        y = x + to_int - to_int - x;
        y = y.cmp_gt(f32x4::from(0.5)).blend(
          y + x - f32x4::from(1.0),
          y.cmp_lt(f32x4::from(-0.5)).blend(y + x + f32x4::from(1.0), y + x),
        );
        y = neg_bit.blend(-y, y);

        no_op_mask.blend(no_op_val, zero_mask.blend(zero_val, y))
      }
    }
  }

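  /// Rounds each lane into an integer. Faster than `round_int`, but out
  /// of range values and NaNs give implementation-defined results.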
  #[inline]
  #[must_use]
  pub fn fast_round_int(self) -> i32x4 {
    pick! {
      if #[cfg(target_feature="sse2")] {
        cast(convert_to_i32_m128i_from_m128(self.sse))
      } else {
        self.round_int()
      }
    }
  }

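  /// Rounds each lane into an integer. Out of range values saturate and
  /// NaN lanes become 0.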
  #[inline]
  #[must_use]
  pub fn round_int(self) -> i32x4 {
    pick! {
      if #[cfg(target_feature="sse2")] {
        // Zero out NaN lanes before converting. Lanes >= 2^31 come back
        // from the conversion as i32::MIN, and XOR-ing those lanes with
        // the all-ones `flip_to_max` mask turns them into i32::MAX, i.e.
        // saturation.
        let non_nan_mask = self.cmp_eq(self);
        let non_nan = self & non_nan_mask;
        let flip_to_max: i32x4 = cast(self.cmp_ge(Self::splat(2147483648.0)));
        let cast: i32x4 = cast(convert_to_i32_m128i_from_m128(non_nan.sse));
        flip_to_max ^ cast
      } else if #[cfg(target_feature="simd128")] {
        cast(Self { simd: i32x4_trunc_sat_f32x4(f32x4_nearest(self.simd)) })
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
        cast(unsafe { Self { neon: vreinterpretq_f32_s32(vcvtnq_s32_f32(self.neon)) } })
      } else {
        let rounded: [f32; 4] = cast(self.round());
        cast([
          rounded[0] as i32,
          rounded[1] as i32,
          rounded[2] as i32,
          rounded[3] as i32,
        ])
      }
    }
  }

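  /// Truncates each lane into an integer. Faster than `trunc_int`, but
  /// out of range values and NaNs give implementation-defined results.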
  #[inline]
  #[must_use]
  pub fn fast_trunc_int(self) -> i32x4 {
    pick! {
      if #[cfg(target_feature="sse2")] {
        cast(truncate_m128_to_m128i(self.sse))
      } else {
        self.trunc_int()
      }
    }
  }

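  /// Truncates each lane into an integer. Out of range values saturate
  /// and NaN lanes become 0.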
  #[inline]
  #[must_use]
  pub fn trunc_int(self) -> i32x4 {
    pick! {
      if #[cfg(target_feature="sse2")] {
        // Same NaN-zeroing and saturation scheme as `round_int` above.
        let non_nan_mask = self.cmp_eq(self);
        let non_nan = self & non_nan_mask;
        let flip_to_max: i32x4 = cast(self.cmp_ge(Self::splat(2147483648.0)));
        let cast: i32x4 = cast(truncate_m128_to_m128i(non_nan.sse));
        flip_to_max ^ cast
      } else if #[cfg(target_feature="simd128")] {
        cast(Self { simd: i32x4_trunc_sat_f32x4(self.simd) })
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
        cast(unsafe { Self { neon: vreinterpretq_f32_s32(vcvtq_s32_f32(self.neon)) } })
      } else {
        let n: [f32; 4] = cast(self);
        cast([
          n[0] as i32,
          n[1] as i32,
          n[2] as i32,
          n[3] as i32,
        ])
      }
    }
  }

  /// Fused multiply-add, `(self * m) + a`, with a single rounding step
  /// when FMA hardware is available.
  #[inline]
  #[must_use]
  pub fn mul_add(self, m: Self, a: Self) -> Self {
    pick! {
      if #[cfg(all(target_feature="sse2",target_feature="fma"))] {
        Self { sse: fused_mul_add_m128(self.sse, m.sse, a.sse) }
      } else {
        (self * m) + a
      }
    }
  }

  /// Fused multiply-subtract, `(self * m) - s`.
  #[inline]
  #[must_use]
  pub fn mul_sub(self, m: Self, s: Self) -> Self {
    pick! {
      if #[cfg(all(target_feature="sse2",target_feature="fma"))] {
        Self { sse: fused_mul_sub_m128(self.sse, m.sse, s.sse) }
      } else {
        (self * m) - s
      }
    }
  }

  /// Fused negated multiply-add, `a - (self * m)`.
  #[inline]
  #[must_use]
  pub fn mul_neg_add(self, m: Self, a: Self) -> Self {
    pick! {
      if #[cfg(all(target_feature="sse2",target_feature="fma"))] {
        Self { sse: fused_mul_neg_add_m128(self.sse, m.sse, a.sse) }
      } else {
        a - (self * m)
      }
    }
  }

  /// Fused negated multiply-subtract, `-(self * m) - a`.
  #[inline]
  #[must_use]
  pub fn mul_neg_sub(self, m: Self, a: Self) -> Self {
    pick! {
      if #[cfg(all(target_feature="sse2",target_feature="fma"))] {
        Self { sse: fused_mul_neg_sub_m128(self.sse, m.sse, a.sse) }
      } else {
        -(self * m) - a
      }
    }
  }

  /// Flips the sign of each lane wherever `signs` is negative.
  #[inline]
  #[must_use]
  pub fn flip_signs(self, signs: Self) -> Self {
    self ^ (signs & Self::from(-0.0))
  }

  /// Combines the magnitude of `self` with the sign of `sign`, lanewise.
  #[inline]
  #[must_use]
  pub fn copysign(self, sign: Self) -> Self {
    let magnitude_mask = Self::from(f32::from_bits(u32::MAX >> 1));
    (self & magnitude_mask) | (sign & Self::from(-0.0))
  }

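  /// Computes both `asin` and `acos` of each lane at once, sharing the
  /// polynomial work. Lanes with |x| >= 0.5 go through the identity
  /// `asin(x) = pi/2 - 2*asin(sqrt((1 - x) / 2))`.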
  #[inline]
  pub fn asin_acos(self) -> (Self, Self) {
    const_f32_as_f32x4!(P4asinf, 4.2163199048E-2);
    const_f32_as_f32x4!(P3asinf, 2.4181311049E-2);
    const_f32_as_f32x4!(P2asinf, 4.5470025998E-2);
    const_f32_as_f32x4!(P1asinf, 7.4953002686E-2);
    const_f32_as_f32x4!(P0asinf, 1.6666752422E-1);

    let xa = self.abs();
    let big = xa.cmp_ge(f32x4::splat(0.5));

    let x1 = f32x4::splat(0.5) * (f32x4::ONE - xa);
    let x2 = xa * xa;
    let x3 = big.blend(x1, x2);

    let xb = x1.sqrt();

    let x4 = big.blend(xb, xa);

    let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
    let z = z.mul_add(x3 * x4, x4);

    let z1 = z + z;

    // acos
    let z3 = self.cmp_lt(f32x4::ZERO).blend(f32x4::PI - z1, z1);
    let z4 = f32x4::FRAC_PI_2 - z.flip_signs(self);
    let acos = big.blend(z3, z4);

    // asin
    let z3 = f32x4::FRAC_PI_2 - z1;
    let asin = big.blend(z3, z);
    let asin = asin.flip_signs(self);

    (asin, acos)
  }

  #[inline]
  pub fn asin(self) -> Self {
    const_f32_as_f32x4!(P4asinf, 4.2163199048E-2);
    const_f32_as_f32x4!(P3asinf, 2.4181311049E-2);
    const_f32_as_f32x4!(P2asinf, 4.5470025998E-2);
    const_f32_as_f32x4!(P1asinf, 7.4953002686E-2);
    const_f32_as_f32x4!(P0asinf, 1.6666752422E-1);

    let xa = self.abs();
    let big = xa.cmp_ge(f32x4::splat(0.5));

    let x1 = f32x4::splat(0.5) * (f32x4::ONE - xa);
    let x2 = xa * xa;
    let x3 = big.blend(x1, x2);

    let xb = x1.sqrt();

    let x4 = big.blend(xb, xa);

    let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
    let z = z.mul_add(x3 * x4, x4);

    let z1 = z + z;

    let z3 = f32x4::FRAC_PI_2 - z1;
    let asin = big.blend(z3, z);
    let asin = asin.flip_signs(self);

    asin
  }

  #[inline]
  #[must_use]
  pub fn acos(self) -> Self {
    const_f32_as_f32x4!(P4asinf, 4.2163199048E-2);
    const_f32_as_f32x4!(P3asinf, 2.4181311049E-2);
    const_f32_as_f32x4!(P2asinf, 4.5470025998E-2);
    const_f32_as_f32x4!(P1asinf, 7.4953002686E-2);
    const_f32_as_f32x4!(P0asinf, 1.6666752422E-1);

    let xa = self.abs();
    let big = xa.cmp_ge(f32x4::splat(0.5));

    let x1 = f32x4::splat(0.5) * (f32x4::ONE - xa);
    let x2 = xa * xa;
    let x3 = big.blend(x1, x2);

    let xb = x1.sqrt();

    let x4 = big.blend(xb, xa);

    let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
    let z = z.mul_add(x3 * x4, x4);

    let z1 = z + z;

    let z3 = self.cmp_lt(f32x4::ZERO).blend(f32x4::PI - z1, z1);
    let z4 = f32x4::FRAC_PI_2 - z.flip_signs(self);
    let acos = big.blend(z3, z4);

    acos
  }

  #[inline]
  pub fn atan(self) -> Self {
    const_f32_as_f32x4!(P3atanf, 8.05374449538E-2);
    const_f32_as_f32x4!(P2atanf, -1.38776856032E-1);
    const_f32_as_f32x4!(P1atanf, 1.99777106478E-1);
    const_f32_as_f32x4!(P0atanf, -3.33329491539E-1);

    let t = self.abs();

    // Reduce the argument using tan(pi/8) = sqrt(2) - 1 and
    // tan(3*pi/8) = sqrt(2) + 1 as the interval boundaries.
    let notsmal = t.cmp_ge(Self::SQRT_2 - Self::ONE);
    let notbig = t.cmp_le(Self::SQRT_2 + Self::ONE);

    let mut s = notbig.blend(Self::FRAC_PI_4, Self::FRAC_PI_2);
    s = notsmal & s;

    let mut a = notbig & t;
    a = notsmal.blend(a - Self::ONE, a);
    let mut b = notbig & Self::ONE;
    b = notsmal.blend(b + t, b);
    let z = a / b;

    let zz = z * z;

    let mut re = polynomial_3!(zz, P0atanf, P1atanf, P2atanf, P3atanf);
    re = re.mul_add(zz * z, z) + s;

    // Put the sign back.
    re = (self.sign_bit()).blend(-re, re);

    re
  }

  #[inline]
  pub fn atan2(self, x: Self) -> Self {
    const_f32_as_f32x4!(P3atanf, 8.05374449538E-2);
    const_f32_as_f32x4!(P2atanf, -1.38776856032E-1);
    const_f32_as_f32x4!(P1atanf, 1.99777106478E-1);
    const_f32_as_f32x4!(P0atanf, -3.33329491539E-1);

    let y = self;

    // Move into the first octant.
    let x1 = x.abs();
    let y1 = y.abs();
    let swapxy = y1.cmp_gt(x1);
    let mut x2 = swapxy.blend(y1, x1);
    let mut y2 = swapxy.blend(x1, y1);

    // Special case: both inputs infinite. Masking with the bits of -1.0
    // clips the infinities to 1, so the ratio stays meaningful.
    let both_infinite = x.is_inf() & y.is_inf();
    if both_infinite.any() {
      let minus_one = -Self::ONE;
      x2 = both_infinite.blend(x2 & minus_one, x2);
      y2 = both_infinite.blend(y2 & minus_one, y2);
    }

    let t = y2 / x2;

    // Same reduction as `atan`: the boundary is tan(pi/8) = sqrt(2) - 1.
    let notsmal = t.cmp_ge(Self::SQRT_2 - Self::ONE);

    let a = notsmal.blend(t - Self::ONE, t);
    let b = notsmal.blend(t + Self::ONE, Self::ONE);
    let s = notsmal & Self::FRAC_PI_4;
    let z = a / b;

    let zz = z * z;

    let mut re = polynomial_3!(zz, P0atanf, P1atanf, P2atanf, P3atanf);
    re = re.mul_add(zz * z, z) + s;

    // Undo the octant moves and fix up the quadrant.
    re = swapxy.blend(Self::FRAC_PI_2 - re, re);
    re = ((x | y).cmp_eq(Self::ZERO)).blend(Self::ZERO, re);
    re = (x.sign_bit()).blend(Self::PI - re, re);

    // Put the sign of y back.
    re = (y.sign_bit()).blend(-re, re);

    re
  }

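  /// Computes the sine and cosine of each lane in one pass. The angle is
  /// reduced to [-pi/4, pi/4] via multiples of pi/2 (the pi/2 constant is
  /// split across DP1F/DP2F/DP3F for extra precision), separate sin and
  /// cos polynomials are evaluated, and the quadrant bits of `q` select
  /// and sign the results.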
  #[inline]
  #[must_use]
  pub fn sin_cos(self) -> (Self, Self) {
    // pi/2 split into three parts for extended-precision reduction.
    const_f32_as_f32x4!(DP1F, 0.78515625_f32 * 2.0);
    const_f32_as_f32x4!(DP2F, 2.4187564849853515625E-4_f32 * 2.0);
    const_f32_as_f32x4!(DP3F, 3.77489497744594108E-8_f32 * 2.0);

    const_f32_as_f32x4!(P0sinf, -1.6666654611E-1);
    const_f32_as_f32x4!(P1sinf, 8.3321608736E-3);
    const_f32_as_f32x4!(P2sinf, -1.9515295891E-4);

    const_f32_as_f32x4!(P0cosf, 4.166664568298827E-2);
    const_f32_as_f32x4!(P1cosf, -1.388731625493765E-3);
    const_f32_as_f32x4!(P2cosf, 2.443315711809948E-5);

    const_f32_as_f32x4!(TWO_OVER_PI, 2.0 / core::f32::consts::PI);

    let xa = self.abs();

    // Quadrant count: q = round(|x| * 2/pi).
    let y = (xa * TWO_OVER_PI).round();
    let q: i32x4 = y.round_int();

    // Reduce modulo pi/2 using the three-part split.
    let x = y.mul_neg_add(DP3F, y.mul_neg_add(DP2F, y.mul_neg_add(DP1F, xa)));

    let x2 = x * x;
    let mut s = polynomial_2!(x2, P0sinf, P1sinf, P2sinf) * (x * x2) + x;
    let mut c = polynomial_2!(x2, P0cosf, P1cosf, P2cosf) * (x2 * x2)
      + f32x4::from(0.5).mul_neg_add(x2, f32x4::from(1.0));

    // Odd quadrants swap sin and cos.
    let swap = !(q & i32x4::from(1)).cmp_eq(i32x4::from(0));

    // Huge inputs lose all precision in the reduction; pin them to (0, 1).
    let mut overflow: f32x4 = cast(q.cmp_gt(i32x4::from(0x2000000)));
    overflow &= xa.is_finite();
    s = overflow.blend(f32x4::from(0.0), s);
    c = overflow.blend(f32x4::from(1.0), c);

    // Select and apply the quadrant signs.
    let mut sin1 = cast::<_, f32x4>(swap).blend(c, s);
    let sign_sin: i32x4 = (q << 30) ^ cast::<_, i32x4>(self);
    sin1 = sin1.flip_signs(cast(sign_sin));

    let mut cos1 = cast::<_, f32x4>(swap).blend(s, c);
    let sign_cos: i32x4 = ((q + i32x4::from(1)) & i32x4::from(2)) << 30;
    cos1 ^= cast::<_, f32x4>(sign_cos);

    (sin1, cos1)
  }

  #[inline]
  #[must_use]
  pub fn sin(self) -> Self {
    let (s, _) = self.sin_cos();
    s
  }

  #[inline]
  #[must_use]
  pub fn cos(self) -> Self {
    let (_, c) = self.sin_cos();
    c
  }

  #[inline]
  #[must_use]
  pub fn tan(self) -> Self {
    let (s, c) = self.sin_cos();
    s / c
  }

  #[inline]
  #[must_use]
  pub fn to_degrees(self) -> Self {
    const_f32_as_f32x4!(RAD_TO_DEG_RATIO, 180.0_f32 / core::f32::consts::PI);
    self * RAD_TO_DEG_RATIO
  }

  #[inline]
  #[must_use]
  pub fn to_radians(self) -> Self {
    const_f32_as_f32x4!(DEG_TO_RAD_RATIO, core::f32::consts::PI / 180.0_f32);
    self * DEG_TO_RAD_RATIO
  }

  #[inline]
  #[must_use]
  pub fn recip(self) -> Self {
    pick! {
      if #[cfg(target_feature="sse")] {
        // Note: the SSE path uses the hardware reciprocal approximation,
        // not a full-precision divide.
        Self { sse: reciprocal_m128(self.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: f32x4_div(f32x4_splat(1.0), self.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
        unsafe { Self { neon: vdivq_f32(vdupq_n_f32(1.0), self.neon) } }
      } else {
        Self { arr: [
          1.0 / self.arr[0],
          1.0 / self.arr[1],
          1.0 / self.arr[2],
          1.0 / self.arr[3],
        ]}
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn recip_sqrt(self) -> Self {
    pick! {
      if #[cfg(target_feature="sse")] {
        Self { sse: reciprocal_sqrt_m128(self.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: f32x4_div(f32x4_splat(1.0), f32x4_sqrt(self.simd)) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
        unsafe { Self { neon: vdivq_f32(vdupq_n_f32(1.0), vsqrtq_f32(self.neon)) } }
      } else if #[cfg(feature="std")] {
        Self { arr: [
          1.0 / self.arr[0].sqrt(),
          1.0 / self.arr[1].sqrt(),
          1.0 / self.arr[2].sqrt(),
          1.0 / self.arr[3].sqrt(),
        ]}
      } else {
        Self { arr: [
          1.0 / software_sqrt(self.arr[0] as f64) as f32,
          1.0 / software_sqrt(self.arr[1] as f64) as f32,
          1.0 / software_sqrt(self.arr[2] as f64) as f32,
          1.0 / software_sqrt(self.arr[3] as f64) as f32,
        ]}
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn sqrt(self) -> Self {
    pick! {
      if #[cfg(target_feature="sse")] {
        Self { sse: sqrt_m128(self.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: f32x4_sqrt(self.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
        unsafe { Self { neon: vsqrtq_f32(self.neon) } }
      } else if #[cfg(feature="std")] {
        Self { arr: [
          self.arr[0].sqrt(),
          self.arr[1].sqrt(),
          self.arr[2].sqrt(),
          self.arr[3].sqrt(),
        ]}
      } else {
        Self { arr: [
          software_sqrt(self.arr[0] as f64) as f32,
          software_sqrt(self.arr[1] as f64) as f32,
          software_sqrt(self.arr[2] as f64) as f32,
          software_sqrt(self.arr[3] as f64) as f32,
        ]}
      }
    }
  }

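  /// Packs the sign bit of each lane into the low 4 bits of an `i32`
  /// (lane 0 is bit 0).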
  #[inline]
  #[must_use]
  pub fn move_mask(self) -> i32 {
    pick! {
      if #[cfg(target_feature="sse")] {
        move_mask_m128(self.sse)
      } else if #[cfg(target_feature="simd128")] {
        u32x4_bitmask(self.simd) as i32
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
        unsafe {
          // Set a lane to all ones if its top bit is set, else all zeros.
          let masked = vcltq_s32(vreinterpretq_s32_f32(self.neon), vdupq_n_s32(0));

          // Select one output bit per lane.
          let selectbit: uint32x4_t = core::mem::transmute([1u32, 2, 4, 8]);
          let r = vandq_u32(masked, selectbit);

          // Horizontally add to combine the bits.
          vaddvq_u32(r) as i32
        }
      } else {
        (((self.arr[0].to_bits() as i32) < 0) as i32) << 0 |
        (((self.arr[1].to_bits() as i32) < 0) as i32) << 1 |
        (((self.arr[2].to_bits() as i32) < 0) as i32) << 2 |
        (((self.arr[3].to_bits() as i32) < 0) as i32) << 3
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn any(self) -> bool {
    pick! {
      if #[cfg(target_feature="simd128")] {
        v128_any_true(self.simd)
      } else {
        self.move_mask() != 0
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn all(self) -> bool {
    pick! {
      if #[cfg(target_feature="simd128")] {
        u32x4_all_true(self.simd)
      } else {
        self.move_mask() == 0b1111
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn none(self) -> bool {
    !self.any()
  }

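  // Computes 2^n for integral `n` without a table: adding `bias + 2^23`
  // leaves the integer `n + 127` in the low mantissa bits of `a`, and the
  // `<< 23` shift moves it into the exponent field, yielding the float 2^n.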
  #[inline]
  fn vm_pow2n(self) -> Self {
    const_f32_as_f32x4!(pow2_23, 8388608.0);
    const_f32_as_f32x4!(bias, 127.0);
    let a = self + (bias + pow2_23);
    let c = cast::<_, i32x4>(a) << 23;
    cast::<_, f32x4>(c)
  }

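  /// Calculates `e^self` for each lane. Lanes outside roughly +/-87.3,
  /// and non-finite lanes, come back as zero.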
  #[inline]
  #[must_use]
  pub fn exp(self) -> Self {
    const_f32_as_f32x4!(P0, 1.0 / 2.0);
    const_f32_as_f32x4!(P1, 1.0 / 6.0);
    const_f32_as_f32x4!(P2, 1.0 / 24.0);
    const_f32_as_f32x4!(P3, 1.0 / 120.0);
    const_f32_as_f32x4!(P4, 1.0 / 720.0);
    const_f32_as_f32x4!(P5, 1.0 / 5040.0);
    const_f32_as_f32x4!(LN2D_HI, 0.693359375);
    const_f32_as_f32x4!(LN2D_LO, -2.12194440e-4);
    let max_x = f32x4::from(87.3);
    // Range reduction: x = self - r * ln(2), with r = round(self * log2(e))
    // and ln(2) split into hi/lo parts for precision.
    let r = (self * Self::LOG2_E).round();
    let x = r.mul_neg_add(LN2D_HI, self);
    let x = r.mul_neg_add(LN2D_LO, x);
    // Taylor polynomial for e^x, then scale by 2^r.
    let z = polynomial_5!(x, P0, P1, P2, P3, P4, P5);
    let x2 = x * x;
    let z = z.mul_add(x2, x);
    let n2 = Self::vm_pow2n(r);
    let z = (z + Self::ONE) * n2;
    let in_range = self.abs().cmp_lt(max_x);
    let in_range = in_range & self.is_finite();
    in_range.blend(z, Self::ZERO)
  }

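  // Extracts the exponent of each lane as a float: shift the exponent
  // field down, splice it onto the bits of 2^23 so the small integer
  // becomes a float, then subtract the bias (and the 2^23) back out.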
  #[inline]
  fn exponent(self) -> f32x4 {
    const_f32_as_f32x4!(pow2_23, 8388608.0);
    const_f32_as_f32x4!(bias, 127.0);
    let a = cast::<_, u32x4>(self);
    let b = a >> 23;
    let c = b | cast::<_, u32x4>(pow2_23);
    let d = cast::<_, f32x4>(c);
    let e = d - (pow2_23 + bias);
    e
  }

  // Returns the mantissa of each lane scaled into [0.5, 1.0) by forcing
  // the exponent field to that of 0.5.
  #[inline]
  fn fraction_2(self) -> Self {
    let t1 = cast::<_, u32x4>(self);
    let t2 = cast::<_, u32x4>(
      (t1 & u32x4::from(0x007FFFFF)) | u32x4::from(0x3F000000),
    );
    cast::<_, f32x4>(t2)
  }

  // -1.0 in lanes whose exponent field is all zero (the lane is zero or
  // subnormal), 0.0 elsewhere.
  #[inline]
  fn is_zero_or_subnormal(self) -> Self {
    let t = cast::<_, i32x4>(self);
    let t = t & i32x4::splat(0x7F800000);
    i32x4::round_float(t.cmp_eq(i32x4::splat(0)))
  }

  #[inline]
  fn infinity() -> Self {
    cast::<_, f32x4>(i32x4::splat(0x7F800000))
  }

  #[inline]
  fn nan_log() -> Self {
    cast::<_, f32x4>(i32x4::splat(0x7FC00000 | (0x101 & 0x003FFFFF)))
  }

  #[inline]
  fn nan_pow() -> Self {
    cast::<_, f32x4>(i32x4::splat(0x7FC00000 | (0x101 & 0x003FFFFF)))
  }

  /// All-ones lanes where the sign bit is set, all-zero lanes otherwise.
  #[inline]
  pub fn sign_bit(self) -> Self {
    let t1 = cast::<_, i32x4>(self);
    let t2 = t1 >> 31;
    !cast::<_, f32x4>(t2).cmp_eq(f32x4::ZERO)
  }

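  /// Horizontal add: the sum of all four lanes as a scalar.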
  #[inline]
  #[must_use]
  pub fn reduce_add(self) -> f32 {
    let arr: [f32; 4] = cast(self);
    arr.iter().sum()
  }

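  /// Natural log (ln(x)) of each lane.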
  #[inline]
  #[must_use]
  pub fn ln(self) -> Self {
    const_f32_as_f32x4!(HALF, 0.5);
    const_f32_as_f32x4!(P0, 3.3333331174E-1);
    const_f32_as_f32x4!(P1, -2.4999993993E-1);
    const_f32_as_f32x4!(P2, 2.0000714765E-1);
    const_f32_as_f32x4!(P3, -1.6668057665E-1);
    const_f32_as_f32x4!(P4, 1.4249322787E-1);
    const_f32_as_f32x4!(P5, -1.2420140846E-1);
    const_f32_as_f32x4!(P6, 1.1676998740E-1);
    const_f32_as_f32x4!(P7, -1.1514610310E-1);
    const_f32_as_f32x4!(P8, 7.0376836292E-2);
    const_f32_as_f32x4!(LN2F_HI, 0.693359375);
    const_f32_as_f32x4!(LN2F_LO, -2.12194440e-4);
    const_f32_as_f32x4!(VM_SMALLEST_NORMAL, 1.17549435E-38);

    // Split the input into mantissa in [0.5, 1.0) and exponent, then
    // nudge mantissas below sqrt(2)/2 up a binade so x stays near 1.
    let x1 = self;
    let x = Self::fraction_2(x1);
    let e = Self::exponent(x1);
    let mask = x.cmp_gt(Self::SQRT_2 * HALF);
    let x = (!mask).blend(x + x, x);
    let fe = mask.blend(e + Self::ONE, e);
    let x = x - Self::ONE;

    // Polynomial approximation of ln(1 + x); ln(2) is split into hi/lo
    // parts so adding the exponent back loses less precision.
    let res = polynomial_8!(x, P0, P1, P2, P3, P4, P5, P6, P7, P8);
    let x2 = x * x;
    let res = x2 * x * res;
    let res = fe.mul_add(LN2F_LO, res);
    let res = res + x2.mul_neg_add(HALF, x);
    let res = fe.mul_add(LN2F_HI, res);

    // Special-case handling: subnormal, zero, and non-finite inputs.
    let overflow = !self.is_finite();
    let underflow = x1.cmp_lt(VM_SMALLEST_NORMAL);
    let mask = overflow | underflow;
    if !mask.any() {
      res
    } else {
      let is_zero = self.is_zero_or_subnormal();
      let res = underflow.blend(Self::nan_log(), res);
      let res = is_zero.blend(Self::infinity(), res);
      let res = overflow.blend(self, res);
      res
    }
  }

  #[inline]
  #[must_use]
  pub fn log2(self) -> Self {
    Self::ln(self) * Self::LOG2_E
  }

  #[inline]
  #[must_use]
  pub fn log10(self) -> Self {
    Self::ln(self) * Self::LOG10_E
  }

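  /// Raises each lane to the power of the matching lane in `y`,
  /// effectively computing `2^(y * log2(self))` with extended-precision
  /// intermediates.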
  #[inline]
  #[must_use]
  pub fn pow_f32x4(self, y: f32x4) -> Self {
    const_f32_as_f32x4!(ln2f_hi, 0.693359375);
    const_f32_as_f32x4!(ln2f_lo, -2.12194440e-4);
    const_f32_as_f32x4!(P0logf, 3.3333331174E-1);
    const_f32_as_f32x4!(P1logf, -2.4999993993E-1);
    const_f32_as_f32x4!(P2logf, 2.0000714765E-1);
    const_f32_as_f32x4!(P3logf, -1.6668057665E-1);
    const_f32_as_f32x4!(P4logf, 1.4249322787E-1);
    const_f32_as_f32x4!(P5logf, -1.2420140846E-1);
    const_f32_as_f32x4!(P6logf, 1.1676998740E-1);
    const_f32_as_f32x4!(P7logf, -1.1514610310E-1);
    const_f32_as_f32x4!(P8logf, 7.0376836292E-2);

    const_f32_as_f32x4!(p2expf, 1.0 / 2.0);
    const_f32_as_f32x4!(p3expf, 1.0 / 6.0);
    const_f32_as_f32x4!(p4expf, 1.0 / 24.0);
    const_f32_as_f32x4!(p5expf, 1.0 / 120.0);
    const_f32_as_f32x4!(p6expf, 1.0 / 720.0);
    const_f32_as_f32x4!(p7expf, 1.0 / 5040.0);

    // ln(|self|) in extended precision, as in `ln` above.
    let x1 = self.abs();
    let x = x1.fraction_2();

    let mask = x.cmp_gt(f32x4::SQRT_2 * f32x4::HALF);
    let x = (!mask).blend(x + x, x);

    let x = x - f32x4::ONE;
    let x2 = x * x;
    let lg1 = polynomial_8!(
      x, P0logf, P1logf, P2logf, P3logf, P4logf, P5logf, P6logf, P7logf, P8logf
    );
    let lg1 = lg1 * x2 * x;

    let ef = x1.exponent();
    let ef = mask.blend(ef + f32x4::ONE, ef);

    // Split y * log2(x) into an integer part (applied to the exponent)
    // and a fractional part (fed through the exp polynomial).
    let e1 = (ef * y).round();
    let yr = ef.mul_sub(y, e1);

    let lg = f32x4::HALF.mul_neg_add(x2, x) + lg1;
    let x2_err = (f32x4::HALF * x).mul_sub(x, f32x4::HALF * x2);
    let lg_err = f32x4::HALF.mul_add(x2, lg - x) - lg1;

    let e2 = (lg * y * f32x4::LOG2_E).round();
    let v = lg.mul_sub(y, e2 * ln2f_hi);
    let v = e2.mul_neg_add(ln2f_lo, v);
    let v = v - (lg_err + x2_err).mul_sub(y, yr * f32x4::LN_2);

    // exp of the remainder.
    let x = v;
    let e3 = (x * f32x4::LOG2_E).round();
    let x = e3.mul_neg_add(f32x4::LN_2, x);
    let x2 = x * x;
    let z = x2.mul_add(
      polynomial_5!(x, p2expf, p3expf, p4expf, p5expf, p6expf, p7expf),
      x + f32x4::ONE,
    );

    let ee = e1 + e2 + e3;
    let ei = cast::<_, i32x4>(ee.round_int());
    let ej = cast::<_, i32x4>(ei + (cast::<_, i32x4>(z) >> 23));

    let overflow = cast::<_, f32x4>(ej.cmp_gt(i32x4::splat(0x0FF)))
      | (ee.cmp_gt(f32x4::splat(300.0)));
    let underflow = cast::<_, f32x4>(ej.cmp_lt(i32x4::splat(0x000)))
      | (ee.cmp_lt(f32x4::splat(-300.0)));

    // Apply the accumulated exponent by integer addition on the bits.
    let z = cast::<_, f32x4>(cast::<_, i32x4>(z) + (ei << 23));

    let z = if (overflow | underflow).any() {
      let z = underflow.blend(f32x4::ZERO, z);
      overflow.blend(Self::infinity(), z)
    } else {
      z
    };

    // 0^y: infinity for y < 0, one for y == 0, zero for y > 0.
    let x_zero = self.is_zero_or_subnormal();
    let z = x_zero.blend(
      y.cmp_lt(f32x4::ZERO).blend(
        Self::infinity(),
        y.cmp_eq(f32x4::ZERO).blend(f32x4::ONE, f32x4::ZERO),
      ),
      z,
    );

    // Negative bases: the result is NaN unless y is an integer, and the
    // sign is set for odd y.
    let x_sign = self.sign_bit();
    let z = if x_sign.any() {
      let yi = y.cmp_eq(y.round());
      let y_odd = cast::<_, i32x4>(y.round_int() << 31).round_float();

      let z1 =
        yi.blend(z | y_odd, self.cmp_eq(Self::ZERO).blend(z, Self::nan_pow()));
      x_sign.blend(z1, z)
    } else {
      z
    };

    let x_finite = self.is_finite();
    let y_finite = y.is_finite();
    let e_finite = ee.is_finite();
    if (x_finite & y_finite & (e_finite | x_zero)).all() {
      return z;
    }

    (self.is_nan() | y.is_nan()).blend(self + y, z)
  }

  #[inline]
  pub fn powf(self, y: f32) -> Self {
    Self::pow_f32x4(self, f32x4::splat(y))
  }

  #[inline]
  pub fn to_array(self) -> [f32; 4] {
    cast(self)
  }

  #[inline]
  pub fn as_array_ref(&self) -> &[f32; 4] {
    cast_ref(self)
  }

  #[inline]
  pub fn as_array_mut(&mut self) -> &mut [f32; 4] {
    cast_mut(self)
  }

  #[inline]
  pub fn from_i32x4(v: i32x4) -> Self {
    pick! {
      if #[cfg(target_feature="sse2")] {
        Self { sse: convert_to_m128_from_i32_m128i(v.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: f32x4_convert_i32x4(v.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
        Self { neon: unsafe { vcvtq_f32_s32(v.neon) } }
      } else {
        Self { arr: [
          v.as_array_ref()[0] as f32,
          v.as_array_ref()[1] as f32,
          v.as_array_ref()[2] as f32,
          v.as_array_ref()[3] as f32,
        ] }
      }
    }
  }
}
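
// A minimal usage sketch, not part of the original file: the test below
// exercises the arithmetic, `max`, and `sqrt` APIs defined above. It
// assumes the crate's usual `From<[f32; 4]>` impl for `f32x4` (the same
// one `new` delegates to); the module and test names are illustrative.
#[cfg(test)]
mod f32x4_usage_sketch {
  use super::*;

  #[test]
  fn basic_ops() {
    let a = f32x4::from([1.0, 2.0, 3.0, 4.0]);
    let b = f32x4::splat(2.0);
    // Lanewise arithmetic via the operator impls.
    assert_eq!((a + b).to_array(), [3.0, 4.0, 5.0, 6.0]);
    assert_eq!((a * b).to_array(), [2.0, 4.0, 6.0, 8.0]);
    // `max` picks the larger value per lane (and handles NaN lanes).
    assert_eq!(a.max(b).to_array(), [2.0, 2.0, 3.0, 4.0]);
    // `sqrt` is exact for perfect squares.
    assert_eq!(f32x4::splat(4.0).sqrt().to_array(), [2.0; 4]);
  }
}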