28 #if !defined(SIMDE__SSE_H)
29 #if !defined(SIMDE__SSE_H)
34 #if defined(SIMDE_SSE_NATIVE)
35 #undef SIMDE_SSE_NATIVE
37 #if defined(SIMDE_SSE_FORCE_NATIVE)
38 #define SIMDE_SSE_NATIVE
39 #elif defined(__SSE__) && !defined(SIMDE_SSE_NO_NATIVE) && \
40 !defined(SIMDE_NO_NATIVE)
41 #define SIMDE_SSE_NATIVE
42 #elif defined(__ARM_NEON) && !defined(SIMDE_SSE_NO_NEON) && \
43 !defined(SIMDE_NO_NEON)
44 #define SIMDE_SSE_NEON
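/* Note: the ladder above selects an implementation in priority order --
 * SIMDE_SSE_FORCE_NATIVE overrides everything, otherwise native SSE is used
 * when __SSE__ is available and not suppressed, otherwise NEON, and if
 * neither macro ends up defined the portable scalar fallbacks later in this
 * header are compiled. */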
47 #if defined(SIMDE_SSE_NATIVE) && !defined(SIMDE_MMX_NATIVE)
48 #if defined(SIMDE_SSE_FORCE_NATIVE)
49 #error Native SSE support requires native MMX support
51 #warning Native SSE support requires native MMX support, disabling
52 #undef SIMDE_SSE_NATIVE
54 #elif defined(SIMDE_SSE_NEON) && !defined(SIMDE_MMX_NEON)
55 #warning SSE NEON support requires MMX NEON support, disabling
56 #undef SIMDE_SSE_NEON
59 #if defined(SIMDE_SSE_NATIVE)
60 #include <xmmintrin.h>
62 #if defined(SIMDE_SSE_NEON)
66 #if !defined(__INTEL_COMPILER) && defined(__STDC_VERSION__) && \
67 (__STDC_VERSION__ >= 201112L) && !defined(__STDC_NO_ATOMICS__)
68 #include <stdatomic.h>
77 #define SIMDE_ALIGN(alignment) __attribute__((aligned(alignment)))
81 #if defined(SIMDE__ENABLE_GCC_VEC_EXT)
82 int8_t i8 __attribute__((__vector_size__(16), __may_alias__));
83 int16_t i16 __attribute__((__vector_size__(16), __may_alias__));
84 int32_t i32 __attribute__((__vector_size__(16), __may_alias__));
85 int64_t i64 __attribute__((__vector_size__(16), __may_alias__));
86 uint8_t u8 __attribute__((__vector_size__(16), __may_alias__));
87 uint16_t u16 __attribute__((__vector_size__(16), __may_alias__));
88 uint32_t u32 __attribute__((__vector_size__(16), __may_alias__));
89 uint64_t u64 __attribute__((__vector_size__(16), __may_alias__));
90 #if defined(SIMDE__HAVE_INT128)
91 simde_int128 i128 __attribute__((__vector_size__(16), __may_alias__));
92 simde_uint128 u128 __attribute__((__vector_size__(16), __may_alias__));
94 simde_float32 f32 __attribute__((__vector_size__(16), __may_alias__));
104 #if defined(SIMDE__HAVE_INT128)
105 simde_int128 i128[1];
106 simde_uint128 u128[1];
111 #if defined(SIMDE_SSE_NATIVE)
113 #elif defined(SIMDE_SSE_NEON)
122 float32x4_t neon_f32;
126 #if defined(SIMDE_SSE_NATIVE)
128 "__m128 size doesn't match simde__m128 size");
135 #elif defined(SIMDE_SSE_NEON)
136 #define SIMDE__M128_NEON_C(T, expr) \
137 (simde__m128) { .neon_##T = expr }
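/* Every simde_mm_* function below follows the same three-way dispatch:
 * SIMDE_SSE_NATIVE forwards to the real <xmmintrin.h> intrinsic through the
 * .n member, SIMDE_SSE_NEON maps the operation onto ARM NEON through the
 * .neon_* members, and the final branch is a portable scalar loop over the
 * union's arrays.  SIMDE__M128_NEON_C is a convenience compound literal for
 * wrapping a NEON vector back into a simde__m128. */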
146 #if defined(SIMDE_SSE_NATIVE)
147 r.n = _mm_add_ps(a.n, b.n);
148 #elif defined(SIMDE_SSE_NEON)
149 r.neon_f32 = vaddq_f32(a.neon_f32, b.neon_f32);
152 for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
153 r.f32[i] = a.f32[i] + b.f32[i];
165 #if defined(SIMDE_SSE_NATIVE)
166 r.n = _mm_add_ss(a.n, b.n);
167 #elif defined(SIMDE_SSE_NEON)
168 float32_t b0 = vgetq_lane_f32(b.neon_f32, 0);
169 float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
171 r.neon_f32 = vaddq_f32(a.neon_f32, value);
172 #elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION)
173 r.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.f32, simde_mm_add_ps(a, b).f32,
176 r.f32[0] = a.f32[0] + b.f32[0];
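/* Only lane 0 carries the sum, matching the native _mm_add_ss semantics: the
 * NEON branch adds a zero vector holding b0 in lane 0, the SHUFFLE_VECTOR
 * branch selects lane 0 of the full simde_mm_add_ps result and lanes 1-3 of
 * a, and the scalar branch writes r.f32[0] while the upper lanes stay equal
 * to a. */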
190 #if defined(SIMDE_SSE_NATIVE)
191 r.n = _mm_and_ps(a.n, b.n);
192 #elif defined(SIMDE_SSE_NEON)
193 r.neon_i32 = vandq_s32(a.neon_i32, b.neon_i32);
196 for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
197 r.i32[i] = a.i32[i] & b.i32[i];
209 #if defined(SIMDE_SSE_NATIVE)
210 r.n = _mm_andnot_ps(a.n, b.n);
211 #elif defined(SIMDE_SSE_NEON)
212 r.neon_i32 = vbicq_s32(b.neon_i32, a.neon_i32);
215 for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
216 r.i32[i] = ~(a.i32[i]) & b.i32[i];
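/* _mm_andnot_ps computes (~a) & b, i.e. the complement applies to the FIRST
 * operand.  NEON's vbicq ("bit clear") computes x & ~y, which is why the
 * operands are swapped in the vbicq_s32 call above. */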
228 #if defined(SIMDE_SSE_NATIVE)
229 r.n = _mm_avg_pu16(a.n, b.n);
230 #elif defined(SIMDE_SSE_NEON)
231 r.neon_u16 = vrhadd_u16(b.neon_u16, a.neon_u16);
234 for (size_t i = 0; i < 4; i++) {
235 r.u16[i] = (a.u16[i] + b.u16[i] + 1) >> 1;
241 #define simde_m_pavgw(a, b) simde_mm_avg_pu16(a, b)
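/* Both averaging intrinsics use the rounding average (a + b + 1) >> 1, which
 * is exactly what PAVGW/PAVGB and NEON's vrhadd ("rounding halving add")
 * compute, so the three branches agree bit-for-bit. */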
248 #if defined(SIMDE_SSE_NATIVE)
249 r.n = _mm_avg_pu8(a.n, b.n);
250 #elif defined(SIMDE_SSE_NEON)
251 r.neon_u8 = vrhadd_u8(b.neon_u8, a.neon_u8);
254 for (size_t i = 0; i < 8; i++) {
255 r.u8[i] = (a.u8[i] + b.u8[i] + 1) >> 1;
261 #define simde_m_pavgb(a, b) simde_mm_avg_pu8(a, b)
268 #if defined(SIMDE_SSE_NATIVE)
269 r.n = _mm_cmpeq_ps(a.n, b.n);
270 #elif defined(SIMDE_SSE_NEON)
271 r.neon_u32 = vceqq_f32(a.neon_f32, b.neon_f32);
274 for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
275 r.u32[i] = (a.f32[i] == b.f32[i]) ? 0xffffffff : 0;
287 #if defined(SIMDE_SSE_NATIVE)
288 r.n = _mm_cmpeq_ss(a.n, b.n);
289 #elif defined(SIMDE_SSE_NEON)
291 vreinterpretq_f32_u32(vceqq_f32(a.neon_f32, b.neon_f32));
292 float32x4_t t = vextq_f32(a.neon_f32, s, 1);
293 r.neon_f32 = vextq_f32(t, t, 3);
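/* The two vextq_f32 calls are a lane-merge trick that recurs in all of the
 * *_ss comparisons below: the first rotate places lane 0 of the comparison
 * result s after lanes 1-3 of a, and the second rotate by 3 restores the
 * original order, leaving lane 0 taken from s and lanes 1-3 from a. */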
294 #elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION)
295 r.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.f32,
298 r.u32[0] = (a.f32[0] == b.f32[0]) ? 0xffffffff : 0;
300 for (size_t i = 1; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
313 #if defined(SIMDE_SSE_NATIVE)
314 r.n = _mm_cmpge_ps(a.n, b.n);
315 #elif defined(SIMDE_SSE_NEON)
316 r.neon_u32 = vcgeq_f32(a.neon_f32, b.neon_f32);
319 for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
320 r.u32[i] = (a.f32[i] >= b.f32[i]) ? 0xffffffff : 0;
332 #if defined(SIMDE_SSE_NATIVE) && !defined(__PGI)
333 r.n = _mm_cmpge_ss(a.n, b.n);
334 #elif defined(SIMDE_SSE_NEON)
336 vreinterpretq_f32_u32(vcgeq_f32(a.neon_f32, b.neon_f32));
337 float32x4_t t = vextq_f32(a.neon_f32, s, 1);
338 r.neon_f32 = vextq_f32(t, t, 3);
339 #elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION)
340 r.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.f32,
343 r.u32[0] = (a.f32[0] >= b.f32[0]) ? 0xffffffff : 0;
345 for (size_t i = 1; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
358 #if defined(SIMDE_SSE_NATIVE)
359 r.n = _mm_cmpgt_ps(a.n, b.n);
360 #elif defined(SIMDE_SSE_NEON)
361 r.neon_u32 = vcgtq_f32(a.neon_f32, b.neon_f32);
364 for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
365 r.u32[i] = (a.f32[i] > b.f32[i]) ? 0xffffffff : 0;
377 #if defined(SIMDE_SSE_NATIVE) && !defined(__PGI)
378 r.n = _mm_cmpgt_ss(a.n, b.n);
379 #elif defined(SIMDE_SSE_NEON)
381 vreinterpretq_f32_u32(vcgtq_f32(a.neon_f32, b.neon_f32));
382 float32x4_t t = vextq_f32(a.neon_f32, s, 1);
383 r.neon_f32 = vextq_f32(t, t, 3);
384 #elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION)
385 r.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.f32,
388 r.u32[0] = (a.f32[0] > b.f32[0]) ? 0xffffffff : 0;
390 for (size_t i = 1; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
403 #if defined(SIMDE_SSE_NATIVE)
404 r.n = _mm_cmple_ps(a.n, b.n);
405 #elif defined(SIMDE_SSE_NEON)
406 r.neon_u32 = vcleq_f32(a.neon_f32, b.neon_f32);
409 for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
410 r.u32[i] = (a.f32[i] <= b.f32[i]) ? 0xffffffff : 0;
422 #if defined(SIMDE_SSE_NATIVE)
423 r.n = _mm_cmple_ss(a.n, b.n);
424 #elif defined(SIMDE_SSE_NEON)
426 vreinterpretq_f32_u32(vcleq_f32(a.neon_f32, b.neon_f32));
427 float32x4_t t = vextq_f32(a.neon_f32, s, 1);
428 r.neon_f32 = vextq_f32(t, t, 3);
429 #elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION)
430 r.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.f32,
433 r.u32[0] = (a.f32[0] <= b.f32[0]) ? 0xffffffff : 0;
435 for (size_t i = 1; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
448 #if defined(SIMDE_SSE_NATIVE)
449 r.n = _mm_cmplt_ps(a.n, b.n);
450 #elif defined(SIMDE_SSE_NEON)
451 r.neon_u32 = vcltq_f32(a.neon_f32, b.neon_f32);
454 for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
455 r.u32[i] = (a.f32[i] < b.f32[i]) ? 0xffffffff : 0;
467 #if defined(SIMDE_SSE_NATIVE)
468 r.n = _mm_cmplt_ss(a.n, b.n);
469 #elif defined(SIMDE_SSE_NEON)
471 vreinterpretq_f32_u32(vcltq_f32(a.neon_f32, b.neon_f32));
472 float32x4_t t = vextq_f32(a.neon_f32, s, 1);
473 r.neon_f32 = vextq_f32(t, t, 3);
474 #elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION)
475 r.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.f32,
478 r.u32[0] = (a.f32[0] < b.f32[0]) ? 0xffffffff : 0;
480 for (size_t i = 1; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
493 #if defined(SIMDE_SSE_NATIVE)
494 r.n = _mm_cmpneq_ps(a.n, b.n);
495 #elif defined(SIMDE_SSE_NEON)
496 r.neon_u32 = vmvnq_u32(vceqq_f32(a.neon_f32, b.neon_f32));
499 for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
500 r.u32[i] = (a.f32[i] != b.f32[i]) ? 0xffffffff : 0;
512 #if defined(SIMDE_SSE_NATIVE)
513 r.n = _mm_cmpneq_ss(a.n, b.n);
514 #elif defined(SIMDE_SSE_NEON)
516 vreinterpretq_f32_u32(vceqq_f32(a.neon_f32, b.neon_f32));
518 vreinterpretq_f32_u32(vmvnq_u32(vreinterpretq_u32_f32(e)));
519 float32x4_t t = vextq_f32(a.neon_f32, s, 1);
520 r.neon_f32 = vextq_f32(t, t, 3);
521 #elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION)
522 r.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.f32,
525 r.u32[0] = (a.f32[0] != b.f32[0]) ? 0xffffffff : 0;
527 for (size_t i = 1; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
540 #if defined(SIMDE_SSE_NATIVE)
541 r.n = _mm_cmpnge_ps(a.n, b.n);
542 #elif defined(SIMDE_SSE_NEON)
543 r.neon_u32 = vcltq_f32(a.neon_f32, b.neon_f32);
556 #if defined(SIMDE_SSE_NATIVE) && !defined(__PGI)
557 r.n = _mm_cmpnge_ss(a.n, b.n);
558 #elif defined(SIMDE_SSE_NEON)
560 vreinterpretq_f32_u32(vcltq_f32(a.neon_f32, b.neon_f32));
561 float32x4_t t = vextq_f32(a.neon_f32, s, 1);
562 r.neon_f32 = vextq_f32(t, t, 3);
575 #if defined(SIMDE_SSE_NATIVE)
576 r.n = _mm_cmpngt_ps(a.n, b.n);
577 #elif defined(SIMDE_SSE_NEON)
578 r.neon_u32 = vcleq_f32(a.neon_f32, b.neon_f32);
591 #if defined(SIMDE_SSE_NATIVE) && !defined(__PGI)
592 r.n = _mm_cmpngt_ss(a.n, b.n);
593 #elif defined(SIMDE_SSE_NEON)
595 vreinterpretq_f32_u32(vcleq_f32(a.neon_f32, b.neon_f32));
596 float32x4_t t = vextq_f32(a.neon_f32, s, 1);
597 r.neon_f32 = vextq_f32(t, t, 3);
610 #if defined(SIMDE_SSE_NATIVE)
611 r.n = _mm_cmpnle_ps(a.n, b.n);
612 #elif defined(SIMDE_SSE_NEON)
613 r.neon_u32 = vcgtq_f32(a.neon_f32, b.neon_f32);
626 #if defined(SIMDE_SSE_NATIVE)
627 r.n = _mm_cmpnle_ss(a.n, b.n);
628 #elif defined(SIMDE_SSE_NEON)
630 vreinterpretq_f32_u32(vcgtq_f32(a.neon_f32, b.neon_f32));
631 float32x4_t t = vextq_f32(a.neon_f32, s, 1);
632 r.neon_f32 = vextq_f32(t, t, 3);
645 #if defined(SIMDE_SSE_NATIVE)
646 r.n = _mm_cmpnlt_ps(a.n, b.n);
647 #elif defined(SIMDE_SSE_NEON)
648 r.neon_u32 = vcgeq_f32(a.neon_f32, b.neon_f32);
661 #if defined(SIMDE_SSE_NATIVE)
662 r.n = _mm_cmpnlt_ss(a.n, b.n);
675 #if defined(SIMDE_SSE_NATIVE)
676 r.n = _mm_cmpord_ps(a.n, b.n);
677 #elif defined(SIMDE_SSE_NEON)
681 uint32x4_t ceqaa = vceqq_f32(a.neon_f32, a.neon_f32);
682 uint32x4_t ceqbb = vceqq_f32(b.neon_f32, b.neon_f32);
683 r.neon_u32 = vandq_u32(ceqaa, ceqbb);
686 for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
687 r.u32[i] = (isnan(a.f32[i]) || isnan(b.f32[i])) ? 0 : 0xffffffff;
700 #if defined(SIMDE_SSE_NATIVE)
701 r.n = _mm_cmpord_ss(a.n, b.n);
702 #elif defined(SIMDE_SSE_NEON)
703 uint32x4_t ceqaa = vceqq_f32(a.neon_f32, a.neon_f32);
704 uint32x4_t ceqbb = vceqq_f32(b.neon_f32, b.neon_f32);
705 float32x4_t s = vreinterpretq_f32_u32(vandq_u32(ceqaa, ceqbb));
706 float32x4_t t = vextq_f32(a.neon_f32, s, 1);
707 r.neon_f32 = vextq_f32(t, t, 3);
708 #elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION)
709 r.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.f32,
712 r.u32[0] = (isnan(a.f32[0]) || isnan(b.f32[0])) ? 0 : 0xffffffff;
714 for (size_t i = 1; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
727 #if defined(SIMDE_SSE_NATIVE)
728 r.n = _mm_cmpunord_ps(a.n, b.n);
731 for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
732 r.u32[i] = (isnan(a.f32[i]) || isnan(b.f32[i])) ? 0xffffffff : 0;
745 #if defined(SIMDE_SSE_NATIVE) && !defined(__PGI)
746 r.n = _mm_cmpunord_ss(a.n, b.n);
747 #elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION)
748 r.f32 = SIMDE__SHUFFLE_VECTOR(
751 r.u32[0] = (isnan(a.f32[0]) || isnan(b.f32[0])) ? 0xffffffff : 0;
753 for (size_t i = 1; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
764 #if defined(SIMDE_SSE_NATIVE)
765 return _mm_comieq_ss(a.n, b.n);
766 #elif defined(SIMDE_SSE_NEON)
767 uint32x4_t a_not_nan = vceqq_f32(a.neon_f32, a.neon_f32);
768 uint32x4_t b_not_nan = vceqq_f32(b.neon_f32, b.neon_f32);
769 uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
770 uint32x4_t a_eq_b = vceqq_f32(a.neon_f32, b.neon_f32);
771 return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_eq_b), 0) != 0) ? 1 : 0;
773 return a.f32[0] == b.f32[0];
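/* The comi* family returns an int rather than a mask.  Note the asymmetry in
 * the NEON branches: eq/le/lt OR in the "either operand is NaN" mask, while
 * ge/gt/neq AND with the "both operands are not NaN" mask, which matches how
 * the COMISS flag results decode for unordered inputs (eq/le/lt report 1 on
 * NaN, ge/gt/neq report 0). */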
780 #if defined(SIMDE_SSE_NATIVE)
781 return _mm_comige_ss(a.n, b.n);
782 #elif defined(SIMDE_SSE_NEON)
783 uint32x4_t a_not_nan = vceqq_f32(a.neon_f32, a.neon_f32);
784 uint32x4_t b_not_nan = vceqq_f32(b.neon_f32, b.neon_f32);
785 uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
786 uint32x4_t a_ge_b = vcgeq_f32(a.neon_f32, b.neon_f32);
787 return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0) ? 1 : 0;
790 return a.f32[0] >= b.f32[0];
797 #if defined(SIMDE_SSE_NATIVE)
798 return _mm_comigt_ss(a.n, b.n);
799 #elif defined(SIMDE_SSE_NEON)
800 uint32x4_t a_not_nan = vceqq_f32(a.neon_f32, a.neon_f32);
801 uint32x4_t b_not_nan = vceqq_f32(b.neon_f32, b.neon_f32);
802 uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
803 uint32x4_t a_gt_b = vcgtq_f32(a.neon_f32, b.neon_f32);
804 return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0) ? 1 : 0;
807 return a.f32[0] > b.f32[0];
814 #if defined(SIMDE_SSE_NATIVE)
815 return _mm_comile_ss(a.n, b.n);
816 #elif defined(SIMDE_SSE_NEON)
817 uint32x4_t a_not_nan = vceqq_f32(a.neon_f32, a.neon_f32);
818 uint32x4_t b_not_nan = vceqq_f32(b.neon_f32, b.neon_f32);
819 uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
820 uint32x4_t a_le_b = vcleq_f32(a.neon_f32, b.neon_f32);
821 return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_le_b), 0) != 0) ? 1 : 0;
823 return a.f32[0] <= b.f32[0];
830 #if defined(SIMDE_SSE_NATIVE)
831 return _mm_comilt_ss(a.n, b.n);
832 #elif defined(SIMDE_SSE_NEON)
833 uint32x4_t a_not_nan = vceqq_f32(a.neon_f32, a.neon_f32);
834 uint32x4_t b_not_nan = vceqq_f32(b.neon_f32, b.neon_f32);
835 uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
836 uint32x4_t a_lt_b = vcltq_f32(a.neon_f32, b.neon_f32);
837 return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_lt_b), 0) != 0) ? 1 : 0;
839 return a.f32[0] < b.f32[0];
846 #if defined(SIMDE_SSE_NATIVE)
847 return _mm_comineq_ss(a.n, b.n);
848 #elif defined(SIMDE_SSE_NEON)
849 uint32x4_t a_not_nan = vceqq_f32(a.neon_f32, a.neon_f32);
850 uint32x4_t b_not_nan = vceqq_f32(b.neon_f32, b.neon_f32);
851 uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
852 uint32x4_t a_neq_b = vmvnq_u32(vceqq_f32(a.neon_f32, b.neon_f32));
853 return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_neq_b), 0) != 0) ? 1 : 0;
857 return a.f32[0] != b.f32[0];
866 #if defined(SIMDE_SSE_NATIVE)
867 r.n = _mm_cvt_pi2ps(a.n, b.n);
883 #if defined(SIMDE_SSE_NATIVE)
884 r.n = _mm_cvt_ps2pi(a.n);
887 for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
888 r.i32[i] = (int32_t)a.f32[i];
900 #if defined(SIMDE_SSE_NATIVE)
901 r.n = _mm_cvt_si2ss(a.n, b);
915 #if defined(SIMDE_SSE_NATIVE)
916 return _mm_cvt_ss2si(a.n);
918 return (int32_t)a.f32[0];
927 #if defined(SIMDE_SSE_NATIVE)
928 r.n = _mm_cvtpi16_ps(a.n);
931 for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
944 #if defined(SIMDE_SSE_NATIVE)
945 r.n = _mm_cvtpi32_ps(a.n, b.n);
961 #if defined(SIMDE_SSE_NATIVE)
962 r.n = _mm_cvtpi32x2_ps(a.n, b.n);
978 #if defined(SIMDE_SSE_NATIVE)
979 r.n = _mm_cvtpi8_ps(a.n);
995 #if defined(SIMDE_SSE_NATIVE)
996 r.n = _mm_cvtps_pi16(a.n);
999 for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
1000 r.i16[i] = (int16_t)a.f32[i];
1012 #if defined(SIMDE_SSE_NATIVE)
1013 r.n = _mm_cvtps_pi32(a.n);
1016 for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
1017 r.i32[i] = (int32_t)a.f32[i];
1029 #if defined(SIMDE_SSE_NATIVE)
1030 r.n = _mm_cvtps_pi8(a.n);
1033 for (size_t i = 0; i < (sizeof(a.f32) / sizeof(a.f32[0])); i++) {
1034 r.i8[i] = (int8_t)a.f32[i];
1046 #if defined(SIMDE_SSE_NATIVE)
1047 r.n = _mm_cvtpu16_ps(a.n);
1050 for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
1063 #if defined(SIMDE_SSE_NATIVE)
1064 r.n = _mm_cvtpu8_ps(a.n);
1067 for (size_t i = 0; i < 4; i++) {
1080 #if defined(SIMDE_SSE_NATIVE)
1081 r.n = _mm_cvtsi32_ss(a.n, b);
1085 for (size_t i = 1; i < 4; i++) {
1086 r.i32[i] = a.i32[i];
1098 #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_AMD64)
1100 r.n = _mm_cvtsi64_ss(a.n, b);
1102 r.n = _mm_cvtsi64x_ss(a.n, b);
1107 for (size_t i = 1; i < 4; i++) {
1108 r.i32[i] = a.i32[i];
1118 #if defined(SIMDE_SSE_NATIVE)
1119 return _mm_cvtss_f32(a.n);
1120 #elif defined(SIMDE_SSE_NEON)
1121 return vgetq_lane_f32(a.neon_f32, 0);
1130 #if defined(SIMDE_SSE_NATIVE)
1131 return _mm_cvtss_si32(a.n);
1133 return (int32_t)a.f32[0];
1140 #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_AMD64)
1142 return _mm_cvtss_si64(a.n);
1144 return _mm_cvtss_si64x(a.n);
1147 return (int64_t)a.f32[0];
1156 #if defined(SIMDE_SSE_NATIVE)
1157 r.n = _mm_cvtt_ps2pi(a.n);
1160 for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
1161 r.i32[i] = (int32_t)truncf(a.f32[i]);
1171 #if defined(SIMDE_SSE_NATIVE)
1172 return _mm_cvtt_ss2si(a.n);
1174 return (int32_t)truncf(a.f32[0]);
1183 #if defined(SIMDE_SSE_NATIVE)
1184 r.n = _mm_cvttps_pi32(a.n);
1195 #if defined(SIMDE_SSE_NATIVE)
1196 return _mm_cvttss_si32(a.n);
1198 return (int32_t)truncf(a.f32[0]);
1205 #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_AMD64)
1207 return _mm_cvttss_si64x(a.n);
1209 return _mm_cvttss_si64(a.n);
1212 return (int64_t)truncf(a.f32[0]);
1221 #if defined(SIMDE_SSE_NATIVE)
1222 r.n = _mm_div_ps(a.n, b.n);
1223 #elif defined(SIMDE_SSE_NEON)
1224 float32x4_t recip0 = vrecpeq_f32(b.neon_f32);
1225 float32x4_t recip1 = vmulq_f32(recip0, vrecpsq_f32(recip0, b.neon_f32));
1226 r.neon_f32 = vmulq_f32(a.neon_f32, recip1);
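/* The NEON instruction set targeted here has no vector divide, so the
 * quotient is computed as a * (1/b): vrecpeq_f32 gives a rough reciprocal
 * estimate and one vrecpsq_f32 Newton-Raphson step refines it before the
 * multiply.  The result is therefore an approximation rather than a
 * correctly-rounded division. */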
1229 for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
1230 r.f32[i] = a.f32[i] / b.f32[i];
1242 #if defined(SIMDE_SSE_NATIVE)
1243 r.n = _mm_div_ss(a.n, b.n);
1244 #elif defined(SIMDE_SSE_NEON)
1246 r.neon_f32 = vsetq_lane_f32(value, a.neon_f32, 0);
1248 r.f32[0] = a.f32[0] / b.f32[0];
1250 for (size_t i = 1; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
1251 r.f32[i] = a.f32[i];
1263 #if defined(SIMDE_SSE_NATIVE)
1264 #define simde_mm_extract_pi16(a, imm8) _mm_extract_pi16(a.n, imm8)
1266 #define simde_m_pextrw(a, imm8) simde_mm_extract_pi16(a, imm8)
1269 #if defined(SIMDE_SSE_NATIVE)
1276 #if defined(FE_TONEAREST)
1282 #if defined(FE_DOWNWARD)
1288 #if defined(FE_UPWARD)
1294 #if defined(FE_TOWARDZERO)
1303 #if defined(SIMDE_SSE_NATIVE)
1304 return _MM_GET_ROUNDING_MODE();
1306 return fegetround();
1313 #if defined(SIMDE_SSE_NATIVE)
1314 _MM_SET_ROUNDING_MODE(a);
1328 #if defined(SIMDE_SSE_NATIVE) && !defined(__PGI)
1329 #define simde_mm_insert_pi16(a, i, imm8) \
1330 SIMDE__M64_C(_mm_insert_pi16((a).n, i, imm8))
1332 #define simde_m_pinsrw(a, i, imm8) \
1333 simde_mm_insert_pi16((a), (i), (imm8))
1343 #if defined(SIMDE_SSE_NATIVE)
1344 r.n = _mm_load_ps(mem_addr);
1345 #elif defined(SIMDE_SSE_NEON)
1346 r.neon_f32 = vld1q_f32(mem_addr);
1348 memcpy(&r, mem_addr, sizeof(r.f32));
1359 #if defined(SIMDE_SSE_NATIVE)
1360 r.n = _mm_load_ps1(mem_addr);
1364 for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
1377 #if defined(SIMDE_SSE_NATIVE)
1378 r.n = _mm_load_ss(mem_addr);
1379 #elif defined(SIMDE_SSE_NEON)
1380 r.neon_f32 = vsetq_lane_f32(*mem_addr, vdupq_n_f32(0), 0);
1382 r.f32[0] = *mem_addr;
1396 #if defined(SIMDE_SSE_NATIVE)
1397 r.n = _mm_load1_ps(mem_addr);
1398 #elif defined(SIMDE_SSE_NEON)
1399 r.neon_f32 = vld1q_dup_f32(mem_addr);
1412 #if defined(SIMDE_SSE_NATIVE)
1413 r.n = _mm_loadh_pi(a.n, (__m64 *)mem_addr);
1415 r.f32[0] = a.f32[0];
1416 r.f32[1] = a.f32[1];
1417 r.f32[2] = mem_addr->f32[0];
1418 r.f32[3] = mem_addr->f32[1];
1429 #if defined(SIMDE_SSE_NATIVE)
1430 r.n = _mm_loadl_pi(a.n, (__m64 *)mem_addr);
1432 r.f32[0] = mem_addr->f32[0];
1433 r.f32[1] = mem_addr->f32[1];
1434 r.f32[2] = a.f32[2];
1435 r.f32[3] = a.f32[3];
1449 #if defined(SIMDE_SSE_NATIVE)
1450 r.n = _mm_loadr_ps(mem_addr);
1452 r.f32[0] = mem_addr[3];
1453 r.f32[1] = mem_addr[2];
1454 r.f32[2] = mem_addr[1];
1455 r.f32[3] = mem_addr[0];
1467 #if defined(SIMDE_SSE_NATIVE)
1468 r.n = _mm_loadu_ps(mem_addr);
1469 #elif defined(SIMDE_SSE_NEON)
1470 r.neon_f32 = vld1q_f32(mem_addr);
1472 r.f32[0] = mem_addr[0];
1473 r.f32[1] = mem_addr[1];
1474 r.f32[2] = mem_addr[2];
1475 r.f32[3] = mem_addr[3];
1484 #if defined(SIMDE_SSE_NATIVE)
1485 _mm_maskmove_si64(a.n, mask.n, mem_addr);
1488 for (size_t i = 0; i < (sizeof(a.i8) / sizeof(a.i8[0])); i++)
1490 mem_addr[i] = a.i8[i];
1493 #define simde_m_maskmovq(a, mask, mem_addr) \
1494 simde_mm_maskmove_si64(a, mask, mem_addr)
1501 #if defined(SIMDE_SSE_NATIVE)
1502 r.n = _mm_max_pi16(a.n, b.n);
1505 for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
1512 #define simde_m_pmaxsw(a, b) simde_mm_max_pi16(a, b)
1519 #if defined(SIMDE_SSE_NATIVE)
1520 r.n = _mm_max_ps(a.n, b.n);
1521 #elif defined(SIMDE_SSE_NEON)
1522 r.neon_f32 = vmaxq_f32(a.neon_f32, b.neon_f32);
1525 for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
1526 r.f32[i] = (a.f32[i] > b.f32[i]) ? a.f32[i] : b.f32[i];
1538 #if defined(SIMDE_SSE_NATIVE)
1539 r.n = _mm_max_pu8(a.n, b.n);
1542 for (size_t i = 0; i < (sizeof(r.u8) / sizeof(r.u8[0])); i++) {
1543 r.u8[i] = (a.u8[i] > b.u8[i]) ? a.u8[i] : b.u8[i];
1549 #define simde_m_pmaxub(a, b) simde_mm_max_pu8(a, b)
1556 #if defined(SIMDE_SSE_NATIVE)
1557 r.n = _mm_max_ss(a.n, b.n);
1558 #elif defined(SIMDE_SSE_NEON)
1559 float32_t value = vgetq_lane_f32(vmaxq_f32(a.neon_f32, b.neon_f32), 0);
1560 r.neon_f32 = vsetq_lane_f32(value, a.neon_f32, 0);
1562 r.f32[0] = (a.f32[0] > b.f32[0]) ? a.f32[0] : b.f32[0];
1563 r.f32[1] = a.f32[1];
1564 r.f32[2] = a.f32[2];
1565 r.f32[3] = a.f32[3];
1576 #if defined(SIMDE_SSE_NATIVE)
1577 r.n = _mm_min_pi16(a.n, b.n);
1580 for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
1587 #define simde_m_pminsw(a, b) simde_mm_min_pi16(a, b)
1594 #if defined(SIMDE_SSE_NATIVE)
1595 r.n = _mm_min_ps(a.n, b.n);
1596 #elif defined(SIMDE_SSE_NEON)
1597 r.neon_f32 = vminq_f32(a.neon_f32, b.neon_f32);
1600 for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
1601 r.f32[i] = (a.f32[i] < b.f32[i]) ? a.f32[i] : b.f32[i];
1613 #if defined(SIMDE_SSE_NATIVE)
1614 r.n = _mm_min_pu8(a.n, b.n);
1617 for (size_t i = 0; i < (sizeof(r.u8) / sizeof(r.u8[0])); i++) {
1618 r.u8[i] = (a.u8[i] < b.u8[i]) ? a.u8[i] : b.u8[i];
1624 #define simde_m_pminub(a, b) simde_mm_min_pu8(a, b)
1631 #if defined(SIMDE_SSE_NATIVE)
1632 r.n = _mm_min_ss(a.n, b.n);
1633 #elif defined(SIMDE_SSE_NEON)
1634 float32_t value = vgetq_lane_f32(vminq_f32(a.neon_f32, b.neon_f32), 0);
1635 r.neon_f32 = vsetq_lane_f32(value, a.neon_f32, 0);
1637 r.f32[0] = (a.f32[0] < b.f32[0]) ? a.f32[0] : b.f32[0];
1638 r.f32[1] = a.f32[1];
1639 r.f32[2] = a.f32[2];
1640 r.f32[3] = a.f32[3];
1651 #if defined(SIMDE_SSE_NATIVE)
1652 r.n = _mm_move_ss(a.n, b.n);
1654 r.f32[0] = b.f32[0];
1655 r.f32[1] = a.f32[1];
1656 r.f32[2] = a.f32[2];
1657 r.f32[3] = a.f32[3];
1668 #if defined(SIMDE_SSE_NATIVE)
1669 r.n = _mm_movehl_ps(a.n, b.n);
1671 r.f32[0] = b.f32[2];
1672 r.f32[1] = b.f32[3];
1673 r.f32[2] = a.f32[2];
1674 r.f32[3] = a.f32[3];
1685 #if defined(SIMDE_SSE_NATIVE)
1686 r.n = _mm_movelh_ps(a.n, b.n);
1688 r.f32[0] = a.f32[0];
1689 r.f32[1] = a.f32[1];
1690 r.f32[2] = b.f32[0];
1691 r.f32[3] = b.f32[1];
1700 #if defined(SIMDE_SSE_NATIVE)
1701 return _mm_movemask_pi8(a.n);
1704 const size_t nmemb = sizeof(a.i8) / sizeof(a.i8[0]);
1707 for (size_t i = 0; i < nmemb; i++) {
1708 r |= (a.u8[nmemb - 1 - i] >> 7) << (nmemb - 1 - i);
1714 #define simde_m_pmovmskb(a) simde_mm_movemask_pi8(a)
1719 #if defined(SIMDE_SSE_NATIVE)
1720 return _mm_movemask_ps(a.n);
1721 #elif defined(SIMDE_SSE_NEON)
1723 static const uint32x4_t movemask = {1, 2, 4, 8};
1724 static const uint32x4_t highbit = {0x80000000, 0x80000000, 0x80000000,
1726 uint32x4_t t0 = a.neon_u32;
1727 uint32x4_t t1 = vtstq_u32(t0, highbit);
1728 uint32x4_t t2 = vandq_u32(t1, movemask);
1729 uint32x2_t t3 = vorr_u32(vget_low_u32(t2), vget_high_u32(t2));
1730 return vget_lane_u32(t3, 0) | vget_lane_u32(t3, 1);
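/* NEON lacks a movemask instruction, so the sign bits are gathered manually:
 * vtstq_u32 against 0x80000000 turns each sign bit into an all-ones lane,
 * the AND with {1, 2, 4, 8} assigns each lane its bit position, and the
 * OR-reduction collapses the four lanes into the final 4-bit mask. */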
1735 for (size_t i = 0; i < sizeof(a.u32) / sizeof(a.u32[0]); i++) {
1736 r |= (a.u32[i] >> ((sizeof(a.u32[i]) * CHAR_BIT) - 1)) << i;
1748 #if defined(SIMDE_SSE_NATIVE)
1749 r.n = _mm_mul_ps(a.n, b.n);
1750 #elif defined(SIMDE_SSE_NEON)
1751 r.neon_f32 = vmulq_f32(a.neon_f32, b.neon_f32);
1754 for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
1755 r.f32[i] = a.f32[i] * b.f32[i];
1767 #if defined(SIMDE_SSE_NATIVE)
1768 r.n = _mm_mul_ss(a.n, b.n);
1770 r.f32[0] = a.f32[0] * b.f32[0];
1771 r.f32[1] = a.f32[1];
1772 r.f32[2] = a.f32[2];
1773 r.f32[3] = a.f32[3];
1784 #if defined(SIMDE_SSE_NATIVE)
1785 r.n = _mm_mulhi_pu16(a.n, b.n);
1788 for (size_t i = 0; i < (sizeof(r.u16) / sizeof(r.u16[0])); i++) {
1795 #define simde_m_pmulhuw(a, b) simde_mm_mulhi_pu16(a, b)
1802 #if defined(SIMDE_SSE_NATIVE)
1803 r.n = _mm_or_ps(a.n, b.n);
1804 #elif defined(SIMDE_SSE_NEON)
1805 r.neon_i32 = vorrq_s32(a.neon_i32, b.neon_i32);
1808 for (size_t i = 0; i < (sizeof(r.u32) / sizeof(r.u32[0])); i++) {
1809 r.u32[i] = a.u32[i] | b.u32[i];
1822 #if defined(SIMDE_SSE_NATIVE)
1823 #define simde_mm_prefetch(p, i) _mm_prefetch(p, i)
1831 #if defined(SIMDE_SSE_NATIVE)
1832 r.n = _mm_rcp_ps(a.n);
1833 #elif defined(SIMDE_SSE_NEON)
1834 float32x4_t recip = vrecpeq_f32(a.neon_f32);
1836 #if !defined(SIMDE_MM_RCP_PS_ITERS)
1837 #define SIMDE_MM_RCP_PS_ITERS SIMDE_ACCURACY_ITERS
1840 for (int i = 0; i < SIMDE_MM_RCP_PS_ITERS; ++i) {
1841 recip = vmulq_f32(recip, vrecpsq_f32(recip, a.neon_f32));
1847 for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
1848 r.f32[i] = 1.0f / a.f32[i];
1860 #if defined(SIMDE_SSE_NATIVE)
1861 r.n = _mm_rcp_ss(a.n);
1863 r.f32[0] = 1.0f / a.f32[0];
1864 r.f32[1] = a.f32[1];
1865 r.f32[2] = a.f32[2];
1866 r.f32[3] = a.f32[3];
1877 #if defined(SIMDE_SSE_NATIVE)
1878 r.n = _mm_rsqrt_ps(a.n);
1879 #elif defined(SIMDE_SSE_NEON)
1880 r.neon_f32 = vrsqrteq_f32(a.neon_f32);
1881 #elif defined(__STDC_IEC_559__)
1884 for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
1885 r.i32[i] = INT32_C(0x5f3759df) - (a.i32[i] >> 1);
1887 #if SIMDE_ACCURACY_ITERS > 2
1891 (half * r.f32[i] * r.f32[i]);
1896 for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
1897 r.f32[i] = 1.0f / sqrtf(a.f32[i]);
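/* When IEEE-754 layout is guaranteed (__STDC_IEC_559__), the branch above
 * uses the classic "fast inverse square root": the 0x5f3759df magic constant
 * minus half the bit pattern gives an initial guess, which the
 * SIMDE_ACCURACY_ITERS > 2 path then refines with Newton-Raphson steps of
 * the usual r = r * (1.5 - half * r * r) form; otherwise 1.0f / sqrtf() is
 * used directly. */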
1909 #if defined(SIMDE_SSE_NATIVE)
1910 r.n = _mm_rsqrt_ss(a.n);
1911 #elif defined(__STDC_IEC_559__)
1913 r.i32[0] = INT32_C(0x5f3759df) - (a.i32[0] >> 1);
1915 #if SIMDE_ACCURACY_ITERS > 2
1919 (half * r.f32[0] * r.f32[0]);
1922 r.f32[0] = 1.0f / sqrtf(a.f32[0]);
1923 r.f32[1] = a.f32[1];
1924 r.f32[2] = a.f32[2];
1925 r.f32[3] = a.f32[3];
1927 r.f32[0] = 1.0f / sqrtf(a.f32[0]);
1928 r.f32[1] = a.f32[1];
1929 r.f32[2] = a.f32[2];
1930 r.f32[3] = a.f32[3];
1941 #if defined(SIMDE_SSE_NATIVE)
1942 r.n = _mm_sad_pu8(a.n, b.n);
1947 for (size_t i = 0; i < (sizeof(r.u8) / sizeof(r.u8[0])); i++) {
1948 sum += (uint8_t)abs(a.u8[i] - b.u8[i]);
1959 #define simde_m_psadbw(a, b) simde_mm_sad_pu8(a, b)
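/* _mm_sad_pu8 (PSADBW) accumulates the absolute differences of the eight
 * byte lanes into a single sum stored in the low 16-bit word of the result
 * with the upper words zeroed, which is what the scalar loop above
 * reproduces. */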
1967 #if defined(SIMDE_SSE_NATIVE)
1968 r.n = _mm_set_ps(e3, e2, e1, e0);
1969 #elif defined(SIMDE_SSE_NEON)
1971 r.neon_f32 = vld1q_f32(data);
1987 #if defined(SIMDE_SSE_NATIVE)
1988 r.n = _mm_set1_ps(a);
1989 #elif defined(SIMDE_SSE_NEON)
1990 r.neon_f32 = vdupq_n_f32(a);
1997 #define simde_mm_set1_ps(a) simde_mm_set_ps1(a)
2004 #if defined(SIMDE_SSE_NATIVE)
2005 r.n = _mm_set_ss(a);
2019 #if defined(SIMDE_SSE_NATIVE)
2020 r.n = _mm_setr_ps(e3, e2, e1, e0);
2021 #elif defined(SIMDE_SSE_NEON)
2023 r.neon_f32 = vld1q_f32(data);
2036 #if defined(SIMDE_SSE_NATIVE)
2037 r.n = _mm_setzero_ps();
2038 #elif defined(SIMDE_SSE_NEON)
2039 r.neon_f32 = vdupq_n_f32(0.0f);
2051 #if defined(SIMDE_SSE_NATIVE)
2053 #elif defined(__GNUC__) && \
2054 ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7))
2055 __atomic_thread_fence(__ATOMIC_SEQ_CST);
2056 #elif !defined(__INTEL_COMPILER) && defined(__STDC_VERSION__) && \
2057 (__STDC_VERSION__ >= 201112L) && !defined(__STDC_NO_ATOMICS__)
2058 #if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ < 9)
2059 __atomic_thread_fence(__ATOMIC_SEQ_CST);
2061 atomic_thread_fence(memory_order_seq_cst);
2063 #elif defined(_MSC_VER)
2065 #elif defined(__GNUC__) && \
2066 ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7))
2067 __atomic_thread_fence(__ATOMIC_SEQ_CST);
2068 #elif HEDLEY_CLANG_HAS_FEATURE(c_atomic)
2069 __c11_atomic_thread_fence(__ATOMIC_SEQ_CST);
2070 #elif defined(__GNUC__) && \
2071 ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1))
2072 __sync_synchronize();
2073 #elif (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x5140)) || \
2074 (defined(__SUNPRO_CC) && (__SUNPRO_CC >= 0x5140))
2075 __atomic_thread_fence(__ATOMIC_SEQ_CST);
2076 #elif defined(_OPENMP)
2077 #pragma omp critical(simde_mm_sfence_)
2083 #define SIMDE_MM_SHUFFLE(z, y, x, w) \
2084 (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
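/* SIMDE_MM_SHUFFLE packs four 2-bit lane selectors into one immediate,
 * most-significant selector first, exactly like _MM_SHUFFLE.  A small usage
 * sketch (illustrative only; it relies on simde_mm_set_ps and
 * simde_mm_shuffle_ps defined elsewhere in this header):
 *
 *   simde__m128 v = simde_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
 *   simde__m128 w = simde_mm_shuffle_ps(v, v, SIMDE_MM_SHUFFLE(0, 1, 2, 3));
 *
 * set_ps takes its arguments high-lane-first, so v.f32 is { 1, 2, 3, 4 };
 * SIMDE_MM_SHUFFLE(0, 1, 2, 3) == 0x1b selects lanes 3, 2 from the first
 * operand and lanes 1, 0 from the second, so w.f32 comes out reversed as
 * { 4, 3, 2, 1 }. */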
2090 for (size_t i = 0; i < sizeof(r.u16) / sizeof(r.u16[0]); i++) {
2091 r.i16[i] = a.i16[(imm8 >> (i * 2)) & 3];
2095 #if defined(SIMDE_SSE_NATIVE) && !defined(__PGI)
2096 #define simde_mm_shuffle_pi16(a, imm8) SIMDE__M64_C(_mm_shuffle_pi16(a.n, imm8))
2097 #elif defined(SIMDE__SHUFFLE_VECTOR)
2098 #define simde_mm_shuffle_pi16(a, imm8) \
2100 const simde__m64 simde__tmp_a_ = a; \
2101 (simde__m64){.i16 = SIMDE__SHUFFLE_VECTOR( \
2102 16, 8, (simde__tmp_a_).i16, \
2103 (simde__tmp_a_).i16, (((imm8)) & 3), \
2104 (((imm8) >> 2) & 3), (((imm8) >> 4) & 3), \
2105 (((imm8) >> 6) & 3))}; \
2109 #if defined(SIMDE_SSE_NATIVE) && !defined(__PGI)
2110 #define simde_m_pshufw(a, imm8) SIMDE__M64_C(_m_pshufw(a.n, imm8))
2112 #define simde_m_pshufw(a, imm8) simde_mm_shuffle_pi16(a, imm8)
2119 r.f32[0] = a.f32[(imm8 >> 0) & 3];
2120 r.f32[1] = a.f32[(imm8 >> 2) & 3];
2121 r.f32[2] = b.f32[(imm8 >> 4) & 3];
2122 r.f32[3] = b.f32[(imm8 >> 6) & 3];
2125 #if defined(SIMDE_SSE_NATIVE) && !defined(__PGI)
2126 #define simde_mm_shuffle_ps(a, b, imm8) \
2127 SIMDE__M128_C(_mm_shuffle_ps(a.n, b.n, imm8))
2128 #elif defined(SIMDE__SHUFFLE_VECTOR)
2129 #define simde_mm_shuffle_ps(a, b, imm8) \
2131 (simde__m128){.f32 = SIMDE__SHUFFLE_VECTOR( \
2132 32, 16, (a).f32, (b).f32, \
2133 (((imm8)) & 3), (((imm8) >> 2) & 3), \
2134 (((imm8) >> 4) & 3) + 4, \
2135 (((imm8) >> 6) & 3) + 4)}; \
2144 #if defined(SIMDE_SSE_NATIVE)
2145 r.n = _mm_sqrt_ps(a.n);
2146 #elif defined(SIMDE_SSE_NEON)
2147 float32x4_t recipsq = vrsqrteq_f32(a.neon_f32);
2148 float32x4_t sq = vrecpeq_f32(recipsq);
2153 for (size_t i = 0; i < sizeof(r.f32) / sizeof(r.f32[0]); i++) {
2154 r.f32[i] = sqrtf(a.f32[i]);
2166 #if defined(SIMDE_SSE_NATIVE)
2167 r.n = _mm_sqrt_ss(a.n);
2168 #elif defined(SIMDE_SSE_NEON)
2170 r.neon_f32 = vsetq_lane_f32(value, a.neon_f32, 0);
2172 r.f32[0] = sqrtf(a.f32[0]);
2173 r.f32[1] = a.f32[1];
2174 r.f32[2] = a.f32[2];
2175 r.f32[3] = a.f32[3];
2186 #if defined(SIMDE_SSE_NATIVE)
2187 _mm_store_ps(mem_addr, a.n);
2188 #elif defined(SIMDE_SSE_NEON)
2189 vst1q_f32(mem_addr, a.neon_f32);
2192 for (size_t i = 0; i < sizeof(a.f32) / sizeof(a.f32[0]); i++) {
2193 mem_addr[i] = a.f32[i];
2203 #if defined(SIMDE_SSE_NATIVE)
2204 _mm_store_ps1(mem_addr, a.n);
2207 for (size_t i = 0; i < sizeof(a.f32) / sizeof(a.f32[0]); i++) {
2208 mem_addr[i] = a.f32[0];
2216 #if defined(SIMDE_SSE_NATIVE)
2217 _mm_store_ss(mem_addr, a.n);
2218 #elif defined(SIMDE_SSE_NEON)
2219 vst1q_lane_f32(mem_addr, a.neon_f32, 0);
2221 *mem_addr = a.f32[0];
2230 #if defined(SIMDE_SSE_NATIVE)
2231 _mm_store1_ps(mem_addr, a.n);
2240 #if defined(SIMDE_SSE_NATIVE)
2241 _mm_storeh_pi(&(mem_addr->n), a.n);
2243 mem_addr->f32[0] = a.f32[2];
2244 mem_addr->f32[1] = a.f32[3];
2251 #if defined(SIMDE_SSE_NATIVE)
2252 _mm_storel_pi(&(mem_addr->n), a.n);
2254 mem_addr->f32[0] = a.f32[0];
2255 mem_addr->f32[1] = a.f32[1];
2264 #if defined(SIMDE_SSE_NATIVE)
2265 _mm_storer_ps(mem_addr, a.n);
2268 for (size_t i = 0; i < sizeof(a.f32) / sizeof(a.f32[0]); i++) {
2270 a.f32[((sizeof(a.f32) / sizeof(a.f32[0])) - 1) - i];
2278 #if defined(SIMDE_SSE_NATIVE)
2279 _mm_storeu_ps(mem_addr, a.n);
2280 #elif defined(SIMDE_SSE_NEON)
2281 vst1q_f32(mem_addr, a.neon_f32);
2284 for (size_t i = 0; i < sizeof(a.f32) / sizeof(a.f32[0]); i++) {
2285 mem_addr[i] = a.f32[i];
2295 #if defined(SIMDE_SSE_NATIVE)
2296 r.n = _mm_sub_ps(a.n, b.n);
2297 #elif defined(SIMDE_SSE_NEON)
2298 r.neon_f32 = vsubq_f32(a.neon_f32, b.neon_f32);
2301 for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
2302 r.f32[i] = a.f32[i] - b.f32[i];
2314 #if defined(SIMDE_SSE_NATIVE)
2315 r.n = _mm_sub_ss(a.n, b.n);
2317 r.f32[0] = a.f32[0] - b.f32[0];
2318 r.f32[1] = a.f32[1];
2319 r.f32[2] = a.f32[2];
2320 r.f32[3] = a.f32[3];
2329 #if defined(SIMDE_SSE_NATIVE)
2330 return _mm_ucomieq_ss(a.n, b.n);
2333 int x = feholdexcept(&envp);
2334 int r = a.f32[0] == b.f32[0];
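/* The ucomi* fallbacks save the floating-point environment with
 * feholdexcept() before comparing so that any exception flag raised by the
 * compare can be discarded afterwards; this mirrors UCOMISS, which unlike
 * COMISS does not signal invalid for quiet NaN operands. */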
2344 #if defined(SIMDE_SSE_NATIVE)
2345 return _mm_ucomige_ss(a.n, b.n);
2348 int x = feholdexcept(&envp);
2349 int r = a.f32[0] >= b.f32[0];
2359 #if defined(SIMDE_SSE_NATIVE)
2360 return _mm_ucomigt_ss(a.n, b.n);
2363 int x = feholdexcept(&envp);
2364 int r = a.f32[0] > b.f32[0];
2374 #if defined(SIMDE_SSE_NATIVE)
2375 return _mm_ucomile_ss(a.n, b.n);
2378 int x = feholdexcept(&envp);
2379 int r = a.f32[0] <= b.f32[0];
2389 #if defined(SIMDE_SSE_NATIVE)
2390 return _mm_ucomilt_ss(a.n, b.n);
2393 int x = feholdexcept(&envp);
2394 int r = a.f32[0] < b.f32[0];
2404 #if defined(SIMDE_SSE_NATIVE)
2405 return _mm_ucomineq_ss(a.n, b.n);
2408 int x = feholdexcept(&envp);
2409 int r = a.f32[0] != b.f32[0];
2416 #if defined(SIMDE_SSE_NATIVE)
2417 #if defined(__has_builtin)
2418 #if __has_builtin(__builtin_ia32_undef128)
2419 #define SIMDE__HAVE_UNDEFINED128
2421 #elif !defined(__PGI) && !defined(SIMDE_BUG_GCC_REV_208793)
2422 #define SIMDE__HAVE_UNDEFINED128
2431 #if defined(SIMDE__HAVE_UNDEFINED128)
2432 r.n = _mm_undefined_ps();
2445 #if defined(SIMDE_SSE_NATIVE)
2446 r.n = _mm_unpackhi_ps(a.n, b.n);
2447 #elif defined(SIMDE_SSE_NEON)
2448 float32x2_t a1 = vget_high_f32(a.neon_f32);
2449 float32x2_t b1 = vget_high_f32(b.neon_f32);
2450 float32x2x2_t result = vzip_f32(a1, b1);
2451 r.neon_f32 = vcombine_f32(result.val[0], result.val[1]);
2453 r.f32[0] = a.f32[2];
2454 r.f32[1] = b.f32[2];
2455 r.f32[2] = a.f32[3];
2456 r.f32[3] = b.f32[3];
2467 #if defined(SIMDE_SSE_NATIVE)
2468 r.n = _mm_unpacklo_ps(a.n, b.n);
2469 #elif defined(SIMDE_SSE_NEON)
2470 float32x2_t a1 = vget_low_f32(a.neon_f32);
2471 float32x2_t b1 = vget_low_f32(b.neon_f32);
2472 float32x2x2_t result = vzip_f32(a1, b1);
2473 r.neon_f32 = vcombine_f32(result.val[0], result.val[1]);
2475 r.f32[0] = a.f32[0];
2476 r.f32[1] = b.f32[0];
2477 r.f32[2] = a.f32[1];
2478 r.f32[3] = b.f32[1];
2489 #if defined(SIMDE_SSE_NATIVE)
2490 r.n = _mm_xor_ps(a.n, b.n);
2491 #elif defined(SIMDE_SSE_NEON)
2492 r.neon_i32 = veorq_s32(a.neon_i32, b.neon_i32);
2495 for (size_t i = 0; i < (sizeof(r.u32) / sizeof(r.u32[0])); i++) {
2496 r.u32[i] = a.u32[i] ^ b.u32[i];
2506 #if defined(SIMDE_SSE_NATIVE)
2507 _mm_stream_pi(&(mem_addr->n), a.n);
2509 mem_addr->i64[0] = a.i64[0];
2518 #if defined(SIMDE_SSE_NATIVE)
2519 _mm_stream_ps(mem_addr, a.n);
2522 memcpy(mem_addr, &a, sizeof(a));
2529 #if defined(SIMDE_SSE_NATIVE)
2530 return _mm_getcsr();
2533 int rounding_mode = fegetround();
2535 switch (rounding_mode) {
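/* The portable fallback maps between the C fenv rounding modes and the
 * MXCSR rounding-control field (bits 13-14 of the control/status register):
 * 0 = to nearest, 1 = downward, 2 = upward, 3 = toward zero.  That is why
 * the emulated _MM_SET_ROUNDING_MODE below switches on (a >> 13) & 3 before
 * calling fesetround(). */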
2556 #if defined(SIMDE_SSE_NATIVE)
2559 switch ((a >> 13) & 3) {
2561 fesetround(FE_TONEAREST);
2564 fesetround(FE_DOWNWARD);
2567 fesetround(FE_UPWARD);
2570 fesetround(FE_TOWARDZERO);
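/* SIMDE_MM_TRANSPOSE4_PS below mirrors the _MM_TRANSPOSE4_PS macro from
 * <xmmintrin.h>: the unpacklo/unpackhi pairs interleave rows 0-1 and rows
 * 2-3, and the movelh/movehl steps then gather matching halves so that on
 * exit row0..row3 hold the columns of the original 4x4 matrix. */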
2576 #define SIMDE_MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
2578 simde__m128 tmp3, tmp2, tmp1, tmp0; \
2579 tmp0 = simde_mm_unpacklo_ps((row0), (row1)); \
2580 tmp2 = simde_mm_unpacklo_ps((row2), (row3)); \
2581 tmp1 = simde_mm_unpackhi_ps((row0), (row1)); \
2582 tmp3 = simde_mm_unpackhi_ps((row2), (row3)); \
2583 row0 = simde_mm_movelh_ps(tmp0, tmp2); \
2584 row1 = simde_mm_movehl_ps(tmp2, tmp0); \
2585 row2 = simde_mm_movelh_ps(tmp1, tmp3); \
2586 row3 = simde_mm_movehl_ps(tmp3, tmp1); \