30 #if !defined(SIMDE__SSE2_H)
31 #if !defined(SIMDE__SSE2_H)
36 #if defined(SIMDE_SSE2_NATIVE)
37 #undef SIMDE_SSE2_NATIVE
39 #if defined(SIMDE_SSE2_FORCE_NATIVE)
40 #define SIMDE_SSE2_NATIVE
41 #elif defined(__SSE2__) && !defined(SIMDE_SSE2_NO_NATIVE) && \
42 !defined(SIMDE_NO_NATIVE)
43 #define SIMDE_SSE2_NATIVE
44 #elif defined(__ARM_NEON) && !defined(SIMDE_SSE2_NO_NEON) && \
45 !defined(SIMDE_NO_NEON)
46 #define SIMDE_SSE2_NEON
49 #if defined(SIMDE_SSE2_NATIVE) && !defined(SIMDE_SSE_NATIVE)
50 #if defined(SIMDE_SSE2_FORCE_NATIVE)
51 #error Native SSE2 support requires native SSE support
53 #warning Native SSE2 support requires native SSE support, disabling
54 #undef SIMDE_SSE2_NATIVE
56 #elif defined(SIMDE_SSE2_NEON) && !defined(SIMDE_SSE_NEON)
57 #warning SSE2 NEON support requires SSE NEON support, disabling
61 #if defined(SIMDE_SSE2_NATIVE)
62 #include <emmintrin.h>
64 #if defined(SIMDE_SSE2_NEON)
73 #define vreinterpretq_m128i_s32(v) \
74 (simde__m128i) { .neon_i32 = v }
75 #define vreinterpretq_m128i_u64(v) \
76 (simde__m128i) { .neon_u64 = v }
78 #define vreinterpretq_s32_m128i(a) a.neon_i32
79 #define vreinterpretq_u64_m128i(a) a.neon_u64
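/* simde__m128i and simde__m128d are unions: each member below is just a
 * differently typed view of the same 128 bits, so implementations can pick
 * whatever lane width they need without conversions. The neon_* members are
 * only present when the NEON fallback is compiled in. */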
84 #if defined(SIMDE__ENABLE_GCC_VEC_EXT)
85 int8_t i8 __attribute__((__vector_size__(16), __may_alias__));
86 int16_t i16 __attribute__((__vector_size__(16), __may_alias__));
87 int32_t i32 __attribute__((__vector_size__(16), __may_alias__));
88 int64_t i64 __attribute__((__vector_size__(16), __may_alias__));
89 uint8_t u8 __attribute__((__vector_size__(16), __may_alias__));
90 uint16_t u16 __attribute__((__vector_size__(16), __may_alias__));
91 uint32_t u32 __attribute__((__vector_size__(16), __may_alias__));
92 uint64_t u64 __attribute__((__vector_size__(16), __may_alias__));
93 #if defined(SIMDE__HAVE_INT128)
94 simde_int128 i128 __attribute__((__vector_size__(16), __may_alias__));
95 simde_uint128 u128 __attribute__((__vector_size__(16), __may_alias__));
97 simde_float32 f32 __attribute__((__vector_size__(16), __may_alias__));
98 simde_float64 f64 __attribute__((__vector_size__(16), __may_alias__));
108 #if defined(SIMDE__HAVE_INT128)
109 simde_int128 i128[1];
110 simde_uint128 u128[1];
116 #if defined(SIMDE_SSE2_NATIVE)
118 #elif defined(SIMDE_SSE2_NEON)
127 float32x4_t neon_f32;
128 #if defined(SIMDE_ARCH_AARCH64)
129 float64x2_t neon_f64;
135 #if defined(SIMDE__ENABLE_GCC_VEC_EXT)
136 int8_t i8 __attribute__((__vector_size__(16), __may_alias__));
137 int16_t i16 __attribute__((__vector_size__(16), __may_alias__));
138 int32_t i32 __attribute__((__vector_size__(16), __may_alias__));
139 int64_t i64 __attribute__((__vector_size__(16), __may_alias__));
140 uint8_t u8 __attribute__((__vector_size__(16), __may_alias__));
141 uint16_t u16 __attribute__((__vector_size__(16), __may_alias__));
142 uint32_t u32 __attribute__((__vector_size__(16), __may_alias__));
143 uint64_t u64 __attribute__((__vector_size__(16), __may_alias__));
144 simde_float32 f32 __attribute__((__vector_size__(16), __may_alias__));
145 simde_float64 f64 __attribute__((__vector_size__(16), __may_alias__));
159 #if defined(SIMDE_SSE2_NATIVE)
161 #elif defined(SIMDE_SSE2_NEON)
170 float32x4_t neon_f32;
171 #if defined(SIMDE_ARCH_AARCH64)
172 float64x2_t neon_f64;
177 #if defined(SIMDE_SSE2_NATIVE)
179 "__m128i size doesn't match simde__m128i size");
181 "__m128d size doesn't match simde__m128d size");
194 #elif defined(SIMDE_SSE_NEON)
195 #define SIMDE__M128I_NEON_C(T, expr) \
196 (simde__m128i) { .neon_##T = expr }
197 #define SIMDE__M128D_NEON_C(T, expr) \
198 (simde__m128d) { .neon_##T = expr }
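/* Constructor helpers: these wrap a native or NEON vector in the simde
 * wrapper type via a compound literal with a designated initializer. For
 * example, SIMDE__M128I_NEON_C(i32, vdupq_n_s32(0)) builds a zeroed
 * simde__m128i from a NEON int32x4_t. */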
206 #if defined(SIMDE_SSE2_NATIVE)
207 return SIMDE__M128I_C(_mm_add_epi8(a.n, b.n));
208 #elif defined(SIMDE_SSE2_NEON)
209 return SIMDE__M128I_NEON_C(i8, vaddq_s8(a.neon_i8, b.neon_i8));
213 for (size_t i = 0; i < (sizeof(r.i8) / sizeof(r.i8[0])); i++) {
214 r.i8[i] = a.i8[i] + b.i8[i];
223 #if defined(SIMDE_SSE2_NATIVE)
224 return SIMDE__M128I_C(_mm_add_epi16(a.n, b.n));
225 #elif defined(SIMDE_SSE2_NEON)
226 return SIMDE__M128I_NEON_C(i16, vaddq_s16(a.neon_i16, b.neon_i16));
230 for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
231 r.i16[i] = a.i16[i] + b.i16[i];
240 #if defined(SIMDE_SSE2_NATIVE)
241 return SIMDE__M128I_C(_mm_add_epi32(a.n, b.n));
242 #elif defined(SIMDE_SSE2_NEON)
243 return SIMDE__M128I_NEON_C(i32, vaddq_s32(a.neon_i32, b.neon_i32));
247 for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
248 r.i32[i] = a.i32[i] + b.i32[i];
257 #if defined(SIMDE_SSE2_NATIVE)
258 return SIMDE__M128I_C(_mm_add_epi64(a.n, b.n));
259 #elif defined(SIMDE_SSE2_NEON)
260 return SIMDE__M128I_NEON_C(i64, vaddq_s64(a.neon_i64, b.neon_i64));
264 for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
265 r.i64[i] = a.i64[i] + b.i64[i];
274 #if defined(SIMDE_SSE2_NATIVE)
275 return SIMDE__M128D_C(_mm_add_pd(a.n, b.n));
276 #elif defined(SIMDE_SSE2_NEON) && defined(SIMDE_ARCH_AARCH64)
277 return SIMDE__M128D_NEON_C(f64, vaddq_f64(a.neon_f64, b.neon_f64));
281 for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
282 r.f64[i] = a.f64[i] + b.f64[i];
291 #if defined(SIMDE_SSE2_NATIVE)
292 return SIMDE__M128D_C(_mm_add_sd(a.n, b.n));
295 r.f64[0] = a.f64[0] + b.f64[0];
304 #if defined(SIMDE_SSE2_NATIVE)
305 return SIMDE__M64_C(_mm_add_si64(a.n, b.n));
306 #elif defined(SIMDE_SSE2_NEON)
307 return SIMDE__M64_NEON_C(i64, vadd_s64(a.neon_i64, b.neon_i64));
318 #if defined(SIMDE_SSE2_NATIVE)
319 return SIMDE__M128I_C(_mm_adds_epi8(a.n, b.n));
320 #elif defined(SIMDE_SSE2_NEON)
321 return SIMDE__M128I_NEON_C(i8, vqaddq_s8(a.neon_i8, b.neon_i8));
325 for (size_t i = 0; i < (sizeof(r.i8) / sizeof(r.i8[0])); i++) {
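/* Scalar saturating add: test for signed overflow before adding so the
 * result clamps to INT8_MAX / INT8_MIN instead of wrapping. */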
326 if ((((b.i8[i]) > 0) && ((a.i8[i]) > (INT8_MAX - (b.i8[i]))))) {
328 } else if ((((b.i8[i]) < 0) &&
329 ((a.i8[i]) < (INT8_MIN - (b.i8[i]))))) {
332 r.i8[i] = (a.i8[i]) + (b.i8[i]);
342 #if defined(SIMDE_SSE2_NATIVE)
343 return SIMDE__M128I_C(_mm_adds_epi16(a.n, b.n));
344 #elif defined(SIMDE_SSE2_NEON)
345 return SIMDE__M128I_NEON_C(i16, vqaddq_s16(a.neon_i16, b.neon_i16));
349 for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
350 if ((((b.i16[i]) > 0) &&
351 ((a.i16[i]) > (INT16_MAX - (b.i16[i]))))) {
352 r.i16[i] = INT16_MAX;
353 } else if ((((b.i16[i]) < 0) &&
354 ((a.i16[i]) < (INT16_MIN - (b.i16[i]))))) {
355 r.i16[i] = INT16_MIN;
357 r.i16[i] = (a.i16[i]) + (b.i16[i]);
367 #if defined(SIMDE_SSE2_NATIVE)
368 return SIMDE__M128I_C(_mm_adds_epu8(a.n, b.n));
369 #elif defined(SIMDE_SSE2_NEON)
370 return SIMDE__M128I_NEON_C(u8, vqaddq_u8(a.neon_u8, b.neon_u8));
374 for (size_t i = 0; i < (sizeof(r.u8) / sizeof(r.u8[0])); i++) {
375 r.u8[i] = ((UINT8_MAX - a.u8[i]) > b.u8[i])
376 ? (a.u8[i] + b.u8[i])
386 #if defined(SIMDE_SSE2_NATIVE)
387 return SIMDE__M128I_C(_mm_adds_epu16(a.n, b.n));
388 #elif defined(SIMDE_SSE2_NEON)
389 return SIMDE__M128I_NEON_C(u16, vqaddq_u16(a.neon_u16, b.neon_u16));
393 for (size_t i = 0; i < (sizeof(r.u16) / sizeof(r.u16[0])); i++) {
394 r.u16[i] = ((UINT16_MAX - a.u16[i]) > b.u16[i])
395 ? (a.u16[i] + b.u16[i])
405 #if defined(SIMDE_SSE2_NATIVE)
406 return SIMDE__M128D_C(_mm_and_pd(a.n, b.n));
407 #elif defined(SIMDE_SSE2_NEON)
408 return SIMDE__M128D_NEON_C(i32, vandq_s32(a.neon_i32, b.neon_i32));
412 for (size_t i = 0; i < (sizeof(r.u64) / sizeof(r.u64[0])); i++) {
413 r.u64[i] = a.u64[i] & b.u64[i];
422 #if defined(SIMDE_SSE2_NATIVE)
423 return SIMDE__M128I_C(_mm_and_si128(a.n, b.n));
424 #elif defined(SIMDE_SSE_NEON)
425 return SIMDE__M128I_NEON_C(i32, vandq_s32(b.neon_i32, a.neon_i32));
429 for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
430 r.i64[i] = a.i64[i] & b.i64[i];
439 #if defined(SIMDE_SSE2_NATIVE)
440 return SIMDE__M128D_C(_mm_andnot_pd(a.n, b.n));
441 #elif defined(SIMDE_SSE2_NEON)
442 return SIMDE__M128D_NEON_C(i32, vbicq_s32(a.neon_i32, b.neon_i32));
446 for (size_t i = 0; i < (sizeof(r.u64) / sizeof(r.u64[0])); i++) {
447 r.u64[i] = ~a.u64[i] & b.u64[i];
456 #if defined(SIMDE_SSE2_NATIVE)
457 return SIMDE__M128I_C(_mm_andnot_si128(a.n, b.n));
458 #elif defined(SIMDE_SSE2_NEON)
459 return SIMDE__M128I_NEON_C(i32, vbicq_s32(b.neon_i32, a.neon_i32));
463 for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
464 r.i64[i] = ~(a.i64[i]) & b.i64[i];
473 #if defined(SIMDE_SSE2_NATIVE)
474 return SIMDE__M128I_C(_mm_avg_epu8(a.n, b.n));
475 #elif defined(SIMDE_SSE2_NEON)
476 return SIMDE__M128I_NEON_C(u8, vrhaddq_u8(b.neon_u8, a.neon_u8));
480 for (size_t i = 0; i < (sizeof(r.u8) / sizeof(r.u8[0])); i++) {
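/* Rounding average: the +1 makes the shift round halves up, matching
 * _mm_avg_epu8. */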
481 r.u8[i] = (a.u8[i] + b.u8[i] + 1) >> 1;
490 #if defined(SIMDE_SSE2_NATIVE)
491 return SIMDE__M128I_C(_mm_avg_epu16(a.n, b.n));
492 #elif defined(SIMDE_SSE2_NEON)
493 return SIMDE__M128I_NEON_C(u16, vrhaddq_u16(b.neon_u16, a.neon_u16));
497 for (size_t i = 0; i < (sizeof(r.u16) / sizeof(r.u16[0])); i++) {
498 r.u16[i] = (a.u16[i] + b.u16[i] + 1) >> 1;
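/* Portable byte shift for simde_mm_bslli_si128: imm8 counts bytes, so the
 * bit shift below is imm8 * 8, done either in one 128-bit shift when a
 * native 128-bit integer type is available or carried across the two 64-bit
 * halves otherwise. */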
515 const int s = imm8 * 8;
517 #if defined(SIMDE__HAVE_INT128)
518 r.u128[0] = a.u128[0] << s;
521 r.u64[0] = (a.u64[0] << s);
522 r.u64[1] = (a.u64[1] << s) | (a.u64[0] >> (64 - s));
525 r.u64[1] = a.u64[0] << (s - 64);
531 #if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
532 #define simde_mm_bslli_si128(a, imm8) SIMDE__M128I_C(_mm_slli_si128(a.n, imm8))
533 #elif defined(SIMDE_SSE2_NEON)
534 #define simde_mm_bslli_si128(a, imm8) \
535 SIMDE__M128I_NEON_C( \
537 (((imm8) <= 0) ? ((a).neon_i8) \
538 : (((imm8) > 15) ? (vdupq_n_s8(0)) \
539 : (vextq_s8(vdupq_n_s8(0), \
543 #define simde_mm_slli_si128(a, imm8) simde_mm_bslli_si128(a, imm8)
556 const int s = imm8 * 8;
558 #if defined(SIMDE__HAVE_INT128)
559 r.u128[0] = a.u128[0] >> s;
562 r.u64[0] = (a.u64[0] >> s) | (a.u64[1] << (64 - s));
563 r.u64[1] = (a.u64[1] >> s);
565 r.u64[0] = a.u64[1] >> (s - 64);
572 #if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
573 #define simde_mm_bsrli_si128(a, imm8) SIMDE__M128I_C(_mm_srli_si128(a.n, imm8))
574 #elif defined(SIMDE_SSE2_NEON)
575 #define simde_mm_bsrli_si128(a, imm8) \
576 SIMDE__M128I_NEON_C( \
580 : (((imm8) > 15) ? (vdupq_n_s8(0)) \
581 : (vextq_s8((a).neon_i8, \
582 vdupq_n_s8(0), (imm8)))))
584 #define simde_mm_srli_si128(a, imm8) simde_mm_bsrli_si128(a, imm8)
589 #if defined(SIMDE_SSE2_NATIVE)
599 #if defined(SIMDE_SSE2_NATIVE)
600 return _mm_comieq_sd(a.n, b.n);
602 return a.f64[0] == b.f64[0];
609 #if defined(SIMDE_SSE2_NATIVE)
610 return _mm_comige_sd(a.n, b.n);
612 return a.f64[0] >= b.f64[0];
619 #if defined(SIMDE_SSE2_NATIVE)
620 return _mm_comigt_sd(a.n, b.n);
622 return a.f64[0] > b.f64[0];
629 #if defined(SIMDE_SSE2_NATIVE)
630 return _mm_comile_sd(a.n, b.n);
632 return a.f64[0] <= b.f64[0];
639 #if defined(SIMDE_SSE2_NATIVE)
640 return _mm_comilt_sd(a.n, b.n);
642 return a.f64[0] < b.f64[0];
649 #if defined(SIMDE_SSE2_NATIVE)
650 return _mm_comineq_sd(a.n, b.n);
652 return a.f64[0] != b.f64[0];
659 #if defined(SIMDE_SSE2_NATIVE)
660 return SIMDE__M128_C(_mm_castpd_ps(a.n));
674 #if defined(SIMDE_SSE2_NATIVE)
675 return SIMDE__M128I_C(_mm_castpd_si128(a.n));
689 #if defined(SIMDE_SSE2_NATIVE)
690 return SIMDE__M128D_C(_mm_castps_pd(a.n));
704 #if defined(SIMDE_SSE2_NATIVE)
705 return SIMDE__M128I_C(_mm_castps_si128(a.n));
706 #elif defined(SIMDE_SSE2_NEON)
707 return SIMDE__M128I_NEON_C(i32, a.neon_i32);
721 #if defined(SIMDE_SSE2_NATIVE)
722 return SIMDE__M128D_C(_mm_castsi128_pd(a.n));
736 #if defined(SIMDE_SSE2_NATIVE)
737 return SIMDE__M128_C(_mm_castsi128_ps(a.n));
738 #elif defined(SIMDE_SSE2_NEON)
739 return SIMDE__M128_NEON_C(f32, a.neon_f32);
753 #if defined(SIMDE_SSE2_NATIVE)
754 return SIMDE__M128I_C(_mm_cmpeq_epi8(a.n, b.n));
755 #elif defined(SIMDE_SSE2_NEON)
756 return SIMDE__M128I_NEON_C(
757 i8, vreinterpretq_s8_u8(vceqq_s8(b.neon_i8, a.neon_i8)));
761 for (size_t i = 0; i < (sizeof(r.i8) / sizeof(r.i8[0])); i++) {
762 r.i8[i] = (a.i8[i] == b.i8[i]) ? 0xff : 0x00;
771 #if defined(SIMDE_SSE2_NATIVE)
772 return SIMDE__M128I_C(_mm_cmpeq_epi16(a.n, b.n));
773 #elif defined(SIMDE_SSE2_NEON)
774 return SIMDE__M128I_NEON_C(
775 i16, vreinterpretq_s16_u16(vceqq_s16(b.neon_i16, a.neon_i16)));
779 for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
780 r.i16[i] = (a.i16[i] == b.i16[i]) ? 0xffff : 0x0000;
789 #if defined(SIMDE_SSE2_NATIVE)
790 return SIMDE__M128I_C(_mm_cmpeq_epi32(a.n, b.n));
791 #elif defined(SIMDE_SSE2_NEON)
792 return SIMDE__M128I_NEON_C(
793 i32, vreinterpretq_s32_u32(vceqq_s32(b.neon_i32, a.neon_i32)));
797 for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
798 r.i32[i] = (a.i32[i] == b.i32[i]) ? 0xffffffff : 0x00000000;
807 #if defined(SIMDE_SSE2_NATIVE)
808 return SIMDE__M128D_C(_mm_cmpeq_pd(a.n, b.n));
809 #elif defined(SIMDE_SSE2_NEON)
810 return SIMDE__M128D_NEON_C(
811 i32, vreinterpretq_s32_u32(
812 vceqq_s32(vreinterpretq_s32_f32(b.neon_f32),
813 vreinterpretq_s32_f32(a.neon_f32))));
817 for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
818 r.u64[i] = (a.f64[i] == b.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
827 #if defined(SIMDE_SSE2_NATIVE)
828 return SIMDE__M128D_C(_mm_cmpeq_sd(a.n, b.n));
831 r.u64[0] = (a.f64[0] == b.f64[0]) ? ~UINT64_C(0) : 0;
840 #if defined(SIMDE_SSE2_NATIVE)
841 return SIMDE__M128D_C(_mm_cmpneq_pd(a.n, b.n));
842 #elif defined(SIMDE_SSE2_NEON)
843 return SIMDE__M128D_NEON_C(f32,
844 vreinterpretq_f32_u16(vmvnq_u16(
845 vceqq_s16(b.neon_i16, a.neon_i16))));
849 for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
850 r.u64[i] = (a.f64[i] != b.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
859 #if defined(SIMDE_SSE2_NATIVE)
860 return SIMDE__M128D_C(_mm_cmpneq_sd(a.n, b.n));
863 r.u64[0] = (a.f64[0] != b.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
872 #if defined(SIMDE_SSE2_NATIVE)
873 return SIMDE__M128I_C(_mm_cmplt_epi8(a.n, b.n));
874 #elif defined(SIMDE_SSE2_NEON)
875 return SIMDE__M128I_NEON_C(
876 i8, vreinterpretq_s8_u8(vcltq_s8(a.neon_i8, b.neon_i8)));
880 for (size_t i = 0; i < (sizeof(r.i8) / sizeof(r.i8[0])); i++) {
881 r.i8[i] = (a.i8[i] < b.i8[i]) ? 0xff : 0x00;
890 #if defined(SIMDE_SSE2_NATIVE)
891 return SIMDE__M128I_C(_mm_cmplt_epi16(a.n, b.n));
892 #elif defined(SIMDE_SSE2_NEON)
893 return SIMDE__M128I_NEON_C(
894 i16, vreinterpretq_s16_u16(vcltq_s16(a.neon_i16, b.neon_i16)));
898 for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
899 r.i16[i] = (a.i16[i] < b.i16[i]) ? 0xffff : 0x0000;
908 #if defined(SIMDE_SSE2_NATIVE)
909 return SIMDE__M128I_C(_mm_cmplt_epi32(a.n, b.n));
910 #elif defined(SIMDE_SSE2_NEON)
911 return SIMDE__M128I_NEON_C(
912 i32, vreinterpretq_s32_u32(vcltq_s32(a.neon_i32, b.neon_i32)));
916 for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
917 r.i32[i] = (a.i32[i] < b.i32[i]) ? 0xffffffff : 0x00000000;
926 #if defined(SIMDE_SSE2_NATIVE)
927 return SIMDE__M128D_C(_mm_cmplt_pd(a.n, b.n));
931 for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
932 r.u64[i] = (a.f64[i] < b.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
941 #if defined(SIMDE_SSE2_NATIVE)
942 return SIMDE__M128D_C(_mm_cmplt_sd(a.n, b.n));
945 r.u64[0] = (a.f64[0] < b.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
954 #if defined(SIMDE_SSE2_NATIVE)
955 return SIMDE__M128D_C(_mm_cmple_pd(a.n, b.n));
959 for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
960 r.u64[i] = (a.f64[i] <= b.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
969 #if defined(SIMDE_SSE2_NATIVE)
970 return SIMDE__M128D_C(_mm_cmple_sd(a.n, b.n));
973 r.u64[0] = (a.f64[0] <= b.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
982 #if defined(SIMDE_SSE2_NATIVE)
983 return SIMDE__M128I_C(_mm_cmpgt_epi8(a.n, b.n));
984 #elif defined(SIMDE_SSE2_NEON)
985 return SIMDE__M128I_NEON_C(
986 i8, vreinterpretq_s8_u8(vcgtq_s8(a.neon_i8, b.neon_i8)));
990 for (size_t i = 0; i < (sizeof(r.i8) / sizeof(r.i8[0])); i++) {
991 r.i8[i] = (a.i8[i] > b.i8[i]) ? 0xff : 0x00;
1000 #if defined(SIMDE_SSE2_NATIVE)
1001 return SIMDE__M128I_C(_mm_cmpgt_epi16(a.n, b.n));
1002 #elif defined(SIMDE_SSE2_NEON)
1003 return SIMDE__M128I_NEON_C(
1004 i16, vreinterpretq_s16_u16(vcgtq_s16(a.neon_i16, b.neon_i16)));
1008 for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
1009 r.i16[i] = (a.i16[i] > b.i16[i]) ? 0xffff : 0x0000;
1018 #if defined(SIMDE_SSE2_NATIVE)
1019 return SIMDE__M128I_C(_mm_cmpgt_epi32(a.n, b.n));
1020 #elif defined(SIMDE_SSE2_NEON)
1021 return SIMDE__M128I_NEON_C(
1022 i32, vreinterpretq_s32_u32(vcgtq_s32(a.neon_i32, b.neon_i32)));
1026 for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
1027 r.i32[i] = (a.i32[i] > b.i32[i]) ? 0xffffffff : 0x00000000;
1036 #if defined(SIMDE_SSE2_NATIVE)
1037 return SIMDE__M128D_C(_mm_cmpgt_pd(a.n, b.n));
1041 for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
1042 r.u64[i] = (a.f64[i] > b.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
1051 #if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
1052 return SIMDE__M128D_C(_mm_cmpgt_sd(a.n, b.n));
1055 r.u64[0] = (a.f64[0] > b.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
1056 r.u64[1] = a.u64[1];
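/* Like the other *_sd operations, only the low lane is compared; the upper
 * lane is passed through from a unchanged. */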
1064 #if defined(SIMDE_SSE2_NATIVE)
1065 return SIMDE__M128D_C(_mm_cmpge_pd(a.n, b.n));
1069 for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
1070 r.u64[i] = (a.f64[i] >= b.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
1079 #if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
1080 return SIMDE__M128D_C(_mm_cmpge_sd(a.n, b.n));
1083 r.u64[0] = (a.f64[0] >= b.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
1084 r.u64[1] = a.u64[1];
1092 #if defined(SIMDE_SSE2_NATIVE)
1093 return SIMDE__M128D_C(_mm_cmpnge_pd(a.n, b.n));
1102 #if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
1103 return SIMDE__M128D_C(_mm_cmpnge_sd(a.n, b.n));
1112 #if defined(SIMDE_SSE2_NATIVE)
1113 return SIMDE__M128D_C(_mm_cmpnlt_pd(a.n, b.n));
1122 #if defined(SIMDE_SSE2_NATIVE)
1123 return SIMDE__M128D_C(_mm_cmpnlt_sd(a.n, b.n));
1132 #if defined(SIMDE_SSE2_NATIVE)
1133 return SIMDE__M128D_C(_mm_cmpnle_pd(a.n, b.n));
1142 #if defined(SIMDE_SSE2_NATIVE)
1143 return SIMDE__M128D_C(_mm_cmpnle_sd(a.n, b.n));
1152 #if defined(SIMDE_SSE2_NATIVE)
1153 return SIMDE__M128D_C(_mm_cmpord_pd(a.n, b.n));
1157 for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
1158 r.u64[i] = (!isnan(a.f64[i]) && !isnan(b.f64[i])) ? ~UINT64_C(0)
1168 #if defined(SIMDE_SSE2_NATIVE)
1169 return SIMDE__M128D_C(_mm_cmpord_sd(a.n, b.n));
1172 r.u64[0] = (!isnan(a.f64[0]) && !isnan(b.f64[0])) ? ~UINT64_C(0)
1174 r.u64[1] = a.u64[1];
1182 #if defined(SIMDE_SSE2_NATIVE)
1183 return SIMDE__M128D_C(_mm_cmpunord_pd(a.n, b.n));
1187 for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
1188 r.u64[i] = (isnan(a.f64[i]) || isnan(b.f64[i])) ? ~UINT64_C(0)
1198 #if defined(SIMDE_SSE2_NATIVE)
1199 return SIMDE__M128D_C(_mm_cmpunord_sd(a.n, b.n));
1202 r.u64[0] = (isnan(a.f64[0]) || isnan(b.f64[0])) ? ~UINT64_C(0)
1204 r.u64[1] = a.u64[1];
1212 #if defined(SIMDE_SSE2_NATIVE)
1213 return SIMDE__M128D_C(_mm_cvtepi32_pd(a.n));
1217 for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
1227 #if defined(SIMDE_SSE2_NATIVE)
1228 return SIMDE__M128_C(_mm_cvtepi32_ps(a.n));
1229 #elif defined(SIMDE_SSE2_NEON)
1230 return SIMDE__M128_NEON_C(f32, vcvtq_f32_s32(a.neon_i32));
1234 for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
1244 #if defined(SIMDE_SSE2_NATIVE)
1245 return SIMDE__M128I_C(_mm_cvtpd_epi32(a.n));
1249 for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
1250 r.i32[i] = (int32_t)a.f64[i];
1259 #if defined(SIMDE_SSE2_NATIVE)
1260 return SIMDE__M64_C(_mm_cvtpd_pi32(a.n));
1264 for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
1265 r.i32[i] = (int32_t)a.f64[i];
1274 #if defined(SIMDE_SSE2_NATIVE)
1275 return SIMDE__M128_C(_mm_cvtpd_ps(a.n));
1279 for (size_t i = 0; i < (sizeof(a.f64) / sizeof(a.f64[0])); i++) {
1289 #if defined(SIMDE_SSE2_NATIVE)
1290 return SIMDE__M128D_C(_mm_cvtpi32_pd(a.n));
1294 for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
1304 #if defined(SIMDE_SSE2_NATIVE)
1305 return SIMDE__M128I_C(_mm_cvtps_epi32(a.n));
1306 #elif defined(SIMDE_SSE2_NEON)
1309 #if defined(SIMDE_ARCH_AARCH64)
1310 return SIMDE__M128I_NEON_C(i32, vcvtnq_s32_f32(a.neon_f32));
1312 uint32x4_t signmask = vdupq_n_u32(0x80000000);
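/* ARMv7 NEON has no vcvtnq_s32_f32, so this emulates _mm_cvtps_epi32's
 * round-to-nearest-even: r_normal rounds half away from zero by adding
 * +/-0.5 (sign taken from the input), r_even is an even-rounded candidate,
 * and the final vbslq_s32 selects r_even only for the exact-.5 ties. */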
1313 float32x4_t half = vbslq_f32(signmask, a.neon_f32,
1315 int32x4_t r_normal = vcvtq_s32_f32(
1316 vaddq_f32(a.neon_f32, half));
1318 vcvtq_s32_f32(a.neon_f32);
1319 int32x4_t plusone = vshrq_n_s32(vnegq_s32(r_trunc), 31);
1320 int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
1322 float32x4_t delta = vsubq_f32(
1324 vcvtq_f32_s32(r_trunc));
1325 uint32x4_t is_delta_half =
1326 vceqq_f32(delta, half);
1327 return SIMDE__M128I_NEON_C(i32,
1328 vbslq_s32(is_delta_half, r_even, r_normal));
1333 for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
1334 r.i32[i] = (int32_t)a.f32[i];
1343 #if defined(SIMDE_SSE2_NATIVE)
1344 return SIMDE__M128D_C(_mm_cvtps_pd(a.n));
1348 for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
1349 r.f64[i] = a.f32[i];
1358 #if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
1359 return _mm_cvtsd_f64(a.n);
1368 #if defined(SIMDE_SSE2_NATIVE)
1369 return _mm_cvtsd_si32(a.n);
1371 return (int32_t)a.f64[0];
1378 #if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
1380 return _mm_cvtsd_si64x(a.n);
1382 return _mm_cvtsd_si64(a.n);
1385 return (int32_t)a.f64[0];
1388 #define simde_mm_cvtsd_si64x(a) simde_mm_cvtsd_si64(a)
1393 #if defined(SIMDE_SSE2_NATIVE)
1394 return SIMDE__M128_C(_mm_cvtsd_ss(a.n, b.n));
1401 for (size_t i = 1; i < (sizeof(r) / sizeof(r.i32[0])); i++) {
1402 r.i32[i] = a.i32[i];
1412 #if defined(SIMDE_SSE2_NATIVE)
1413 return _mm_cvtsi128_si32(a.n);
1414 #elif defined(SIMDE_SSE2_NEON)
1415 return vgetq_lane_s32(a.neon_i32, 0);
1424 #if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
1426 return _mm_cvtsi128_si64x(a.n);
1428 return _mm_cvtsi128_si64(a.n);
1434 #define simde_mm_cvtsi128_si64x(a) simde_mm_cvtsi128_si64(a)
1439 #if defined(SIMDE_SSE2_NATIVE)
1440 return SIMDE__M128D_C(_mm_cvtsi32_sd(a.n, b));
1445 r.i64[1] = a.i64[1];
1456 #if defined(SIMDE_SSE2_NATIVE)
1457 r.n = _mm_cvtsi32_si128(a);
1458 #elif defined(SIMDE_SSE2_NEON)
1459 r.neon_i32 = vsetq_lane_s32(a, vdupq_n_s32(0), 0);
1475 #if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
1477 r.n = _mm_cvtsi64_sd(a.n, b);
1479 r.n = _mm_cvtsi64x_sd(a.n, b);
1483 r.f64[1] = a.f64[1];
1488 #define simde_mm_cvtsi64x_sd(a, b) simde_mm_cvtsi64_sd(a, b)
1495 #if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
1497 r.n = _mm_cvtsi64_si128(a);
1499 r.n = _mm_cvtsi64x_si128(a);
1508 #define simde_mm_cvtsi64x_si128(a) simde_mm_cvtsi64_si128(a)
1515 #if defined(SIMDE_SSE2_NATIVE)
1516 r.n = _mm_cvtss_sd(a.n, b.n);
1518 r.f64[0] = b.f32[0];
1519 r.i64[1] = a.i64[1];
1530 #if defined(SIMDE_SSE2_NATIVE)
1531 r.n = _mm_cvttpd_epi32(a.n);
1533 for (size_t i = 0; i < (sizeof(a.f64) / sizeof(a.f64[0])); i++) {
1534 r.i32[i] = (int32_t)trunc(a.f64[i]);
1546 #if defined(SIMDE_SSE2_NATIVE)
1547 r.n = _mm_cvttpd_pi32(a.n);
1549 for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
1550 r.i32[i] = (int32_t)trunc(a.f64[i]);
1562 #if defined(SIMDE_SSE2_NATIVE)
1563 r.n = _mm_cvttps_epi32(a.n);
1564 #elif defined(SIMDE_SSE2_NEON)
1565 r.neon_i32 = vcvtq_s32_f32(a.neon_f32);
1567 for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
1568 r.i32[i] = (int32_t)truncf(a.f32[i]);
1578 #if defined(SIMDE_SSE2_NATIVE)
1579 return _mm_cvttsd_si32(a.n);
1581 return (int32_t)trunc(a.f64[0]);
1588 #if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
1590 return _mm_cvttsd_si64(a.n);
1592 return _mm_cvttsd_si64x(a.n);
1595 return (int64_t)trunc(a.f64[0]);
1598 #define simde_mm_cvttsd_si64x(a) simde_mm_cvttsd_si64(a)
1605 #if defined(SIMDE_SSE2_NATIVE)
1606 r.n = _mm_div_pd(a.n, b.n);
1609 for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
1610 r.f64[i] = a.f64[i] / b.f64[i];
1622 #if defined(SIMDE_SSE2_NATIVE)
1623 r.n = _mm_div_sd(a.n, b.n);
1625 r.f64[0] = a.f64[0] / b.f64[0];
1626 r.f64[1] = a.f64[1];
1635 return a.u16[imm8 & 7];
1637 #if defined(SIMDE_SSE2_NATIVE) && \
1638 (!defined(SIMDE__REALLY_GCC) || HEDLEY_GCC_VERSION_CHECK(4, 6, 0))
1639 #define simde_mm_extract_epi16(a, imm8) _mm_extract_epi16(a.n, imm8)
1640 #elif defined(SIMDE_SSE2_NEON)
1641 #define simde_mm_extract_epi16(a, imm8) \
1642 (vgetq_lane_s16((a).neon_i16, (imm8)) & ((int32_t)UINT32_C(0x0000ffff)))
1648 a.u16[imm8 & 7] = (int16_t)i;
1651 #if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
1652 #define simde_mm_insert_epi16(a, i, imm8) \
1653 SIMDE__M128I_C(_mm_insert_epi16((a).n, (i), (imm8)))
1654 #elif defined(SIMDE_SSE2_NEON)
1655 #define simde_mm_insert_epi16(a, i, imm8) \
1656 SIMDE__M128I_NEON_C(i16, vsetq_lane_s16((i), a.neon_i16, (imm8)))
1667 #if defined(SIMDE_SSE2_NATIVE)
1668 r.n = _mm_load_pd(mem_addr);
1669 #elif defined(SIMDE_SSE2_NEON)
1670 r.neon_u32 = vld1q_u32((uint32_t const *)mem_addr);
1673 memcpy(&r, mem_addr, sizeof(r));
1684 #if defined(SIMDE_SSE2_NATIVE)
1685 r.n = _mm_load_pd1(mem_addr);
1687 r.f64[0] = *mem_addr;
1688 r.f64[1] = *mem_addr;
1693 #define simde_mm_load1_pd(mem_addr) simde_mm_load_pd1(mem_addr)
1700 #if defined(SIMDE_SSE2_NATIVE)
1701 r.n = _mm_load_sd(mem_addr);
1717 #if defined(SIMDE_SSE2_NATIVE)
1718 r.n = _mm_load_si128(&(mem_addr->n));
1719 #elif defined(SIMDE_SSE2_NEON)
1720 r.neon_i32 = vld1q_s32((int32_t const *)mem_addr);
1723 memcpy(&r, mem_addr, sizeof(r));
1734 #if defined(SIMDE_SSE2_NATIVE)
1735 r.n = _mm_loadh_pd(a.n, mem_addr);
1738 memcpy(&t, mem_addr, sizeof(t));
1739 r.f64[0] = a.f64[0];
1751 #if defined(SIMDE_SSE2_NATIVE)
1752 r.n = _mm_loadl_epi64(&mem_addr->n);
1753 #elif defined(SIMDE_SSE2_NEON)
1754 r.neon_i32 = vcombine_s32(vld1_s32((int32_t const *)mem_addr),
1757 r.u64[0] = mem_addr->u64[0];
1769 #if defined(SIMDE_SSE2_NATIVE)
1770 r.n = _mm_loadl_pd(a.n, mem_addr);
1773 r.u64[1] = a.u64[1];
1787 #if defined(SIMDE_SSE2_NATIVE)
1788 r.n = _mm_loadr_pd(mem_addr);
1791 r.f64[0] = mem_addr[1];
1792 r.f64[1] = mem_addr[0];
1804 #if defined(SIMDE_SSE2_NATIVE)
1805 r.n = _mm_loadu_pd(mem_addr);
1808 memcpy(&l, &mem_addr[0], sizeof(l));
1809 memcpy(&h, &mem_addr[1], sizeof(h));
1822 #if defined(SIMDE_SSE2_NATIVE)
1823 r.n = _mm_loadu_si128(&((*mem_addr).n));
1824 #elif defined(SIMDE_SSE2_NEON)
1825 r.neon_i32 = vld1q_s32((int32_t const *)mem_addr);
1827 memcpy(&r, mem_addr, sizeof(r));
1838 #if defined(SIMDE_SSE2_NATIVE)
1839 r.n = _mm_madd_epi16(a.n, b.n);
1840 #elif defined(SIMDE_SSE2_NEON)
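/* _mm_madd_epi16: multiply 16-bit lanes into 32-bit products and add
 * adjacent pairs. The two vmull_s16 calls cover the low and high halves and
 * vpadd_s32 does the horizontal pairwise additions. */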
1842 vmull_s16(vget_low_s16(a.neon_i16), vget_low_s16(b.neon_i16));
1844 vmull_s16(vget_high_s16(a.neon_i16), vget_high_s16(b.neon_i16));
1845 int32x2_t rl = vpadd_s32(vget_low_s32(pl), vget_high_s32(pl));
1846 int32x2_t rh = vpadd_s32(vget_low_s32(ph), vget_high_s32(ph));
1847 r.neon_i32 = vcombine_s32(rl, rh);
1850 for (size_t i = 0; i < (sizeof(r) / sizeof(r.i16[0])); i += 2) {
1852 (a.i16[i] * b.i16[i]) + (a.i16[i + 1] * b.i16[i + 1]);
1863 #if defined(SIMDE_SSE2_NATIVE)
1864 _mm_maskmoveu_si128(a.n, mask.n, (char *)mem_addr);
1866 for (size_t i = 0; i < 16; i++) {
1867 if (mask.u8[i] & 0x80) {
1868 mem_addr[i] = a.i8[i];
1877 #if defined(SIMDE_SSE2_NATIVE)
1878 return _mm_movemask_epi8(a.n);
1879 #elif defined(SIMDE_SSE2_NEON)
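/* NEON has no movemask, so each byte's sign bit is moved to a distinct bit
 * position (mask_and isolates bit 7, the negative counts in mask_shift make
 * vshl shift right), then repeated pairwise adds collapse the eight lanes of
 * each half into a single byte. */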
1880 uint8x16_t input = a.neon_u8;
1882 static const int8_t xr[8] = {-7, -6, -5, -4, -3, -2, -1, 0};
1883 uint8x8_t mask_and = vdup_n_u8(0x80);
1884 int8x8_t mask_shift = vld1_s8(xr);
1886 uint8x8_t lo = vget_low_u8(input);
1887 uint8x8_t hi = vget_high_u8(input);
1889 lo = vand_u8(lo, mask_and);
1890 lo = vshl_u8(lo, mask_shift);
1892 hi = vand_u8(hi, mask_and);
1893 hi = vshl_u8(hi, mask_shift);
1895 lo = vpadd_u8(lo, lo);
1896 lo = vpadd_u8(lo, lo);
1897 lo = vpadd_u8(lo, lo);
1899 hi = vpadd_u8(hi, hi);
1900 hi = vpadd_u8(hi, hi);
1901 hi = vpadd_u8(hi, hi);
1903 return ((hi[0] << 8) | (lo[0] & 0xFF));
1907 for (size_t i = 0; i < 16; i++) {
1908 r |= (a.u8[15 - i] >> 7) << (15 - i);
1917 #if defined(SIMDE_SSE2_NATIVE)
1918 return _mm_movemask_pd(a.n);
1922 for (size_t i = 0; i < (sizeof(a.u64) / sizeof(a.u64[0])); i++) {
1923 r |= (a.u64[i] >> 63) << i;
1934 #if defined(SIMDE_SSE2_NATIVE)
1935 r.n = _mm_movepi64_pi64(a.n);
1937 r.i64[0] = a.i64[0];
1948 #if defined(SIMDE_SSE2_NATIVE)
1949 r.n = _mm_movpi64_epi64(a.n);
1951 r.i64[0] = a.i64[0];
1963 #if defined(SIMDE_SSE2_NATIVE)
1964 r.n = _mm_min_epi16(a.n, b.n);
1965 #elif defined(SIMDE_SSE2_NEON)
1966 r.neon_i16 = vminq_s16(a.neon_i16, b.neon_i16);
1969 for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
1970 r.i16[i] = (a.i16[i] < b.i16[i]) ? a.i16[i] : b.i16[i];
1982 #if defined(SIMDE_SSE2_NATIVE)
1983 r.n = _mm_min_epu8(a.n, b.n);
1984 #elif defined(SIMDE_SSE2_NEON)
1985 r.neon_u8 = vminq_u8(a.neon_u8, b.neon_u8);
1988 for (size_t i = 0; i < (sizeof(r.u8) / sizeof(r.u8[0])); i++) {
1989 r.u8[i] = (a.u8[i] < b.u8[i]) ? a.u8[i] : b.u8[i];
2001 #if defined(SIMDE_SSE2_NATIVE)
2002 r.n = _mm_min_pd(a.n, b.n);
2005 for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
2006 r.f64[i] = (a.f64[i] < b.f64[i]) ? a.f64[i] : b.f64[i];
2018 #if defined(SIMDE_SSE2_NATIVE)
2019 r.n = _mm_min_sd(a.n, b.n);
2021 r.f64[0] = (a.f64[0] < b.f64[0]) ? a.f64[0] : b.f64[0];
2022 r.f64[1] = a.f64[1];
2033 #if defined(SIMDE_SSE2_NATIVE)
2034 r.n = _mm_max_epi16(a.n, b.n);
2035 #elif defined(SIMDE_SSE2_NEON)
2036 r.neon_i16 = vmaxq_s16(a.neon_i16, b.neon_i16);
2039 for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
2040 r.i16[i] = (a.i16[i] > b.i16[i]) ? a.i16[i] : b.i16[i];
2052 #if defined(SIMDE_SSE2_NATIVE)
2053 r.n = _mm_max_epu8(a.n, b.n);
2054 #elif defined(SIMDE_SSE2_NEON)
2055 r.neon_u8 = vmaxq_u8(a.neon_u8, b.neon_u8);
2058 for (size_t i = 0; i < (sizeof(r.u8) / sizeof(r.u8[0])); i++) {
2059 r.u8[i] = (a.u8[i] > b.u8[i]) ? a.u8[i] : b.u8[i];
2071 #if defined(SIMDE_SSE2_NATIVE)
2072 r.n = _mm_max_pd(a.n, b.n);
2075 for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
2076 r.f64[i] = (a.f64[i] > b.f64[i]) ? a.f64[i] : b.f64[i];
2088 #if defined(SIMDE_SSE2_NATIVE)
2089 r.n = _mm_max_sd(a.n, b.n);
2091 r.f64[0] = (a.f64[0] > b.f64[0]) ? a.f64[0] : b.f64[0];
2092 r.f64[1] = a.f64[1];
2103 #if defined(SIMDE_SSE2_NATIVE)
2104 r.n = _mm_move_epi64(a.n);
2105 #elif defined(SIMDE_SSE2_NEON)
2106 r.neon_i64 = vsetq_lane_s64(0, a.neon_i64, 1);
2108 r.i64[0] = a.i64[0];
2120 #if defined(SIMDE_SSE2_NATIVE)
2121 r.n = _mm_move_sd(a.n, b.n);
2123 r.f64[0] = b.f64[0];
2124 r.f64[1] = a.f64[1];
2135 #if defined(SIMDE_SSE2_NATIVE)
2136 r.n = _mm_mul_epu32(a.n, b.n);
2139 for (size_t i = 0; i < (sizeof(r.u64) / sizeof(r.u64[0])); i++) {
2140 r.u64[i] = ((uint64_t)a.u32[i * 2]) * ((uint64_t)b.u32[i * 2]);
2153 for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
2154 r.i64[i] = a.i64[i] * b.i64[i];
2166 for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
2167 r.i64[i] = a.i64[i] % b.i64[i];
2178 #if defined(SIMDE_SSE2_NATIVE)
2179 r.n = _mm_mul_pd(a.n, b.n);
2182 for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
2183 r.f64[i] = a.f64[i] * b.f64[i];
2195 #if defined(SIMDE_SSE2_NATIVE)
2196 r.n = _mm_mul_sd(a.n, b.n);
2198 r.f64[0] = a.f64[0] * b.f64[0];
2199 r.f64[1] = a.f64[1];
2210 #if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
2211 r.n = _mm_mul_su32(a.n, b.n);
2213 r.u64[0] = ((uint64_t)a.u32[0]) * ((uint64_t)b.u32[0]);
2224 #if defined(SIMDE_SSE2_NATIVE)
2225 r.n = _mm_mulhi_epi16(a.n, b.n);
2226 #elif defined(SIMDE_SSE2_NEON)
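/* Widen to 32-bit products with vmull_s16, then vuzpq_u16 de-interleaves
 * the 16-bit halves of those products; val[1] holds the upper 16 bits of
 * every product, which is what _mm_mulhi_epi16 returns. */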
2227 int16x4_t a3210 = vget_low_s16(a.neon_i16);
2228 int16x4_t b3210 = vget_low_s16(b.neon_i16);
2229 int32x4_t ab3210 = vmull_s16(a3210, b3210);
2230 int16x4_t a7654 = vget_high_s16(a.neon_i16);
2231 int16x4_t b7654 = vget_high_s16(b.neon_i16);
2232 int32x4_t ab7654 = vmull_s16(a7654, b7654);
2233 uint16x8x2_t rv = vuzpq_u16(vreinterpretq_u16_s32(ab3210),
2234 vreinterpretq_u16_s32(ab7654));
2235 r.neon_u16 = rv.val[1];
2238 for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
2239 r.u16[i] = (uint16_t)(((uint32_t)(((int32_t)a.i16[i]) *
2240 ((int32_t)b.i16[i]))) >>
2253 #if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
2254 r.n = _mm_mulhi_epu16(a.n, b.n);
2257 for (size_t i = 0; i < (sizeof(r.u16) / sizeof(r.u16[0])); i++) {
2258 r.u16[i] = (uint16_t)(
2259 (((uint32_t)a.u16[i]) * ((uint32_t)b.u16[i])) >> 16);
2271 #if defined(SIMDE_SSE2_NATIVE)
2272 r.n = _mm_mullo_epi16(a.n, b.n);
2273 #elif defined(SIMDE_SSE2_NEON)
2274 r.neon_i16 = vmulq_s16(a.neon_i16, b.neon_i16);
2277 for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
2278 r.u16[i] = (uint16_t)(((uint32_t)(((int32_t)a.i16[i]) *
2279 ((int32_t)b.i16[i]))) &
2292 #if defined(SIMDE_SSE2_NATIVE)
2293 r.n = _mm_or_pd(a.n, b.n);
2296 for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
2297 r.i64[i] = a.i64[i] | b.i64[i];
2309 #if defined(SIMDE_SSE2_NATIVE)
2310 r.n = _mm_or_si128(a.n, b.n);
2311 #elif defined(SIMDE_SSE2_NEON)
2312 r.neon_i32 = vorrq_s32(a.neon_i32, b.neon_i32);
2315 for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
2316 r.i64[i] = a.i64[i] | b.i64[i];
2328 #if defined(SIMDE_SSE2_NATIVE)
2329 r.n = _mm_packs_epi16(a.n, b.n);
2330 #elif defined(SIMDE_SSE2_NEON)
2331 r.neon_i8 = vcombine_s8(vqmovn_s16(a.neon_i16), vqmovn_s16(b.neon_i16));
2334 for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
2335 r.i8[i] = (a.i16[i] > INT8_MAX)
2337 : ((a.i16[i] < INT8_MIN)
2339 : ((int8_t)a.i16[i]));
2340 r.i8[i + 8] = (b.i16[i] > INT8_MAX)
2342 : ((b.i16[i] < INT8_MIN)
2344 : ((int8_t)b.i16[i]));
2356 #if defined(SIMDE_SSE2_NATIVE)
2357 r.n = _mm_packs_epi32(a.n, b.n);
2358 #elif defined(SIMDE_SSE2_NEON)
2360 vcombine_s16(vqmovn_s32(a.neon_i32), vqmovn_s32(b.neon_i32));
2363 for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
2364 r.i16[i] = (a.i32[i] > INT16_MAX)
2366 : ((a.i32[i] < INT16_MIN)
2368 : ((int16_t)a.i32[i]));
2369 r.i16[i + 4] = (b.i32[i] > INT16_MAX)
2371 : ((b.i32[i] < INT16_MIN)
2373 : ((int16_t)b.i32[i]));
2385 #if defined(SIMDE_SSE2_NATIVE)
2386 r.n = _mm_packus_epi16(a.n, b.n);
2387 #elif defined(SIMDE_SSE2_NEON)
2389 vcombine_u8(vqmovun_s16(a.neon_i16), vqmovun_s16(b.neon_i16));
2392 for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
2393 r.u8[i] = (a.i16[i] > UINT8_MAX)
2395 : ((a.i16[i] < 0) ? 0 : ((int8_t)a.i16[i]));
2397 (b.i16[i] > UINT8_MAX)
2399 : ((b.i16[i] < 0) ? 0 : ((int8_t)b.i16[i]));
2409 #if defined(SIMDE_SSE2_NATIVE)
2419 #if defined(SIMDE_SSE2_NATIVE)
2420 r.n = _mm_sad_epu8(a.n, b.n);
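/* Portable sum of absolute differences: each 64-bit lane of the result
 * accumulates |a.u8 - b.u8| over its own group of eight bytes. */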
2422 for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
2425 for (size_t j = 0; j < ((sizeof(r.u8) / sizeof(r.u8[0])) / 2);
2427 const size_t e = j + (i * 8);
2428 tmp += (a.u8[e] > b.u8[e]) ? (a.u8[e] - b.u8[e])
2429 : (b.u8[e] - a.u8[e]);
2440 int8_t e11, int8_t e10, int8_t e9, int8_t e8,
2441 int8_t e7, int8_t e6, int8_t e5, int8_t e4,
2442 int8_t e3, int8_t e2, int8_t e1, int8_t e0)
2446 #if defined(SIMDE_SSE2_NATIVE)
2447 r.n = _mm_set_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4,
2473 int16_t e3, int16_t e2, int16_t e1, int16_t e0)
2477 #if defined(SIMDE_SSE2_NATIVE)
2478 r.n = _mm_set_epi16(e7, e6, e5, e4, e3, e2, e1, e0);
2479 #elif defined(SIMDE_SSE2_NEON)
2480 SIMDE_ALIGN(16) int16_t data[8] = {e0, e1, e2, e3, e4, e5, e6, e7};
2481 r.neon_i16 = vld1q_s16(data);
2501 #if defined(SIMDE_SSE2_NATIVE)
2502 r.n = _mm_set_epi32(e3, e2, e1, e0);
2503 #elif defined(SIMDE_SSE2_NEON)
2504 SIMDE_ALIGN(16) int32_t data[4] = {e0, e1, e2, e3};
2505 r.neon_i32 = vld1q_s32(data);
2521 #if defined(SIMDE_SSE2_NATIVE)
2522 r.n = _mm_set_epi64(e1.n, e0.n);
2524 r.i64[0] = e0.i64[0];
2525 r.i64[1] = e1.i64[0];
2536 #if defined(SIMDE_SSE2_NATIVE)
2537 r.n = _mm_set_epi64x(e1, e0);
2538 #elif defined(SIMDE_SSE2_NEON)
2539 r = SIMDE__M128I_NEON_C(i64,
2540 vcombine_s64(vdup_n_s64(e0), vdup_n_s64(e1)));
2551 uint8_t e12, uint8_t e11, uint8_t e10,
2552 uint8_t e9, uint8_t e8, uint8_t e7, uint8_t e6,
2553 uint8_t e5, uint8_t e4, uint8_t e3, uint8_t e2,
2554 uint8_t e1, uint8_t e0)
2580 uint16_t e4, uint16_t e3, uint16_t e2,
2581 uint16_t e1, uint16_t e0)
2627 #if defined(SIMDE_SSE2_NATIVE)
2628 r.n = _mm_set_pd(e1, e0);
2642 #if defined(SIMDE_SSE2_NATIVE)
2643 r.n = _mm_set1_pd(a);
2657 #if defined(SIMDE_SSE2_NATIVE)
2658 r.n = _mm_set_sd(a);
2672 #if defined(SIMDE_SSE2_NATIVE)
2673 r.n = _mm_set1_epi8(a);
2674 #elif defined(SIMDE_SSE2_NEON)
2675 r.neon_i8 = vdupq_n_s8(a);
2678 for (size_t i = 0; i < (sizeof(r.i8) / sizeof(r.i8[0])); i++) {
2691 #if defined(SIMDE_SSE2_NATIVE)
2692 r.n = _mm_set1_epi16(a);
2693 #elif defined(SIMDE_SSE2_NEON)
2694 r.neon_i16 = vdupq_n_s16(a);
2697 for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
2710 #if defined(SIMDE_SSE2_NATIVE)
2711 r.n = _mm_set1_epi32(a);
2712 #elif defined(SIMDE_SSE2_NEON)
2713 r.neon_i32 = vdupq_n_s32(a);
2716 for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
2729 #if defined(SIMDE_SSE2_NATIVE)
2730 r.n = _mm_set1_epi64x(a);
2731 #elif defined(SIMDE_SSE2_NEON)
2732 r.neon_i64 = vmovq_n_s64(a);
2735 for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
2748 #if defined(SIMDE_SSE2_NATIVE)
2749 r.n = _mm_set1_epi64(a.n);
2752 for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
2753 r.i64[i] = a.i64[0];
2765 #if defined(SIMDE_SSE2_NATIVE)
2766 r.n = _mm_set1_pd(a);
2769 for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
2779 int8_t e11, int8_t e10, int8_t e9, int8_t e8,
2780 int8_t e7, int8_t e6, int8_t e5, int8_t e4,
2781 int8_t e3, int8_t e2, int8_t e1, int8_t e0)
2785 #if defined(SIMDE_SSE2_NATIVE)
2786 r.n = _mm_setr_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5,
2787 e4, e3, e2, e1, e0);
2788 #elif defined(SIMDE_SSE2_NEON)
2789 int8_t t[] = {e15, e14, e13, e12, e11, e10, e9, e8,
2790 e7, e6, e5, e4, e3, e2, e1, e0};
2791 r.neon_i8 = vld1q_s8(t);
2816 int16_t e3, int16_t e2, int16_t e1, int16_t e0)
2820 #if defined(SIMDE_SSE2_NATIVE)
2821 r.n = _mm_setr_epi16(e7, e6, e5, e4, e3, e2, e1, e0);
2822 #elif defined(SIMDE_SSE2_NEON)
2823 int16_t t[] = {e7, e6, e5, e4, e3, e2, e1, e0};
2824 r.neon_i16 = vld1q_s16(t);
2844 #if defined(SIMDE_SSE2_NATIVE)
2845 r.n = _mm_setr_epi32(e3, e2, e1, e0);
2846 #elif defined(SIMDE_SSE2_NEON)
2847 int32_t t[] = {e3, e2, e1, e0};
2848 r.neon_i32 = vld1q_s32(t);
2864 #if defined(SIMDE_SSE2_NATIVE)
2865 r.n = _mm_setr_epi64(e1.n, e0.n);
2866 #elif defined(SIMDE_SSE2_NEON)
2867 r.neon_i64 = vcombine_s64(e1.neon_i64, e0.neon_i64);
2869 r.i64[0] = e1.i64[0];
2870 r.i64[1] = e0.i64[0];
2881 #if defined(SIMDE_SSE2_NATIVE)
2882 r.n = _mm_setr_pd(e1, e0);
2896 #if defined(SIMDE_SSE2_NATIVE)
2897 r.n = _mm_setzero_pd();
2911 #if defined(SIMDE_SSE2_NATIVE)
2912 r.n = _mm_setzero_si128();
2913 #elif defined(SIMDE_SSE2_NEON)
2914 r.neon_i32 = vdupq_n_s32(0);
2928 for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
2929 r.i32[i] = a.i32[(imm8 >> (i * 2)) & 3];
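/* Each 2-bit field of imm8 selects which source lane feeds the
 * corresponding destination lane. */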
2934 #if defined(SIMDE_SSE2_NATIVE)
2935 #define simde_mm_shuffle_epi32(a, imm8) \
2936 SIMDE__M128I_C(_mm_shuffle_epi32((a).n, (imm8)))
2937 #elif defined(SIMDE__SHUFFLE_VECTOR)
2938 #define simde_mm_shuffle_epi32(a, imm8) \
2940 const simde__m128i simde__tmp_a_ = a; \
2941 (simde__m128i){.i32 = SIMDE__SHUFFLE_VECTOR( \
2942 32, 16, (simde__tmp_a_).i32, \
2943 (simde__tmp_a_).i32, ((imm8)) & 3, \
2944 ((imm8) >> 2) & 3, ((imm8) >> 4) & 3, \
2945 ((imm8) >> 6) & 3)}; \
2954 r.f64[0] = ((imm8 & 1) == 0) ? a.f64[0] : a.f64[1];
2955 r.f64[1] = ((imm8 & 2) == 0) ? b.f64[0] : b.f64[1];
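/* Bit 0 of imm8 picks the lane of a that becomes the low result lane and
 * bit 1 picks the lane of b that becomes the high result lane. */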
2959 #if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
2960 #define simde_mm_shuffle_pd(a, b, imm8) \
2961 SIMDE__M128D_C(_mm_shuffle_pd((a).n, (b).n, (imm8)))
2962 #elif defined(SIMDE__SHUFFLE_VECTOR)
2963 #define simde_mm_shuffle_pd(a, b, imm8) \
2965 (simde__m128d){.f64 = SIMDE__SHUFFLE_VECTOR( \
2966 64, 16, (a).f64, (b).f64, \
2968 (((imm8) >> 1) & 1) + 2)}; \
2977 r.i64[0] = a.i64[0];
2978 for (size_t i = 4; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
2979 r.i16[i] = a.i16[((imm8 >> ((i - 4) * 2)) & 3) + 4];
2984 #if defined(SIMDE_SSE2_NATIVE)
2985 #define simde_mm_shufflehi_epi16(a, imm8) \
2986 SIMDE__M128I_C(_mm_shufflehi_epi16((a).n, (imm8)))
2987 #elif defined(SIMDE__SHUFFLE_VECTOR)
2988 #define simde_mm_shufflehi_epi16(a, imm8) \
2990 const simde__m128i simde__tmp_a_ = a; \
2991 (simde__m128i){.i16 = SIMDE__SHUFFLE_VECTOR( \
2992 16, 16, (simde__tmp_a_).i16, \
2993 (simde__tmp_a_).i16, 0, 1, 2, 3, \
2994 (((imm8)) & 3) + 4, \
2995 (((imm8) >> 2) & 3) + 4, \
2996 (((imm8) >> 4) & 3) + 4, \
2997 (((imm8) >> 6) & 3) + 4)}; \
3006 for (size_t i = 0; i < ((sizeof(r.i16) / sizeof(r.i16[0])) / 2); i++) {
3007 r.i16[i] = a.i16[((imm8 >> (i * 2)) & 3)];
3009 r.i64[1] = a.i64[1];
3013 #if defined(SIMDE_SSE2_NATIVE)
3014 #define simde_mm_shufflelo_epi16(a, imm8) \
3015 SIMDE__M128I_C(_mm_shufflelo_epi16((a).n, (imm8)))
3016 #elif defined(SIMDE__SHUFFLE_VECTOR)
3017 #define simde_mm_shufflelo_epi16(a, imm8) \
3019 const simde__m128i simde__tmp_a_ = a; \
3020 (simde__m128i){.i16 = SIMDE__SHUFFLE_VECTOR( \
3021 16, 16, (simde__tmp_a_).i16, \
3022 (simde__tmp_a_).i16, (((imm8)) & 3), \
3023 (((imm8) >> 2) & 3), \
3024 (((imm8) >> 4) & 3), \
3025 (((imm8) >> 6) & 3), 4, 5, 6, 7)}; \
3032 #if defined(SIMDE_SSE2_NATIVE)
3033 return SIMDE__M128I_C(_mm_sll_epi16(a.n, count.n));
3037 if (count.u64[0] > 15)
3039 const int s = (int)(count.u64[0]);
3042 for (size_t i = 0; i < (sizeof(r.u16) / sizeof(r.u16[0])); i++) {
3043 r.u16[i] = a.u16[i] << s;
3052 #if defined(SIMDE_SSE2_NATIVE)
3053 return SIMDE__M128I_C(_mm_sll_epi32(a.n, count.n));
3057 if (count.u64[0] > 31)
3059 const int s = (int)(count.u64[0]);
3062 for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
3063 r.i32[i] = a.i32[i] << s;
3072 #if defined(SIMDE_SSE2_NATIVE)
3073 return SIMDE__M128I_C(_mm_sll_epi64(a.n, count.n));
3077 if (count.u64[0] > 63)
3079 const int s = (int)(count.u64[0]);
3082 for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
3083 r.i64[i] = a.i64[i] << s;
3092 #if defined(SIMDE_SSE2_NATIVE)
3093 return SIMDE__M128D_C(_mm_sqrt_pd(a.n));
3098 for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
3099 r.f64[i] = sqrt(a.f64[i]);
3109 #if defined(SIMDE_SSE2_NATIVE)
3110 return SIMDE__M128D_C(_mm_sqrt_sd(a.n, b.n));
3113 r.f64[0] = sqrt(b.f64[0]);
3114 r.f64[1] = a.f64[1];
3122 #if defined(SIMDE_SSE2_NATIVE)
3123 return SIMDE__M128I_C(_mm_srl_epi16(a.n, count.n));
3127 if (count.u64[0] > 15)
3129 const int s = (int)(count.u64[0]);
3132 for (size_t i = 0; i < (sizeof(r.u16) / sizeof(r.u16[0])); i++) {
3133 r.u16[i] = a.u16[i] >> s;
3142 #if defined(SIMDE_SSE2_NATIVE)
3143 return SIMDE__M128I_C(_mm_srl_epi32(a.n, count.n));
3147 if (count.u64[0] > 31)
3149 const int s = (int)(count.u64[0]);
3152 for (size_t i = 0; i < (sizeof(r.u32) / sizeof(r.u32[0])); i++) {
3153 r.u32[i] = a.u32[i] >> s;
3162 #if defined(SIMDE_SSE2_NATIVE)
3163 return SIMDE__M128I_C(_mm_srl_epi64(a.n, count.n));
3167 if (count.u64[0] > 63)
3169 const int s = (int)(count.u64[0]);
3172 for (size_t i = 0; i < (sizeof(r.u64) / sizeof(r.u64[0])); i++) {
3173 r.u64[i] = a.u64[i] >> s;
3185 (uint16_t)((~0U) << ((sizeof(int16_t) * CHAR_BIT) - imm8));
3188 for (size_t i = 0; i < (sizeof(r) / sizeof(r.u16[0])); i++) {
3189 const uint16_t is_neg = ((uint16_t)(
3190 ((a.u16[i]) >> ((sizeof(int16_t) * CHAR_BIT) - 1))));
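/* Arithmetic shift built from a logical one: shift the unsigned view, then
 * OR in the mask m to re-fill the vacated high bits when the lane was
 * negative. */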
3191 r.u16[i] = (a.u16[i] >> imm8) | (m * is_neg);
3196 #if defined(SIMDE_SSE2_NATIVE)
3197 #define simde_mm_srai_epi16(a, imm8) \
3198 SIMDE__M128I_C(_mm_srai_epi16((a).n, (imm8)));
3207 (uint32_t)((~0U) << ((sizeof(int) * CHAR_BIT) - imm8));
3209 for (size_t i = 0; i < (sizeof(r) / sizeof(r.u32[0])); i++) {
3210 uint32_t is_neg = ((uint32_t)(
3211 ((a.u32[i]) >> ((sizeof(int32_t) * CHAR_BIT) - 1))));
3212 r.u32[i] = (a.u32[i] >> imm8) | (m * is_neg);
3217 #if defined(SIMDE_SSE2_NATIVE)
3218 #define simde_mm_srai_epi32(a, imm8) \
3219 SIMDE__M128I_C(_mm_srai_epi32((a).n, (imm8)))
3220 #elif defined(SIMDE_SSE2_NEON)
3221 #define simde_mm_srai_epi32(a, imm8) \
3222 SIMDE__M128I_NEON_C( \
3227 ? (vshrq_n_s32(vshrq_n_s32(a.neon_i32, 16), \
3229 : (vshrq_n_s32(a.neon_i32, (imm8)))))
3235 #if defined(SIMDE_SSE2_NATIVE)
3236 return SIMDE__M128I_C(_mm_sra_epi16(a.n, count.n));
3239 int cnt = (int)count.i64[0];
3241 if (cnt > 15 || cnt < 0) {
3242 for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0]));
3244 r.u16[i] = (a.i16[i] < 0) ? 0xffff : 0x0000;
3247 const uint16_t m = (uint16_t)(
3248 (~0U) << ((sizeof(int16_t) * CHAR_BIT) - cnt));
3249 for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0]));
3251 const uint16_t is_neg = a.i16[i] < 0;
3252 r.u16[i] = (a.u16[i] >> cnt) | (m * is_neg);
3263 #if defined(SIMDE_SSE2_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_MM_SRA_EPI32)
3264 return SIMDE__M128I_C(_mm_sra_epi32(a.n, count.n));
3267 const uint64_t cnt = count.u64[0];
3270 for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0]));
3272 r.u32[i] = (a.i32[i] < 0) ? UINT32_MAX : 0;
3274 } else if (cnt == 0) {
3275 memcpy(&r, &a, sizeof(r));
3277 const uint32_t m = (uint32_t)(
3278 (~0U) << ((sizeof(int32_t) * CHAR_BIT) - cnt));
3279 for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0]));
3281 const uint32_t is_neg = a.i32[i] < 0;
3282 r.u32[i] = (a.u32[i] >> cnt) | (m * is_neg);
3294 const int s = (imm8 > ((int)sizeof(r.i16[0]) * CHAR_BIT) - 1) ? 0
3297 for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
3298 r.i16[i] = a.i16[i] << s;
3302 #if defined(SIMDE_SSE2_NATIVE)
3303 #define simde_mm_slli_epi16(a, imm8) SIMDE__M128I_C(_mm_slli_epi16(a.n, imm8));
3304 #elif defined(SIMDE_SSE2_NEON)
3305 #define simde_mm_slli_epi16(a, imm8) \
3306 SIMDE__M128I_NEON_C( \
3307 i16, ((imm8) <= 0) \
3309 : (((imm8) > 15) ? (vdupq_n_s16(0)) \
3310 : (vshlq_n_s16((a).neon_i16, \
3318 const int s = (imm8 > ((int)sizeof(r.i32[0]) * CHAR_BIT) - 1) ? 0
3321 for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
3322 r.i32[i] = a.i32[i] << s;
3326 #if defined(SIMDE_SSE2_NATIVE)
3327 #define simde_mm_slli_epi32(a, imm8) SIMDE__M128I_C(_mm_slli_epi32(a.n, imm8));
3328 #elif defined(SIMDE_SSE2_NEON)
3329 #define simde_mm_slli_epi32(a, imm8) \
3330 SIMDE__M128I_NEON_C( \
3331 i32, ((imm8) <= 0) \
3333 : (((imm8) > 31) ? (vdupq_n_s32(0)) \
3334 : (vshlq_n_s32((a).neon_i32, \
3342 const int s = (imm8 > ((int)sizeof(r.i64[0]) * CHAR_BIT) - 1) ? 0
3345 for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
3346 r.i64[i] = a.i64[i] << s;
3350 #if defined(SIMDE_SSE2_NATIVE)
3351 #define simde_mm_slli_epi64(a, imm8) SIMDE__M128I_C(_mm_slli_epi64(a.n, imm8));
3358 const int s = (imm8 > ((int)sizeof(r.i16[0]) * CHAR_BIT) - 1) ? 0
3361 for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
3362 r.u16[i] = a.u16[i] >> s;
3366 #if defined(SIMDE_SSE2_NATIVE)
3367 #define simde_mm_srli_epi16(a, imm8) SIMDE__M128I_C(_mm_srli_epi16(a.n, imm8));
3368 #elif defined(SIMDE_SSE2_NEON)
3369 #define simde_mm_srli_epi16(a, imm8) \
3370 SIMDE__M128I_NEON_C( \
3371 u16, ((imm8) <= 0) \
3373 : (((imm8) > 15) ? (vdupq_n_u16(0)) \
3374 : (vshrq_n_u16((a).neon_u16, \
3382 const int s = (imm8 > ((int)sizeof(r.i32[0]) * CHAR_BIT) - 1) ? 0
3385 for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
3386 r.u32[i] = a.u32[i] >> s;
3390 #if defined(SIMDE_SSE2_NATIVE)
3391 #define simde_mm_srli_epi32(a, imm8) SIMDE__M128I_C(_mm_srli_epi32(a.n, imm8))
3392 #elif defined(SIMDE_SSE2_NEON)
3393 #define simde_mm_srli_epi32(a, imm8) \
3394 SIMDE__M128I_NEON_C( \
3395 u32, ((imm8) <= 0) \
3397 : (((imm8) > 31) ? (vdupq_n_u32(0)) \
3398 : (vshrq_n_u32((a).neon_u32, \
3406 const unsigned char s = imm8 & 255;
3408 for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
3412 r.u64[i] = a.u64[i] >> s;
3417 #if defined(SIMDE_SSE2_NATIVE)
3418 #define simde_mm_srli_epi64(a, imm8) SIMDE__M128I_C(_mm_srli_epi64(a.n, imm8))
3419 #elif defined(SIMDE_SSE2_NEON)
3420 #define simde_mm_srli_epi64(a, imm8) \
3421 SIMDE__M128I_NEON_C( \
3423 (((imm8)&255) < 0 || ((imm8)&255) > 63) \
3424 ? (vdupq_n_u64(0)) \
3425 : ((((imm8)&255) == 0) \
3427 : (vshrq_n_u64((a).neon_u64, (imm8)&255))))
3436 #if defined(SIMDE_SSE2_NATIVE)
3437 _mm_store_pd(mem_addr, a.n);
3440 memcpy(mem_addr, &a, sizeof(a));
3450 #if defined(SIMDE_SSE2_NATIVE)
3451 _mm_store1_pd(mem_addr, a.n);
3454 mem_addr[0] = a.f64[0];
3455 mem_addr[1] = a.f64[0];
3458 #define simde_mm_store_pd1(mem_addr, a) simde_mm_store1_pd(mem_addr, a)
3463 #if defined(SIMDE_SSE2_NATIVE)
3464 _mm_store_sd(mem_addr, a.n);
3466 memcpy(mem_addr, &a, sizeof(a.f64[0]));
3473 #if defined(SIMDE_SSE2_NATIVE)
3474 _mm_store_si128(&mem_addr->n, a.n);
3475 #elif defined(SIMDE_SSE2_NEON)
3476 vst1q_s32((int32_t *)mem_addr, a.neon_i32);
3479 memcpy(mem_addr, &a, sizeof(a));
3486 #if defined(SIMDE_SSE2_NATIVE)
3487 _mm_storeh_pd(mem_addr, a.n);
3489 *mem_addr = a.f64[1];
3496 #if defined(SIMDE_SSE2_NATIVE)
3497 _mm_storel_epi64(&(mem_addr->n), a.n);
3498 #elif defined(SIMDE_SSE2_NEON)
3499 mem_addr->i64[0] = vgetq_lane_s64(a.neon_i64, 0);
3501 mem_addr->i64[0] = a.i64[0];
3508 #if defined(SIMDE_SSE2_NATIVE)
3509 _mm_storel_pd(mem_addr, a.n);
3511 *mem_addr = a.f64[0];
3520 #if defined(SIMDE_SSE2_NATIVE)
3521 _mm_storer_pd(mem_addr, a.n);
3524 mem_addr[0] = a.f64[1];
3525 mem_addr[1] = a.f64[0];
3532 #if defined(SIMDE_SSE2_NATIVE)
3533 _mm_storeu_pd(mem_addr, a.n);
3535 memcpy(mem_addr, &a, sizeof(a));
3542 #if defined(SIMDE_SSE2_NATIVE)
3543 _mm_storeu_si128(&mem_addr->n, a.n);
3544 #elif defined(SIMDE_SSE2_NEON)
3546 vst1q_s32(v, a.neon_i32);
3547 memcpy(mem_addr, v, sizeof(v));
3549 memcpy(mem_addr, &a, sizeof(a));
3557 #if defined(SIMDE_SSE2_NATIVE)
3558 _mm_stream_pd(mem_addr, a.n);
3561 memcpy(mem_addr, &a, sizeof(a));
3568 #if defined(SIMDE_SSE2_NATIVE)
3569 _mm_stream_si128(&mem_addr->n, a.n);
3572 memcpy(mem_addr, &a, sizeof(a));
3579 #if defined(SIMDE_SSE2_NATIVE)
3580 _mm_stream_si32(mem_addr, a);
3589 #if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
3590 #if defined(SIMDE__REALLY_GCC) && !HEDLEY_GCC_VERSION_CHECK(4, 8, 0)
3592 #elif defined(__GNUC__)
3593 _mm_stream_si64((long long *)mem_addr, a);
3595 _mm_stream_si64(mem_addr, a);
3605 #if defined(SIMDE_SSE2_NATIVE)
3606 return SIMDE__M128I_C(_mm_sub_epi8(a.n, b.n));
3607 #elif defined(SIMDE_SSE2_NEON)
3608 return SIMDE__M128I_NEON_C(i8, vsubq_s8(a.neon_i8, b.neon_i8));
3612 for (size_t i = 0; i < (sizeof(r.i8) / sizeof(r.i8[0])); i++) {
3613 r.i8[i] = a.i8[i] - b.i8[i];
3622 #if defined(SIMDE_SSE2_NATIVE)
3623 return SIMDE__M128I_C(_mm_sub_epi16(a.n, b.n));
3624 #elif defined(SIMDE_SSE2_NEON)
3625 return SIMDE__M128I_NEON_C(i16, vsubq_s16(a.neon_i16, b.neon_i16));
3629 for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
3630 r.i16[i] = a.i16[i] - b.i16[i];
3639 #if defined(SIMDE_SSE2_NATIVE)
3640 return SIMDE__M128I_C(_mm_sub_epi32(a.n, b.n));
3641 #elif defined(SIMDE_SSE2_NEON)
3642 return SIMDE__M128I_NEON_C(i32, vsubq_s32(a.neon_i32, b.neon_i32));
3646 for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
3647 r.i32[i] = a.i32[i] - b.i32[i];
3656 #if defined(SIMDE_SSE2_NATIVE)
3657 return SIMDE__M128I_C(_mm_sub_epi64(a.n, b.n));
3658 #elif defined(SIMDE_SSE2_NEON)
3659 return SIMDE__M128I_NEON_C(i64, vsubq_s64(a.neon_i64, b.neon_i64));
3663 for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
3664 r.i64[i] = a.i64[i] - b.i64[i];
3673 #if defined(SIMDE_SSE2_NATIVE)
3674 return SIMDE__M128D_C(_mm_sub_pd(a.n, b.n));
3678 for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
3679 r.f64[i] = a.f64[i] - b.f64[i];
3688 #if defined(SIMDE_SSE2_NATIVE)
3689 return SIMDE__M128D_C(_mm_sub_sd(a.n, b.n));
3692 r.f64[0] = a.f64[0] - b.f64[0];
3693 r.f64[1] = a.f64[1];
3701 #if defined(SIMDE_SSE2_NATIVE)
3702 return SIMDE__M64_C(_mm_sub_si64(a.n, b.n));
3713 #if defined(SIMDE_SSE2_NATIVE)
3714 return SIMDE__M128I_C(_mm_subs_epi8(a.n, b.n));
3715 #elif defined(SIMDE_SSE2_NEON)
3716 return SIMDE__M128I_NEON_C(i8, vqsubq_s8(a.neon_i8, b.neon_i8));
3720 for (size_t i = 0; i < (sizeof(r) / sizeof(r.i8[0])); i++) {
3721 if (((b.i8[i]) > 0 && (a.i8[i]) < INT8_MIN + (b.i8[i]))) {
3723 } else if ((b.i8[i]) < 0 && (a.i8[i]) > INT8_MAX + (b.i8[i])) {
3726 r.i8[i] = (a.i8[i]) - (b.i8[i]);
3736 #if defined(SIMDE_SSE2_NATIVE)
3737 return SIMDE__M128I_C(_mm_subs_epi16(a.n, b.n));
3738 #elif defined(SIMDE_SSE2_NEON)
3739 return SIMDE__M128I_NEON_C(i16, vqsubq_s16(a.neon_i16, b.neon_i16));
3743 for (size_t i = 0; i < (sizeof(r) / sizeof(r.i16[0])); i++) {
3744 if (((b.i16[i]) > 0 && (a.i16[i]) < INT16_MIN + (b.i16[i]))) {
3745 r.i16[i] = INT16_MIN;
3746 } else if ((b.i16[i]) < 0 &&
3747 (a.i16[i]) > INT16_MAX + (b.i16[i])) {
3748 r.i16[i] = INT16_MAX;
3750 r.i16[i] = (a.i16[i]) - (b.i16[i]);
3760 #if defined(SIMDE_SSE2_NATIVE)
3761 return SIMDE__M128I_C(_mm_subs_epu8(a.n, b.n));
3762 #elif defined(SIMDE_SSE2_NEON)
3763 return SIMDE__M128I_NEON_C(u8, vqsubq_u8(a.neon_u8, b.neon_u8));
3767 for (size_t i = 0; i < (sizeof(r) / sizeof(r.i8[0])); i++) {
3768 const int32_t x = a.u8[i] - b.u8[i];
3771 } else if (x > UINT8_MAX) {
3772 r.u8[i] = UINT8_MAX;
3774 r.u8[i] = (uint8_t)x;
3784 #if defined(SIMDE_SSE2_NATIVE)
3785 return SIMDE__M128I_C(_mm_subs_epu16(a.n, b.n));
3786 #elif defined(SIMDE_SSE2_NEON)
3787 return SIMDE__M128I_NEON_C(u16, vqsubq_u16(a.neon_u16, b.neon_u16));
3791 for (size_t i = 0; i < (sizeof(r) / sizeof(r.i16[0])); i++) {
3792 const int32_t x = a.u16[i] - b.u16[i];
3795 } else if (x > UINT16_MAX) {
3796 r.u16[i] = UINT16_MAX;
3798 r.u16[i] = (uint16_t)x;
3808 #if defined(SIMDE_SSE2_NATIVE)
3809 return _mm_ucomieq_sd(a.n, b.n);
3812 int x = feholdexcept(&envp);
3813 int r = a.f64[0] == b.f64[0];
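/* The floating-point environment is saved and the exception flags cleared
 * around the plain C comparison so the emulated ucomi* stays quiet
 * (non-signalling) like the intrinsic; the saved state is handled by the
 * surrounding code. */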
3823 #if defined(SIMDE_SSE2_NATIVE)
3824 return _mm_ucomige_sd(a.n, b.n);
3827 int x = feholdexcept(&envp);
3828 int r = a.f64[0] >= b.f64[0];
3838 #if defined(SIMDE_SSE2_NATIVE)
3839 return _mm_ucomigt_sd(a.n, b.n);
3842 int x = feholdexcept(&envp);
3843 int r = a.f64[0] > b.f64[0];
3853 #if defined(SIMDE_SSE2_NATIVE)
3854 return _mm_ucomile_sd(a.n, b.n);
3857 int x = feholdexcept(&envp);
3858 int r = a.f64[0] <= b.f64[0];
3868 #if defined(SIMDE_SSE2_NATIVE)
3869 return _mm_ucomilt_sd(a.n, b.n);
3872 int x = feholdexcept(&envp);
3873 int r = a.f64[0] < b.f64[0];
3883 #if defined(SIMDE_SSE2_NATIVE)
3884 return _mm_ucomineq_sd(a.n, b.n);
3887 int x = feholdexcept(&envp);
3888 int r = a.f64[0] != b.f64[0];
3900 #if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE__HAVE_UNDEFINED128)
3901 r.n = _mm_undefined_pd();
3914 #if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE__HAVE_UNDEFINED128)
3915 r.n = _mm_undefined_si128();
3926 #if defined(SIMDE_SSE2_NATIVE)
3936 #if defined(SIMDE_SSE2_NATIVE)
3946 #if defined(SIMDE_SSE2_NATIVE)
3947 return SIMDE__M128I_C(_mm_unpackhi_epi8(a.n, b.n));
3948 #elif defined(SIMDE_SSE2_NEON)
3949 int8x8_t a1 = vreinterpret_s8_s16(vget_high_s16(a.neon_i16));
3950 int8x8_t b1 = vreinterpret_s8_s16(vget_high_s16(b.neon_i16));
3951 int8x8x2_t result = vzip_s8(a1, b1);
3952 return SIMDE__M128I_NEON_C(i8,
3953 vcombine_s8(result.val[0], result.val[1]));
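/* Scalar unpackhi: interleave the upper halves of a and b, with even result
 * lanes taken from a and odd lanes from b. */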
3957 for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i8[0])) / 2); i++) {
3958 r.i8[(i * 2)] = a.i8[i + ((sizeof(r) / sizeof(r.i8[0])) / 2)];
3960 b.i8[i + ((sizeof(r) / sizeof(r.i8[0])) / 2)];
3969 #if defined(SIMDE_SSE2_NATIVE)
3970 return SIMDE__M128I_C(_mm_unpackhi_epi16(a.n, b.n));
3971 #elif defined(SIMDE_SSE2_NEON)
3972 int16x4_t a1 = vget_high_s16(a.neon_i16);
3973 int16x4_t b1 = vget_high_s16(b.neon_i16);
3974 int16x4x2_t result = vzip_s16(a1, b1);
3975 return SIMDE__M128I_NEON_C(i16,
3976 vcombine_s16(result.val[0], result.val[1]));
3980 for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i16[0])) / 2); i++) {
3982 a.i16[i + ((sizeof(r) / sizeof(r.i16[0])) / 2)];
3983 r.i16[(i * 2) + 1] =
3984 b.i16[i + ((sizeof(r) / sizeof(r.i16[0])) / 2)];
3993 #if defined(SIMDE_SSE2_NATIVE)
3994 return SIMDE__M128I_C(_mm_unpackhi_epi32(a.n, b.n));
3995 #elif defined(SIMDE_SSE2_NEON)
3996 int32x2_t a1 = vget_high_s32(a.neon_i32);
3997 int32x2_t b1 = vget_high_s32(b.neon_i32);
3998 int32x2x2_t result = vzip_s32(a1, b1);
3999 return SIMDE__M128I_NEON_C(i32,
4000 vcombine_s32(result.val[0], result.val[1]));
4004 for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i32[0])) / 2); i++) {
4006 a.i32[i + ((sizeof(r) / sizeof(r.i32[0])) / 2)];
4007 r.i32[(i * 2) + 1] =
4008 b.i32[i + ((sizeof(r) / sizeof(r.i32[0])) / 2)];
4017 #if defined(SIMDE_SSE2_NATIVE)
4018 return SIMDE__M128I_C(_mm_unpackhi_epi64(a.n, b.n));
4022 for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i64[0])) / 2); i++) {
4024 a.i64[i + ((sizeof(r) / sizeof(r.i64[0])) / 2)];
4025 r.i64[(i * 2) + 1] =
4026 b.i64[i + ((sizeof(r) / sizeof(r.i64[0])) / 2)];
4035 #if defined(SIMDE_SSE2_NATIVE)
4036 return SIMDE__M128D_C(_mm_unpackhi_pd(a.n, b.n));
4040 for (size_t i = 0; i < ((sizeof(r) / sizeof(r.f64[0])) / 2); i++) {
4042 a.f64[i + ((sizeof(r) / sizeof(r.f64[0])) / 2)];
4043 r.f64[(i * 2) + 1] =
4044 b.f64[i + ((sizeof(r) / sizeof(r.f64[0])) / 2)];
4053 #if defined(SIMDE_SSE2_NATIVE)
4054 return SIMDE__M128I_C(_mm_unpacklo_epi8(a.n, b.n));
4055 #elif defined(SIMDE_SSE2_NEON)
4056 int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(a.neon_i16));
4057 int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(b.neon_i16));
4058 int8x8x2_t result = vzip_s8(a1, b1);
4059 return SIMDE__M128I_NEON_C(i8,
4060 vcombine_s8(result.val[0], result.val[1]));
4064 for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i8[0])) / 2); i++) {
4065 r.i8[(i * 2)] = a.i8[i];
4066 r.i8[(i * 2) + 1] = b.i8[i];
4075 #if defined(SIMDE_SSE2_NATIVE)
4076 return SIMDE__M128I_C(_mm_unpacklo_epi16(a.n, b.n));
4077 #elif defined(SIMDE_SSE2_NEON)
4078 int16x4_t a1 = vget_low_s16(a.neon_i16);
4079 int16x4_t b1 = vget_low_s16(b.neon_i16);
4080 int16x4x2_t result = vzip_s16(a1, b1);
4081 return SIMDE__M128I_NEON_C(i16,
4082 vcombine_s16(result.val[0], result.val[1]));
4086 for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i16[0])) / 2); i++) {
4087 r.i16[(i * 2)] = a.i16[i];
4088 r.i16[(i * 2) + 1] = b.i16[i];
4097 #if defined(SIMDE_SSE2_NATIVE)
4098 return SIMDE__M128I_C(_mm_unpacklo_epi32(a.n, b.n));
4099 #elif defined(SIMDE_SSE2_NEON)
4100 int32x2_t a1 = vget_low_s32(a.neon_i32);
4101 int32x2_t b1 = vget_low_s32(b.neon_i32);
4102 int32x2x2_t result = vzip_s32(a1, b1);
4103 return SIMDE__M128I_NEON_C(i32,
4104 vcombine_s32(result.val[0], result.val[1]));
4108 for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i32[0])) / 2); i++) {
4109 r.i32[(i * 2)] = a.i32[i];
4110 r.i32[(i * 2) + 1] = b.i32[i];
4119 #if defined(SIMDE_SSE2_NATIVE)
4120 return SIMDE__M128I_C(_mm_unpacklo_epi64(a.n, b.n));
4124 for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i64[0])) / 2); i++) {
4125 r.i64[(i * 2)] = a.i64[i];
4126 r.i64[(i * 2) + 1] = b.i64[i];
4135 #if defined(SIMDE_SSE2_NATIVE)
4136 return SIMDE__M128D_C(_mm_unpacklo_pd(a.n, b.n));
4140 for (size_t i = 0; i < ((sizeof(r) / sizeof(r.f64[0])) / 2); i++) {
4141 r.f64[(i * 2)] = a.f64[i];
4142 r.f64[(i * 2) + 1] = b.f64[i];
4151 #if defined(SIMDE_SSE2_NATIVE)
4152 return SIMDE__M128D_C(_mm_xor_pd(a.n, b.n));
4156 for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
4157 r.i64[i] = a.i64[i] ^ b.i64[i];
4166 #if defined(SIMDE_SSE2_NATIVE)
4167 return SIMDE__M128I_C(_mm_xor_si128(a.n, b.n));
4168 #elif defined(SIMDE_SSE2_NEON)
4169 return SIMDE__M128I_NEON_C(i32, veorq_s32(a.neon_i32, b.neon_i32));
4173 for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
4174 r.i32[i] = a.i32[i] ^ b.i32[i];
4183 #if defined(SIMDE_SSE2_NEON)
4184 return SIMDE__M128I_NEON_C(i32, vmvnq_s32(a.neon_i32));
4188 for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
4189 r.i32[i] = ~(a.i32[i]);