Size | Untyped | Integer | Float | ||||||
---|---|---|---|---|---|---|---|---|---|
unsized | scalar | ||||||||
64 bit | int8 | int16 | int32 | int64 | |||||
128 bit | int8 | int16 | int32 | int64 | int128 | float32 | float64 | ||
256 bit | unsized | int8 | int16 | int32 | int64 | int128 | int256 | float32 | float64 |
Scalar:
Fg | Function | Intel | Gnu | Asm |
---|---|---|---|---|
control | ||||
M | clear 8087 float stack | void _mm_empty(void) | void __builtin_ia32_emms(void) | emms |
S | set state | void _mm_setcsr(unsigned int) | (assign) | ldmxcsr |
S | set state | void _MM_SET_EXCEPTION_MASK(unsigned int) | (assign) | - |
S | set state | void _MM_SET_EXCEPTION_STATE(unsigned int) | (assign) | - |
S | set state | void _MM_SET_FLUSH_ZERO_MODE(unsigned int) | (assign) | - |
S | set state | void _MM_SET_ROUNDING_MODE(unsigned int) | (assign) | - |
S | get state | unsigned int _mm_getcsr(void) | (assign) | stmxcsr |
S | get state | unsigned int _MM_GET_EXCEPTION_MASK() | (assign) | - |
S | get state | unsigned int _MM_GET_EXCEPTION_STATE() | (assign) | - |
S | get state | unsigned int _MM_GET_FLUSH_ZERO_MODE() | (assign) | - |
S | get state | unsigned int _MM_GET_ROUNDING_MODE() | (assign) | - |
memory | ||||
S | alloc | void*_mm_malloc(size_t size,size_t align) | void* aligned_alloc(size_t alignment,size_t size) void* memalign(size_t boundary,size_t size) |
- |
S | free | void _mm_free(void*mem_addr) | void free(void*ptr) | - |
store | ||||
S2 | store.nt i32 | void _mm_stream_si32(int*mem_addr,int) | void __builtin_ia32_movnti(int*,int) | movnti |
S2 | store.nt i64 | void _mm_stream_si64(__int64*mem_addr,__int64) | void __builtin_ia32_movnti64(long long int*,long long int) | movnti |
cache | ||||
S | prefetch skipping some cache levels | void _mm_prefetch(char const*p,int i) | void __builtin_prefetch(const void*,int rw,int locality) | prefetcht0 prefetcht1 prefetcht2 prefetchnta |
S2 | flush and invalidate cache line | void _mm_clflush(void const*p) | void __builtin_ia32_clflush(const void*) | clflush |
sync | ||||
S | wait for store completion | void _mm_sfence(void) | void __builtin_ia32_sfence(void) | sfence |
S2 | wait for load completion | void _mm_lfence(void) | void __builtin_ia32_lfence(void) | lfence |
S2 | wait for mem ops completion | void _mm_mfence(void) | void __builtin_ia32_mfence(void) | mfence |
SM | monitor | void _mm_monitor(void const*p,unsigned extensions,unsigned hints) | void __builtin_ia32_monitor(void*,unsigned int,unsigned int) | monitor |
SM | mwait | void _mm_mwait(unsigned extensions,unsigned hints) | void __builtin_ia32_mwait(unsigned int,unsigned int) | mwait |
S2 | sleep(120clk) | void _mm_pause(void) | void __builtin_ia32_pause(void) | pause |
MMX: 64 bit
MMX(64 bit): int8 × 8
Fg | Function | Intel | Gnu | Asm |
---|---|---|---|---|
store | ||||
S | store.nt (masked) di ≔ ai if bi<0 |
void _mm_maskmove_si64(__m64,__m64 mask,char*mem_addr) | void __builtin_ia32_maskmovq(v8qi,v8qi,char*) | maskmovq |
extract | ||||
S | extract signbits | int _mm_movemask_pi8(__m64) | int __builtin_ia32_pmovmskb(v8qi) | pmovmskb |
add | ||||
M | add | __m64 _mm_add_pi8(__m64,__m64) | v8qi __builtin_ia32_paddb(v8qi,v8qi) | paddb |
M | adds.s | __m64 _mm_adds_pi8(__m64,__m64) | v8qi __builtin_ia32_paddsb(v8qi,v8qi) | paddsb |
M | adds.u | __m64 _mm_adds_pu8(__m64,__m64) | v8qi __builtin_ia32_paddusb(v8qi,v8qi) | paddusb |
S | average.u | __m64 _mm_avg_pu8(__m64,__m64) | v8qi __builtin_ia32_pavgb(v8qi,v8qi) | pavgb |
sub | ||||
M | sub | __m64 _mm_sub_pi8(__m64,__m64) | v8qi __builtin_ia32_psubb(v8qi,v8qi) | psubb |
M | subs.s | __m64 _mm_subs_pi8(__m64,__m64) | v8qi __builtin_ia32_psubsb(v8qi,v8qi) | psubsb |
M | subs.u | __m64 _mm_subs_pu8(__m64,__m64) | v8qi __builtin_ia32_psubusb(v8qi,v8qi) | psubusb |
S | ∑abs(Δ) ⟨0,0,0,∑7-0⟩ |
__m64 _mm_sad_pu8(__m64,__m64) | v1di __builtin_ia32_psadbw(v8qi,v8qi) | psadbw |
mul | ||||
SE | mul ai×sign(bi) | __m64 _mm_sign_pi8(__m64,__m64) | v8qi __builtin_ia32_psignb(v8qi,v8qi) | psignb |
SE | 4×scalprod.u saturated | __m64 _mm_maddubs_pi16(__m64,__m64) | v4hi __builtin_ia32_pmaddubsw(v8qi,v8qi) | pmaddubsw |
math | ||||
S | min.u | __m64 _mm_min_pu8(__m64,__m64) | v8qi __builtin_ia32_pminub(v8qi,v8qi) | pminub |
S | max.u | __m64 _mm_max_pu8(__m64,__m64) | v8qi __builtin_ia32_pmaxub(v8qi,v8qi) | pmaxub |
SE | abs.s | __m64 _mm_abs_pi8(__m64) | v8qi __builtin_ia32_pabsb(v8qi) | pabsb |
logical | ||||
M | ∧ | __m64 _mm_and_si64(__m64,__m64) | di __builtin_ia32_pand(di,di) | pand |
M | ¬∧ | __m64 _mm_andnot_si64(__m64,__m64) | di __builtin_ia32_pandn(di,di) | pandn |
M | ∨ | __m64 _mm_or_si64(__m64,__m64) | di __builtin_ia32_por(di,di) | por |
M | ⊕ | __m64 _mm_xor_si64(__m64,__m64) | di __builtin_ia32_pxor(di,di) | pxor |
shift | ||||
SE | shift bytes right ⟨b7:0,a7:0⟩ |
__m64 _mm_alignr_pi8(__m64,__m64,int imm8) | v1di __builtin_ia32_palignr(v1di,v1di,int) | palignr |
compare | ||||
M | = | __m64 _mm_cmpeq_pi8(__m64,__m64) | v8qi __builtin_ia32_pcmpeqb(v8qi,v8qi) | pcmpeqb |
M | > | __m64 _mm_cmpgt_pi8(__m64,__m64) | v8qi __builtin_ia32_pcmpgtb(v8qi,v8qi) | pcmpgtb |
shuffle | ||||
M | interleave ⟨b3,a3,b2,a2⟩ ⟨b1,a1,b0,a0⟩ |
__m64 _mm_unpacklo_pi8(__m64,__m64) | v8qi __builtin_ia32_punpcklbw(v8qi,v8qi) | punpcklbw |
M | interleave ⟨b7,a7,b6,a6⟩ ⟨b5,a5,b4,a4⟩ |
__m64 _mm_unpackhi_pi8(__m64,__m64) | v8qi __builtin_ia32_punpckhbw(v8qi,v8qi) | punpckhbw |
SE | shuffle with zero ⟨ab7,…,ab0⟩ di≔0 if bi<0 |
__m64 _mm_shuffle_pi8(__m64,__m64) | v8qi __builtin_ia32_pshufb(v8qi,v8qi) | pshufb |
MMX(64 bit): int16 × 4
Fg | Function | Intel | Gnu | Asm |
---|---|---|---|---|
insert | ||||
S | insert ai i16 | __m64 _mm_insert_pi16(__m64,int i,int imm8) | v4hi __builtin_ia32_pinsrw(v4hi,int,int) | pinsrw |
extract | ||||
S | extract ai i16 | int _mm_extract_pi16(__m64,int imm8) | int __builtin_ia32_pextrw(v4hi,int) | pextrw |
add | ||||
M | add | __m64 _mm_add_pi16(__m64,__m64) | v4hi __builtin_ia32_paddw(v4hi,v4hi) | paddw |
SE | add ⟨b2+b3,b0+b1,a2+a3,a0+a1⟩ |
__m64 _mm_hadd_pi16(__m64,__m64) | v4hi __builtin_ia32_phaddw(v4hi,v4hi) | phaddw |
M | adds.s | __m64 _mm_adds_pi16(__m64,__m64) | v4hi __builtin_ia32_paddsw(v4hi,v4hi) | paddsw |
M | adds.u | __m64 _mm_adds_pu16(__m64,__m64) | v4hi __builtin_ia32_paddusw(v4hi,v4hi) | paddusw |
SE | adds.s ⟨b2+b3,b0+b1,a2+a3,a0+a1⟩ |
__m64 _mm_hadds_pi16(__m64,__m64) | v4hi __builtin_ia32_phaddsw(v4hi,v4hi) | phaddsw |
S | average.u | __m64 _mm_avg_pu16(__m64,__m64) | v4hi __builtin_ia32_pavgw(v4hi,v4hi) | pavgw |
sub | ||||
M | sub | __m64 _mm_sub_pi16(__m64,__m64) | v4hi __builtin_ia32_psubw(v4hi,v4hi) | psubw |
SE | sub ⟨b2–b3,b0–b1,a2–a3,a0–a1⟩ |
__m64 _mm_hsub_pi16(__m64,__m64) | v4hi __builtin_ia32_phsubw(v4hi,v4hi) | phsubw |
M | subs.s | __m64 _mm_subs_pi16(__m64,__m64) | v4hi __builtin_ia32_psubsw(v4hi,v4hi) | psubsw |
M | subs.u | __m64 _mm_subs_pu16(__m64,__m64) | v4hi __builtin_ia32_psubusw(v4hi,v4hi) | psubusw |
SE | subs ⟨b2–b3,b0–b1,a2–a3,a0–a1⟩ |
__m64 _mm_hsubs_pi16(__m64,__m64) | v4hi __builtin_ia32_phsubsw(v4hi,v4hi) | phsubsw |
mul | ||||
M | mul(lo) | __m64 _mm_mullo_pi16(__m64,__m64) | v4hi __builtin_ia32_pmullw(v4hi,v4hi) | pmullw |
M | mul(hi).s | __m64 _mm_mulhi_pi16(__m64,__m64) | v4hi __builtin_ia32_pmulhw(v4hi,v4hi) | pmulhw |
S | mul(hi).u | __m64 _mm_mulhi_pu16(__m64,__m64) | v4hi __builtin_ia32_pmulhuw(v4hi,v4hi) | pmulhuw |
SE | mul(≫ 15).s | __m64 _mm_mulhrs_pi16(__m64,__m64) | v4hi __builtin_ia32_pmulhrsw(v4hi,v4hi) | pmulhrsw |
SE | mul ai×sign(bi) | __m64 _mm_sign_pi16(__m64,__m64) | v4hi __builtin_ia32_psignw(v4hi,v4hi) | psignw |
M | 2×scalprod.s unsaturated | __m64 _mm_madd_pi16(__m64,__m64) | v2si __builtin_ia32_pmaddwd(v4hi,v4hi) | pmaddwd |
math | ||||
S | min.s | __m64 _mm_min_pi16(__m64,__m64) | v4hi __builtin_ia32_pminsw(v4hi,v4hi) | pminsw |
S | max.s | __m64 _mm_max_pi16(__m64,__m64) | v4hi __builtin_ia32_pmaxsw(v4hi,v4hi) | pmaxsw |
SE | abs.s | __m64 _mm_abs_pi16(__m64) | v4hi __builtin_ia32_pabsw(v4hi) | pabsw |
logical | ||||
M | ∧ | __m64 _mm_and_si64(__m64,__m64) | di __builtin_ia32_pand(di,di) | pand |
M | ¬∧ | __m64 _mm_andnot_si64(__m64,__m64) | di __builtin_ia32_pandn(di,di) | pandn |
M | ∨ | __m64 _mm_or_si64(__m64,__m64) | di __builtin_ia32_por(di,di) | por |
M | ⊕ | __m64 _mm_xor_si64(__m64,__m64) | di __builtin_ia32_pxor(di,di) | pxor |
shift | ||||
M | shift left | __m64 _mm_sll_pi16(__m64,__m64 count) | v4hi __builtin_ia32_psllw(v4hi,v4hi) | psllw |
M | shift left imm | __m64 _mm_slli_pi16(__m64,int imm8) | v4hi __builtin_ia32_psllwi(v4hi,int) | psllw |
M | shift right.s | __m64 _mm_sra_pi16(__m64,__m64 count) | v4hi __builtin_ia32_psraw(v4hi,v4hi) | psraw |
M | shift right.s imm | __m64 _mm_srai_pi16(__m64,int imm8) | v4hi __builtin_ia32_psrawi(v4hi,int) | psraw |
M | shift right.u | __m64 _mm_srl_pi16(__m64,__m64 count) | v4hi __builtin_ia32_psrlw(v4hi,v4hi) | psrlw |
M | shift right.u imm | __m64 _mm_srli_pi16(__m64,int imm8) | v4hi __builtin_ia32_psrlwi(v4hi,int) | psrlw |
compare | ||||
M | = | __m64 _mm_cmpeq_pi16(__m64,__m64) | v4hi __builtin_ia32_pcmpeqw(v4hi,v4hi) | pcmpeqw |
M | > | __m64 _mm_cmpgt_pi16(__m64,__m64) | v4hi __builtin_ia32_pcmpgtw(v4hi,v4hi) | pcmpgtw |
reduce | ||||
M | reduce+saturate.s | __m64 _mm_packs_pi16(__m64,__m64) | v8qi __builtin_ia32_packsswb(v4hi,v4hi) | packsswb |
M | reduce+saturate.u | __m64 _mm_packs_pu16(__m64,__m64) | v8qi __builtin_ia32_packuswb(v4hi,v4hi) | packuswb |
shuffle | ||||
M | interleave ⟨b1,a1,b0,a0⟩ | __m64 _mm_unpacklo_pi16(__m64,__m64) | v4hi __builtin_ia32_punpcklwd(v4hi,v4hi) | punpcklwd |
M | interleave ⟨b3,a3,b2,a2⟩ | __m64 _mm_unpackhi_pi16(__m64,__m64) | v4hi __builtin_ia32_punpckhwd(v4hi,v4hi) | punpckhwd |
S | shuffle ⟨al,ak,aj,ai⟩ |
__m64 _mm_shuffle_pi16(__m64,int imm8) | v4hi __builtin_ia32_pshufw(v4hi,int) | pshufw |
MMX(64 bit): int32 × 2
Fg | Function | Intel | Gnu | Asm |
---|---|---|---|---|
convert | ||||
M | create ⟨0,i32⟩ | __m64 _mm_cvtsi32_si64(int) | (cast) | movd |
extract | ||||
M | extract a0 i32 | int _mm_cvtsi64_si32(__m64) | (cast) | movd |
add | ||||
M | add | __m64 _mm_add_pi32(__m64,__m64) | v2si __builtin_ia32_paddd(v2si,v2si) | paddd |
SE | add ⟨b0+b1,a0+a1⟩ |
__m64 _mm_hadd_pi32(__m64,__m64) | v2si __builtin_ia32_phaddd(v2si,v2si) | phaddd |
sub | ||||
M | sub | __m64 _mm_sub_pi32(__m64,__m64) | v2si __builtin_ia32_psubd(v2si,v2si) | psubd |
SE | sub ⟨b0–b1,a0–a1⟩ |
__m64 _mm_hsub_pi32(__m64,__m64) | v2si __builtin_ia32_phsubd(v2si,v2si) | phsubd |
mul | ||||
S2 | mul ⟨a0×b0⟩→i64 | __m64 _mm_mul_su32(__m64,__m64) | v1di __builtin_ia32_pmuludq(v2si,v2si) | pmuludq |
SE | mul ai×sign(bi) | __m64 _mm_sign_pi32(__m64,__m64) | v2si __builtin_ia32_psignd(v2si,v2si) | psignd |
math | ||||
SE | abs.s | __m64 _mm_abs_pi32(__m64) | v2si __builtin_ia32_pabsd(v2si) | pabsd |
logical | ||||
M | ∧ | __m64 _mm_and_si64(__m64,__m64) | di __builtin_ia32_pand(di,di) | pand |
M | ¬∧ | __m64 _mm_andnot_si64(__m64,__m64) | di __builtin_ia32_pandn(di,di) | pandn |
M | ∨ | __m64 _mm_or_si64(__m64,__m64) | di __builtin_ia32_por(di,di) | por |
M | ⊕ | __m64 _mm_xor_si64(__m64,__m64) | di __builtin_ia32_pxor(di,di) | pxor |
shift | ||||
M | shift left | __m64 _mm_sll_pi32(__m64,__m64 count) | v2si __builtin_ia32_pslld(v2si,v2si) | pslld |
M | shift left imm | __m64 _mm_slli_pi32(__m64,int imm8) | v2si __builtin_ia32_pslldi(v2si,int) | pslld |
M | shift right.s | __m64 _mm_sra_pi32(__m64,__m64 count) | v2si __builtin_ia32_psrad(v2si,v2si) | psrad |
M | shift right.s imm | __m64 _mm_srai_pi32(__m64,int imm8) | v2si __builtin_ia32_psradi(v2si,int) | psrad |
M | shift right.u | __m64 _mm_srl_pi32(__m64,__m64 count) | v2si __builtin_ia32_psrld(v2si,v2si) | psrld |
M | shift right.u imm | __m64 _mm_srli_pi32(__m64,int imm8) | v2si __builtin_ia32_psrldi(v2si,int) | psrld |
compare | ||||
M | = | __m64 _mm_cmpeq_pi32(__m64,__m64) | v2si __builtin_ia32_pcmpeqd(v2si,v2si) | pcmpeqd |
M | > | __m64 _mm_cmpgt_pi32(__m64,__m64) | v2si __builtin_ia32_pcmpgtd(v2si,v2si) | pcmpgtd |
reduce | ||||
M | reduce+saturate.s ⟨b1,b0,a1,a0⟩ |
__m64 _mm_packs_pi32(__m64,__m64) | v4hi __builtin_ia32_packssdw(v2si,v2si) | packssdw |
shuffle | ||||
M | interleave ⟨b0,a0⟩ | __m64 _mm_unpacklo_pi32(__m64,__m64) | v2si __builtin_ia32_punpckldq(v2si,v2si) | punpckldq |
M | interleave ⟨b1,a1⟩ | __m64 _mm_unpackhi_pi32(__m64,__m64) | v2si __builtin_ia32_punpckhdq(v2si,v2si) | punpckhdq |
MMX(64 bit): int64
Fg | Function | Intel | Gnu | Asm |
---|---|---|---|---|
constant | ||||
M | const(0) | __m64 _mm_setzero_si64(void) | (cast) | pxor |
store | ||||
S | store.a8.nt | void _mm_stream_pi(__m64*mem_addr,__m64) | void __builtin_ia32_movntq(di*,di) | movntq |
convert | ||||
M | create ⟨i64⟩ | __m64 _mm_cvtsi64_m64(__int64) | (cast) | movq |
extract | ||||
M | extract a0 i64 | __int64 _mm_cvtm64_si64(__m64) | (cast) | movq |
add | ||||
S2 | add | __m64 _mm_add_si64(__m64,__m64) | (op) | paddq |
sub | ||||
S2 | sub | __m64 _mm_sub_si64(__m64,__m64) | (op) | psubq |
logical | ||||
M | ∧ | __m64 _mm_and_si64(__m64,__m64) | di __builtin_ia32_pand(di,di) | pand |
M | ¬∧ | __m64 _mm_andnot_si64(__m64,__m64) | di __builtin_ia32_pandn(di,di) | pandn |
M | ∨ | __m64 _mm_or_si64(__m64,__m64) | di __builtin_ia32_por(di,di) | por |
M | ⊕ | __m64 _mm_xor_si64(__m64,__m64) | di __builtin_ia32_pxor(di,di) | pxor |
shift | ||||
M | shift left | __m64 _mm_sll_si64(__m64,__m64 count) | v1di __builtin_ia32_psllq(v1di,v1di) | psllq |
M | shift left imm | __m64 _mm_slli_si64(__m64,int imm8) | v1di __builtin_ia32_psllqi(v1di,int) | psllq |
M | shift right.u | __m64 _mm_srl_si64(__m64,__m64 count) | v1di __builtin_ia32_psrlq(v1di,v1di) | psrlq |
M | shift right.u imm | __m64 _mm_srli_si64(__m64,int imm8) | v1di __builtin_ia32_psrlqi(v1di,int) | psrlq |
SSE: 128 Bits
SSE(128 bit): int8 × 16
Fg | Function | Intel | Gnu | Asm |
---|---|---|---|---|
load | ||||
S2 | load.u | __m128i _mm_loadu_si128(__m128i const*mem_addr) | v16qi __builtin_ia32_loaddqu(const char*) | movdqu |
store | ||||
S2 | store.u | void _mm_storeu_si128(__m128i*mem_addr,__m128i) | void __builtin_ia32_storedqu(char*,v16qi) | movdqu |
S2 | store.u.nt (masked) mi≔ai if bi<0 |
void _mm_maskmoveu_si128(__m128i,__m128i mask,char*mem_addr) | void __builtin_ia32_maskmovdqu(v16qi,v16qi,char*) | maskmovdqu |
insert | ||||
S4 | insert ai i8 | __m128i _mm_insert_epi8(__m128i,int i,const int imm8) | v16qi __builtin_ia32_vec_set_v16qi(v16qi,int,int) | pinsrb |
extract | ||||
S4 | extract ai i8 | int _mm_extract_epi8(__m128i,const int imm8) | uint8 __builtin_ia32_vec_ext_v16qi(v16qi,int) | pextrb |
S2 | extract signbits | int _mm_movemask_epi8(__m128i) | int __builtin_ia32_pmovmskb128(v16qi) | pmovmskb |
add | ||||
S2 | add | __m128i _mm_add_epi8(__m128i,__m128i) | v16qi __builtin_ia32_paddb128(v16qi,v16qi) | paddb |
S2 | adds.s | __m128i _mm_adds_epi8(__m128i,__m128i) | v16qi __builtin_ia32_paddsb128(v16qi,v16qi) | paddsb |
S2 | adds.u | __m128i _mm_adds_epu8(__m128i,__m128i) | v16qi __builtin_ia32_paddusb128(v16qi,v16qi) | paddusb |
S2 | average.u | __m128i _mm_avg_epu8(__m128i,__m128i) | v16qi __builtin_ia32_pavgb128(v16qi,v16qi) | pavgb |
sub | ||||
S2 | sub | __m128i _mm_sub_epi8(__m128i,__m128i) | v16qi __builtin_ia32_psubb128(v16qi,v16qi) | psubb |
S2 | subs.s | __m128i _mm_subs_epi8(__m128i,__m128i) | v16qi __builtin_ia32_psubsb128(v16qi,v16qi) | psubsb |
S2 | subs.u | __m128i _mm_subs_epu8(__m128i,__m128i) | v16qi __builtin_ia32_psubusb128(v16qi,v16qi) | psubusb |
S2 | ∑abs(Δ) ⟨0,0,∑15-8,∑7-0⟩ |
__m128i _mm_sad_epu8(__m128i,__m128i) | v2di __builtin_ia32_psadbw128(v16qi,v16qi) | psadbw |
S4 | i16 dj≔ ∑i∊{0…3}Δi
j∊{0..7} Δi=abs(ap+j+i-bq+i) p∊{0,4}, q∊{0,4,8,12} |
__m128i _mm_mpsadbw_epu8(__m128i,__m128i,const int imm8) | v8hi __builtin_ia32_mpsadbw128(v16qi,v16qi,int) | mpsadbw |
mul | ||||
SE | mul ai×sign(bi) | __m128i _mm_sign_epi8(__m128i,__m128i) | v16qi __builtin_ia32_psignb128(v16qi,v16qi) | psignb |
SE | 8×scalprod.u saturated | __m128i _mm_maddubs_epi16(__m128i,__m128i) | v8hi __builtin_ia32_pmaddubsw128(v16qi,v16qi) | pmaddubsw |
math | ||||
S4 | min.s | __m128i _mm_min_epi8(__m128i,__m128i) | v16qi __builtin_ia32_pminsb128(v16qi,v16qi) | pminsb |
S2 | min.u | __m128i _mm_min_epu8(__m128i,__m128i) | v16qi __builtin_ia32_pminub128(v16qi,v16qi) | pminub |
S4 | max.s | __m128i _mm_max_epi8(__m128i,__m128i) | v16qi __builtin_ia32_pmaxsb128(v16qi,v16qi) | pmaxsb |
S2 | max.u | __m128i _mm_max_epu8(__m128i,__m128i) | v16qi __builtin_ia32_pmaxub128(v16qi,v16qi) | pmaxub |
SE | abs.s | __m128i _mm_abs_epi8(__m128i) | v16qi __builtin_ia32_pabsb128(v16qi) | pabsb |
logical | ||||
S2 | ∧ | __m128i _mm_and_si128(__m128i,__m128i) | v2di __builtin_ia32_pand128(v2di,v2di) | pand |
S2 | ¬∧ | __m128i _mm_andnot_si128(__m128i,__m128i) | v2di __builtin_ia32_pandn128(v2di,v2di) | pandn |
S2 | ∨ | __m128i _mm_or_si128(__m128i,__m128i) | v2di __builtin_ia32_por128(v2di,v2di) | por |
S2 | ⊕ | __m128i _mm_xor_si128(__m128i,__m128i) | v2di __builtin_ia32_pxor128(v2di,v2di) | pxor |
shift | ||||
SE | shift bytes right ⟨b15:0,a15:0⟩ |
__m128i _mm_alignr_epi8(__m128i,__m128i,int imm8) | v2di __builtin_ia32_palignr128(v2di,v2di,int) | palignr |
compare | ||||
S2 | = | __m128i _mm_cmpeq_epi8(__m128i,__m128i) | v16qi __builtin_ia32_pcmpeqb128(v16qi,v16qi) | pcmpeqb |
S2 | > | __m128i _mm_cmpgt_epi8(__m128i,__m128i) | v16qi __builtin_ia32_pcmpgtb128(v16qi,v16qi) | pcmpgtb |
S2 | < | __m128i _mm_cmplt_epi8(__m128i,__m128i) | (argswap) | pcmpgtb |
shuffle | ||||
A2 | dup ⟨a0,…,a0⟩ | __m128i _mm_broadcastb_epi8(__m128i) | v16qi __builtin_ia32_pbroadcastb128(v16qi) | vpbroadcastb |
S2 | interleave ⟨b7,a7,b6,a6⟩ ⟨b5,a5,b4,a4⟩ ⟨b3,a3,b2,a2⟩ ⟨b1,a1,b0,a0⟩ |
__m128i _mm_unpacklo_epi8(__m128i,__m128i) | v16qi __builtin_ia32_punpcklbw128(v16qi,v16qi) | punpcklbw |
S2 | interleave ⟨b15,a15,b14,a14⟩ ⟨b13,a13,b12,a12⟩ ⟨b11,a11,b10,a10⟩ ⟨b9,a9,b8,a8⟩ |
__m128i _mm_unpackhi_epi8(__m128i,__m128i) | v16qi __builtin_ia32_punpckhbw128(v16qi,v16qi) | punpckhbw |
S4 | blend ⟨ci<0 ? bi : ai⟩ | __m128i _mm_blendv_epi8(__m128i,__m128i,__m128i mask) | v16qi __builtin_ia32_pblendvb128(v16qi,v16qi,v16qi) | pblendvb |
SE | shuffle with zero ⟨ab15,…,ab0⟩ di≔0 if bi<0 |
__m128i _mm_shuffle_epi8(__m128i,__m128i) | v16qi __builtin_ia32_pshufb128(v16qi,v16qi) | pshufb |
SSE(128 bit): int16 × 8
Fg | Function | Intel | Gnu | Asm |
---|---|---|---|---|
convert | ||||
S4 | convert.s ← i8[7:0/16] | __m128i _mm_cvtepi8_epi16(__m128i) | v8hi __builtin_ia32_pmovsxbw128(v16qi) | pmovsxbw |
S4 | convert.u ← u8[7:0/16] | __m128i _mm_cvtepu8_epi16(__m128i) | v8hi __builtin_ia32_pmovzxbw128(v16qi) | pmovzxbw |
insert | ||||
S2 | insert ai i16 | __m128i _mm_insert_epi16(__m128i,int i,int imm8) | (indexing) | pinsrw |
extract | ||||
S2 | extract ai i16 | int _mm_extract_epi16(__m128i,int imm8) | (indexing) | pextrw |
add | ||||
S2 | add | __m128i _mm_add_epi16(__m128i,__m128i) | v8hi __builtin_ia32_paddw128(v8hi,v8hi) | paddw |
SE | add ⟨b6+b7,b4+b5,b2+b3,b0+b1, a6+a7,a4+a5,a2+a3,a0+a1⟩ |
__m128i _mm_hadd_epi16(__m128i,__m128i) | v8hi __builtin_ia32_phaddw128(v8hi,v8hi) | phaddw |
S2 | adds.s | __m128i _mm_adds_epi16(__m128i,__m128i) | v8hi __builtin_ia32_paddsw128(v8hi,v8hi) | paddsw |
S2 | adds.u | __m128i _mm_adds_epu16(__m128i,__m128i) | v8hi __builtin_ia32_paddusw128(v8hi,v8hi) | paddusw |
SE | adds ⟨b6+b7,b4+b5,b2+b3,b0+b1, a6+a7,a4+a5,a2+a3,a0+a1⟩ |
__m128i _mm_hadds_epi16(__m128i,__m128i) | v8hi __builtin_ia32_phaddsw128(v8hi,v8hi) | phaddsw |
S2 | average.u | __m128i _mm_avg_epu16(__m128i,__m128i) | v8hi __builtin_ia32_pavgw128(v8hi,v8hi) | pavgw |
sub | ||||
S2 | sub | __m128i _mm_sub_epi16(__m128i,__m128i) | v8hi __builtin_ia32_psubw128(v8hi,v8hi) | psubw |
SE | sub ⟨b6–b7,b4–b5,b2–b3,b0–b1, a6–a7,a4–a5,a2–a3,a0–a1⟩ |
__m128i _mm_hsub_epi16(__m128i,__m128i) | v8hi __builtin_ia32_phsubw128(v8hi,v8hi) | phsubw |
S2 | subs.s | __m128i _mm_subs_epi16(__m128i,__m128i) | v8hi __builtin_ia32_psubsw128(v8hi,v8hi) | psubsw |
S2 | subs.u | __m128i _mm_subs_epu16(__m128i,__m128i) | v8hi __builtin_ia32_psubusw128(v8hi,v8hi) | psubusw |
SE | subs.s ⟨b6–b7,b4–b5,b2–b3,b0–b1, a6–a7,a4–a5,a2–a3,a0–a1⟩ |
__m128i _mm_hsubs_epi16(__m128i,__m128i) | v8hi __builtin_ia32_phsubsw128(v8hi,v8hi) | phsubsw |
mul | ||||
S2 | mul(lo) | __m128i _mm_mullo_epi16(__m128i,__m128i) | v8hi __builtin_ia32_pmullw128(v8hi,v8hi) | pmullw |
S2 | mul(hi).s | __m128i _mm_mulhi_epi16(__m128i,__m128i) | v8hi __builtin_ia32_pmulhw128(v8hi,v8hi) | pmulhw |
S2 | mul(hi).u | __m128i _mm_mulhi_epu16(__m128i,__m128i) | v8hi __builtin_ia32_pmulhuw128(v8hi,v8hi) | pmulhuw |
SE | mul(≫ 15).s | __m128i _mm_mulhrs_epi16(__m128i,__m128i) | v8hi __builtin_ia32_pmulhrsw128(v8hi,v8hi) | pmulhrsw |
SE | mul ai×sign(bi) | __m128i _mm_sign_epi16(__m128i,__m128i) | v8hi __builtin_ia32_psignw128(v8hi,v8hi) | psignw |
S2 | 4×scalprod.s unsaturated | __m128i _mm_madd_epi16(__m128i,__m128i) | v4si __builtin_ia32_pmaddwd128(v8hi,v8hi) | pmaddwd |
math | ||||
S2 | min.s | __m128i _mm_min_epi16(__m128i,__m128i) | v8hi __builtin_ia32_pminsw128(v8hi,v8hi) | pminsw |
S4 | min.u | __m128i _mm_min_epu16(__m128i,__m128i) | v8hi __builtin_ia32_pminuw128(v8hi,v8hi) | pminuw |
S4 | min.u{a7,…,a0} ⟨0,…,0,index,min⟩ |
__m128i _mm_minpos_epu16(__m128i) | v8hi __builtin_ia32_phminposuw128(v8hi) | phminposuw |
S2 | max.s | __m128i _mm_max_epi16(__m128i,__m128i) | v8hi __builtin_ia32_pmaxsw128(v8hi,v8hi) | pmaxsw |
S4 | max.u | __m128i _mm_max_epu16(__m128i,__m128i) | v8hi __builtin_ia32_pmaxuw128(v8hi,v8hi) | pmaxuw |
SE | abs.s | __m128i _mm_abs_epi16(__m128i) | v8hi __builtin_ia32_pabsw128(v8hi) | pabsw |
logical | ||||
S2 | ∧ | __m128i _mm_and_si128(__m128i,__m128i) | v2di __builtin_ia32_pand128(v2di,v2di) | pand |
S2 | ¬∧ | __m128i _mm_andnot_si128(__m128i,__m128i) | v2di __builtin_ia32_pandn128(v2di,v2di) | pandn |
S2 | ∨ | __m128i _mm_or_si128(__m128i,__m128i) | v2di __builtin_ia32_por128(v2di,v2di) | por |
S2 | ⊕ | __m128i _mm_xor_si128(__m128i,__m128i) | v2di __builtin_ia32_pxor128(v2di,v2di) | pxor |
shift | ||||
S2 | shift left | __m128i _mm_sll_epi16(__m128i,__m128i count) | v8hi __builtin_ia32_psllw128(v8hi,v8hi) | psllw |
S2 | shift left imm | __m128i _mm_slli_epi16(__m128i,int imm8) | v8hi __builtin_ia32_psllwi128(v8hi,int) | psllw |
S2 | shift right.s | __m128i _mm_sra_epi16(__m128i,__m128i count) | v8hi __builtin_ia32_psraw128(v8hi,v8hi) | psraw |
S2 | shift right.s imm | __m128i _mm_srai_epi16(__m128i,int imm8) | v8hi __builtin_ia32_psrawi128(v8hi,int) | psraw |
S2 | shift right.u | __m128i _mm_srl_epi16(__m128i,__m128i count) | v8hi __builtin_ia32_psrlw128(v8hi,v8hi) | psrlw |
S2 | shift right.u imm | __m128i _mm_srli_epi16(__m128i,int imm8) | v8hi __builtin_ia32_psrlwi128(v8hi,int) | psrlw |
compare | ||||
S2 | = | __m128i _mm_cmpeq_epi16(__m128i,__m128i) | v8hi __builtin_ia32_pcmpeqw128(v8hi,v8hi) | pcmpeqw |
S2 | > | __m128i _mm_cmpgt_epi16(__m128i,__m128i) | v8hi __builtin_ia32_pcmpgtw128(v8hi,v8hi) | pcmpgtw |
S2 | < | __m128i _mm_cmplt_epi16(__m128i,__m128i) | (argswap) | pcmpgtw |
reduce | ||||
S2 | reduce+saturate.s ⟨b[7-0],a[7-0]⟩ |
__m128i _mm_packs_epi16(__m128i,__m128i) | v16qi __builtin_ia32_packsswb128(v8hi,v8hi) | packsswb |
S2 | reduce+saturate.u ⟨b[7-0],a[7-0]⟩ |
__m128i _mm_packus_epi16(__m128i,__m128i) | v16qi __builtin_ia32_packuswb128(v8hi,v8hi) | packuswb |
shuffle | ||||
A2 | dup ⟨a0,…,a0⟩ | __m128i _mm_broadcastw_epi16(__m128i) | v8hi __builtin_ia32_pbroadcastw128(v8hi) | vpbroadcastw |
S2 | interleave ⟨b3,a3,b2,a2⟩ ⟨b1,a1,b0,a0⟩ |
__m128i _mm_unpacklo_epi16(__m128i,__m128i) | v8hi __builtin_ia32_punpcklwd128(v8hi,v8hi) | punpcklwd |
S2 | interleave ⟨b7,a7,b6,a6⟩ ⟨b5,a5,b4,a4⟩ |
__m128i _mm_unpackhi_epi16(__m128i,__m128i) | v8hi __builtin_ia32_punpckhwd128(v8hi,v8hi) | punpckhwd |
S4 | blend ⟨imm8i ? bi : ai⟩ | __m128i _mm_blend_epi16(__m128i,__m128i,const int imm8) | v8hi __builtin_ia32_pblendw128(v8hi,v8hi,int) | pblendw |
S2 | shuffle.lo from a3…a0 ⟨a7,a6,a5,a4,al,ak,aj,ai⟩ |
__m128i _mm_shufflelo_epi16(__m128i,int imm8) | v8hi __builtin_ia32_pshuflw(v8hi,int) | pshuflw |
S2 | shuffle.hi from a7…a4 ⟨al,ak,aj,ai,a3,a2,a1,a0⟩ |
__m128i _mm_shufflehi_epi16(__m128i,int imm8) | v8hi __builtin_ia32_pshufhw(v8hi,int) | pshufhw |
SSE(128 bit): int32 × 4
Fg | Function | Intel | Gnu | Asm |
---|---|---|---|---|
load | ||||
S2 | load.u ⟨0,0,0,m0⟩ | __m128i _mm_loadu_si32(void const*mem_addr) | v4si _mm_set_epi32(0,0,0,*(int const*)) | movd |
A2 | gather ofs:i32 ⟨mofsi⟩ |
__m128i _mm_i32gather_epi32(int const*base_addr,__m128i vindex,const int scale) | v4si __builtin_ia32_gathersiv4si(0,int const*,__v4si,~0,int) | vpgatherdd |
A2 | gather ofs:i32 ⟨di<0?mofsi:ai⟩ |
__m128i _mm_mask_i32gather_epi32(__m128i src,int const*base_addr,__m128i vindex,__m128i mask,const int scale) | v4si __builtin_ia32_gathersiv4si(v4si,int const*,__v4si,v4si,int) | vpgatherdd |
A2 | gather ofs:i64 ⟨mofsi⟩ |
__m128i _mm_i64gather_epi32(int const*base_addr,__m128i vindex,const int scale) | v4si __builtin_ia32_gatherdiv4si(0,int const*,__v2di,~0,int) | vpgatherqd |
A2 | gather ofs:i64 ⟨di<0?mofsi:ai⟩ |
__m128i _mm_mask_i64gather_epi32(__m128i src,int const*base_addr,__m128i vindex,__m128i mask,const int scale) | v4si __builtin_ia32_gatherdiv4si(v4si,int const*,__v2di,v4si,int) | vpgatherqd |
A2 | load ⟨bi<0 ? mi : 0⟩ | __m128i _mm_maskload_epi32(int const*mem_addr,__m128i mask) | v4si __builtin_ia32_maskloadd(const v4si*,v4si) | vpmaskmovd |
store | ||||
S2 | store.u a0 | void _mm_storeu_si32(void*mem_addr,__m128i) | (assign) | movd |
A2 | store ⟨mi ≔ bi if ci<0⟩ | void _mm_maskstore_epi32(int*mem_addr,__m128i mask,__m128i) | void __builtin_ia32_maskstored(v4si*,v4si,v4si) | vpmaskmovd |
convert | ||||
S2 | create ⟨0,0,0,i32⟩ | __m128i _mm_cvtsi32_si128(int) | v4si _mm_set_epi32(0,0,0,int) | movd |
S4 | convert.s ← i8[3:0/16] | __m128i _mm_cvtepi8_epi32(__m128i) | v4si __builtin_ia32_pmovsxbd128(v16qi) | pmovsxbd |
S4 | convert.u ← u8[3:0/16] | __m128i _mm_cvtepu8_epi32(__m128i) | v4si __builtin_ia32_pmovzxbd128(v16qi) | pmovzxbd |
S4 | convert.s ← i16[3:0/8] | __m128i _mm_cvtepi16_epi32(__m128i) | v4si __builtin_ia32_pmovsxwd128(v8hi) | pmovsxwd |
S4 | convert.u ← u16[3:0/8] | __m128i _mm_cvtepu16_epi32(__m128i) | v4si __builtin_ia32_pmovzxwd128(v8hi) | pmovzxwd |
insert | ||||
S4 | insert ai i32 | __m128i _mm_insert_epi32(__m128i,int i,const int imm8) | v4si __builtin_ia32_vec_set_v4si(v4si,int,int) | pinsrd |
extract | ||||
S2 | extract a0 i32 | int _mm_cvtsi128_si32(__m128i) | (cast) | movd |
S4 | extract ai i32 | int _mm_extract_epi32(__m128i,const int imm8) | int __builtin_ia32_vec_ext_v4si(v4si,int) | pextrd |
add | ||||
S2 | add | __m128i _mm_add_epi32(__m128i,__m128i) | v4si __builtin_ia32_paddd128(v4si,v4si) | paddd |
SE | add ⟨b2+b3,b0+b1,a2+a3,a0+a1⟩ |
__m128i _mm_hadd_epi32(__m128i,__m128i) | v4si __builtin_ia32_phaddd128(v4si,v4si) | phaddd |
sub | ||||
S2 | sub | __m128i _mm_sub_epi32(__m128i,__m128i) | v4si __builtin_ia32_psubd128(v4si,v4si) | psubd |
SE | sub ⟨b2–b3,b0–b1,a2–a3,a0–a1⟩ |
__m128i _mm_hsub_epi32(__m128i,__m128i) | v4si __builtin_ia32_phsubd128(v4si,v4si) | phsubd |
mul | ||||
S4 | mul.s ⟨a2×b2,a0×b0⟩→i64 | __m128i _mm_mul_epi32(__m128i,__m128i) | v2di __builtin_ia32_pmuldq128(v4si,v4si) | pmuldq |
S2 | mul.u ⟨a2×b2,a0×b0⟩→i64 | __m128i _mm_mul_epu32(__m128i,__m128i) | v2di __builtin_ia32_pmuludq128(v4si,v4si) | pmuludq |
S4 | mul.lo | __m128i _mm_mullo_epi32(__m128i,__m128i) | (op) | pmulld |
SE | mul ai×sign(bi) | __m128i _mm_sign_epi32(__m128i,__m128i) | v4si __builtin_ia32_psignd128(v4si,v4si) | psignd |
math | ||||
S4 | min.s | __m128i _mm_min_epi32(__m128i,__m128i) | v4si __builtin_ia32_pminsd128(v4si,v4si) | pminsd |
S4 | min.u | __m128i _mm_min_epu32(__m128i,__m128i) | v4si __builtin_ia32_pminud128(v4si,v4si) | pminud |
S4 | max.s | __m128i _mm_max_epi32(__m128i,__m128i) | v4si __builtin_ia32_pmaxsd128(v4si,v4si) | pmaxsd |
S4 | max.u | __m128i _mm_max_epu32(__m128i,__m128i) | v4si __builtin_ia32_pmaxud128(v4si,v4si) | pmaxud |
SE | abs.s | __m128i _mm_abs_epi32(__m128i) | v4si __builtin_ia32_pabsd128(v4si) | pabsd |
logical | ||||
S2 | ∧ | __m128i _mm_and_si128(__m128i,__m128i) | v2di __builtin_ia32_pand128(v2di,v2di) | pand |
S2 | ¬∧ | __m128i _mm_andnot_si128(__m128i,__m128i) | v2di __builtin_ia32_pandn128(v2di,v2di) | pandn |
S2 | ∨ | __m128i _mm_or_si128(__m128i,__m128i) | v2di __builtin_ia32_por128(v2di,v2di) | por |
S2 | ⊕ | __m128i _mm_xor_si128(__m128i,__m128i) | v2di __builtin_ia32_pxor128(v2di,v2di) | pxor |
shift | ||||
S2 | shift left | __m128i _mm_sll_epi32(__m128i,__m128i count) | v4si __builtin_ia32_pslld128(v4si,v4si) | pslld |
S2 | shift left imm | __m128i _mm_slli_epi32(__m128i,int imm8) | v4si __builtin_ia32_pslldi128(v4si,int) | pslld |
A2 | shift left variable | __m128i _mm_sllv_epi32(__m128i,__m128i count) | v4si __builtin_ia32_psllv4si(v4si,v4si) | vpsllvd |
S2 | shift right.s | __m128i _mm_sra_epi32(__m128i,__m128i count) | v4si __builtin_ia32_psrad128(v4si,v4si) | psrad |
S2 | shift right.s imm | __m128i _mm_srai_epi32(__m128i,int imm8) | v4si __builtin_ia32_psradi128(v4si,int) | psrad |
A2 | shift right.s variable | __m128i _mm_srav_epi32(__m128i,__m128i count) | v4si __builtin_ia32_psrav4si(v4si,v4si) | vpsravd |
S2 | shift right.u | __m128i _mm_srl_epi32(__m128i,__m128i count) | v4si __builtin_ia32_psrld128(v4si,v4si) | psrld |
S2 | shift right.u imm | __m128i _mm_srli_epi32(__m128i,int imm8) | v4si __builtin_ia32_psrldi128(v4si,int) | psrld |
A2 | shift right.u variable | __m128i _mm_srlv_epi32(__m128i,__m128i count) | v4si __builtin_ia32_psrlv4si(v4si,v4si) | vpsrlvd |
compare | ||||
S2 | = | __m128i _mm_cmpeq_epi32(__m128i,__m128i) | v4si __builtin_ia32_pcmpeqd128(v4si,v4si) | pcmpeqd |
S2 | > | __m128i _mm_cmpgt_epi32(__m128i,__m128i) | v4si __builtin_ia32_pcmpgtd128(v4si,v4si) | pcmpgtd |
S2 | < | __m128i _mm_cmplt_epi32(__m128i,__m128i) | (argswap) | pcmpgtd |
reduce | ||||
S2 | reduce+saturate.s ⟨b3:0,a3:0⟩→i16 |
__m128i _mm_packs_epi32(__m128i,__m128i) | v8hi __builtin_ia32_packssdw128(v4si,v4si) | packssdw |
S4 | reduce+saturate.u ⟨b3:0,a3:0⟩→u16 |
__m128i _mm_packus_epi32(__m128i,__m128i) | v8hi __builtin_ia32_packusdw128(v4si,v4si) | packusdw |
shuffle | ||||
A2 | dup ⟨a0,a0,a0,a0⟩ | __m128i _mm_broadcastd_epi32(__m128i) | v4si __builtin_ia32_pbroadcastd128(v4si) | vpbroadcastd |
S2 | interleave ⟨b1,a1,b0,a0⟩ | __m128i _mm_unpacklo_epi32(__m128i,__m128i) | v4si __builtin_ia32_punpckldq128(v4si,v4si) | punpckldq |
S2 | interleave ⟨b3,a3,b2,a2⟩ | __m128i _mm_unpackhi_epi32(__m128i,__m128i) | v4si __builtin_ia32_punpckhdq128(v4si,v4si) | punpckhdq |
A2 | blend ⟨imm8i ? bi : ai⟩ | __m128i _mm_blend_epi32(__m128i,__m128i,const int imm8) | v4si __builtin_ia32_pblendd128(v4si,v4si,int) | vpblendd |
S2 | shuffle ⟨al,ak,aj,ai⟩ | __m128i _mm_shuffle_epi32(__m128i,int imm8) | v4si __builtin_ia32_pshufd(v4si,int) | pshufd |
SSE(128 bit): int64 × 2
Fg | Function | Intel | Gnu | Asm |
---|---|---|---|---|
load | ||||
S2 | load.a8 ⟨0,m0⟩ | __m128i _mm_loadl_epi64(__m128i const*mem_addr) | (cast) | movq |
S | load.u ⟨0,m0⟩ | __m128i _mm_loadu_si64(void const*mem_addr) | v4sf __builtin_ia32_loadups(float*) | movq |
A2 | gather ofs:i32 ⟨mofsi⟩ |
__m128i _mm_i32gather_epi64(__int64 const*base_addr,__m128i vindex,const int scale) | v2di __builtin_ia32_gathersiv2di(0,int64 const*,v4si,~0,int) | vpgatherdq |
A2 | gather ofs:i32 ⟨di<0?mofsi:ai⟩ |
__m128i _mm_mask_i32gather_epi64(__m128i src,__int64 const*base_addr,__m128i vindex,__m128i mask,const int scale) | v2di __builtin_ia32_gathersiv2di(v2di,int64 const*,v4si,v2di,int) | vpgatherdq |
A2 | gather ofs:i64 ⟨mofsi⟩ |
__m128i _mm_i64gather_epi64(__int64 const*base_addr,__m128i vindex,const int scale) | v2di __builtin_ia32_gatherdiv2di(0,int64 const*,v2di,~0,int) | vpgatherqq |
A2 | gather ofs:i64 ⟨di<0?mofsi:ai⟩ |
__m128i _mm_mask_i64gather_epi64(__m128i src,__int64 const*base_addr,__m128i vindex,__m128i mask,const int scale) | v2di __builtin_ia32_gatherdiv2di(v2di,int64 const*,v2di,v2di,int) | vpgatherqq |
A2 | load ⟨bi<0 ? mi : 0⟩ | __m128i _mm_maskload_epi64(__int64 const*mem_addr,__m128i mask) | v2di __builtin_ia32_maskloadq(const v2di*,v2di) | vpmaskmovq |
store | ||||
S2 | store.a8 ⟨a0⟩ | void _mm_storel_epi64(__m128i*mem_addr,__m128i) | (assign) | movq |
S | store.u ⟨a0⟩ (gnu builtin shown is incorrect) |
void _mm_storeu_si64(void*mem_addr,__m128i) | void __builtin_ia32_storeups(float*,v4sf) | movq |
A2 | store ⟨mi ≔ bi if ci<0⟩ | void _mm_maskstore_epi64(__int64*mem_addr,__m128i mask,__m128i) | void __builtin_ia32_maskstoreq(v2di*,v2di,v2di) | vpmaskmovq |
convert | ||||
S2 | create ⟨0,i64⟩ | __m128i _mm_cvtsi64_si128(__int64) __m128i _mm_cvtsi64x_si128(__int64) |
(cast) | movq |
S2 | create ⟨0,i64[0]⟩ | __m128i _mm_movpi64_epi64(__m64) | (cast) | movq2dq |
S4 | convert.s ← i8[1:0/16] | __m128i _mm_cvtepi8_epi64(__m128i) | v2di __builtin_ia32_pmovsxbq128(v16qi) | pmovsxbq |
S4 | convert.u ← u8[1:0/16] | __m128i _mm_cvtepu8_epi64(__m128i) | v2di __builtin_ia32_pmovzxbq128(v16qi) | pmovzxbq |
S4 | convert.s ← i16[1:0/8] | __m128i _mm_cvtepi16_epi64(__m128i) | v2di __builtin_ia32_pmovsxwq128(v8hi) | pmovsxwq |
S4 | convert.u ← u16[1:0/8] | __m128i _mm_cvtepu16_epi64(__m128i) | v2di __builtin_ia32_pmovzxwq128(v8hi) | pmovzxwq |
S4 | convert.s ← i32[1:0/4] | __m128i _mm_cvtepi32_epi64(__m128i) | v2di __builtin_ia32_pmovsxdq128(v4si) | pmovsxdq |
S4 | convert.u ← u32[1:0/4] | __m128i _mm_cvtepu32_epi64(__m128i) | v2di __builtin_ia32_pmovzxdq128(v4si) | pmovzxdq |
insert | ||||
S4 | insert ai i64 | __m128i _mm_insert_epi64(__m128i,__int64 i,const int imm8) | v2di __builtin_ia32_vec_set_v2di(v2di,int64,int) | pinsrq |
extract | ||||
S2 | extract a0 i64 | __int64 _mm_cvtsi128_si64(__m128i) __int64 _mm_cvtsi128_si64x(__m128i) |
(cast) | movq |
S2 | extract a0 i64[1] | __m64 _mm_movepi64_pi64(__m128i) | (cast) | movdq2q |
S2 | extract ⟨0,a0⟩ i64[2] | __m128i _mm_move_epi64(__m128i) | v2di __builtin_ia32_movq128(v2di) | movq |
S4 | extract ai i64 | __int64 _mm_extract_epi64(__m128i,const int imm8) | int64 __builtin_ia32_vec_ext_v2di(v2di,int) | pextrq |
arithmetic | ||||
S2 | add | __m128i _mm_add_epi64(__m128i,__m128i) | v2di __builtin_ia32_paddq(v2di,v2di) v2di __builtin_ia32_paddq128(v2di,v2di) |
paddq |
S2 | sub | __m128i _mm_sub_epi64(__m128i,__m128i) | v2di __builtin_ia32_psubq(v2di,v2di) v2di __builtin_ia32_psubq128(v2di,v2di) |
psubq |
logical | ||||
S2 | ∧ | __m128i _mm_and_si128(__m128i,__m128i) | v2di __builtin_ia32_pand128(v2di,v2di) | pand |
S2 | ¬∧ | __m128i _mm_andnot_si128(__m128i,__m128i) | v2di __builtin_ia32_pandn128(v2di,v2di) | pandn |
S2 | ∨ | __m128i _mm_or_si128(__m128i,__m128i) | v2di __builtin_ia32_por128(v2di,v2di) | por |
S2 | ⊕ | __m128i _mm_xor_si128(__m128i,__m128i) | v2di __builtin_ia32_pxor128(v2di,v2di) | pxor |
shift | ||||
S2 | shift left | __m128i _mm_sll_epi64(__m128i,__m128i count) | v2di __builtin_ia32_psllq128(v2di,v2di) | psllq |
S2 | shift left imm | __m128i _mm_slli_epi64(__m128i,int imm8) | v2di __builtin_ia32_psllqi128(v2di,int) v2di __builtin_ia32_pslldqi128(v2di,int) |
psllq |
A2 | shift left variable | __m128i _mm_sllv_epi64(__m128i,__m128i count) | v2di __builtin_ia32_vpsllvq128(v2di,v2di) | vpsllvq |
S2 | shift right.u | __m128i _mm_srl_epi64(__m128i,__m128i count) | v2di __builtin_ia32_psrlq128(v2di,v2di) | psrlq |
S2 | shift right.u imm | __m128i _mm_srli_epi64(__m128i,int imm8) | v2di __builtin_ia32_psrlqi128(v2di,int) v2di __builtin_ia32_psrldqi128(v2di,int) |
psrlq |
A2 | shift right.u variable | __m128i _mm_srlv_epi64(__m128i,__m128i count) | v2di __builtin_ia32_vpsrlvq128(v2di,v2di) | vpsrlvq |
compare | ||||
S4 | = | __m128i _mm_cmpeq_epi64(__m128i,__m128i) | (==) | pcmpeqq |
shuffle | ||||
A2 | dup ⟨a0,a0⟩ | __m128i _mm_broadcastq_epi64(__m128i) | v2di __builtin_ia32_pbroadcastq128(v2di) | vpbroadcastq |
S2 | interleave ⟨b0,a0⟩ | __m128i _mm_unpacklo_epi64(__m128i,__m128i) | v2di __builtin_ia32_punpcklqdq128(v2di,v2di) | punpcklqdq |
S2 | interleave ⟨b1,a1⟩ | __m128i _mm_unpackhi_epi64(__m128i,__m128i) | v2di __builtin_ia32_punpckhqdq128(v2di,v2di) | punpckhqdq |
SSE(128 bit): int128
Fg | Function | Intel | Gnu | Asm |
---|---|---|---|---|
constant | ||||
S2 | const(0) | __m128i _mm_setzero_si128() | (cast) | pxor |
S2 | const(NaN) | __m128i _mm_undefined_si128(void) | (cast) | - |
load | ||||
S2 | load.a16 | __m128i _mm_load_si128(__m128i const*mem_addr) | (assign) | movdqa |
S4 | load.a16.nt | __m128i _mm_stream_load_si128(__m128i * mem_addr) | v2di __builtin_ia32_movntdqa(v2di*) | movntdqa |
S3 | load.u | __m128i _mm_lddqu_si128(__m128i const*mem_addr) | v16qi __builtin_ia32_lddqu(char const*) | lddqu |
store | ||||
S2 | store.a16 | void _mm_store_si128(__m128i*mem_addr,__m128i) | (assign) | movdqa |
S2 | store.a16.nt | void _mm_stream_si128(__m128i*mem_addr,__m128i) | void __builtin_ia32_movntdq(v2di*,v2di) | movntdq |
logical | ||||
S2 | ∧ | __m128i _mm_and_si128(__m128i,__m128i) | v2di __builtin_ia32_pand128(v2di,v2di) | pand |
S2 | ¬∧ | __m128i _mm_andnot_si128(__m128i,__m128i) | v2di __builtin_ia32_pandn128(v2di,v2di) | pandn |
S2 | ∨ | __m128i _mm_or_si128(__m128i,__m128i) | v2di __builtin_ia32_por128(v2di,v2di) | por |
S2 | ⊕ | __m128i _mm_xor_si128(__m128i,__m128i) | v2di __builtin_ia32_pxor128(v2di,v2di) | pxor |
shift | ||||
S2 | shift bytes left | __m128i _mm_slli_si128(__m128i,int imm8) __m128i _mm_bslli_si128(__m128i,int imm8) |
(op) | pslldq |
S2 | shift bytes right.u | __m128i _mm_srli_si128(__m128i,int imm8) __m128i _mm_bsrli_si128(__m128i,int imm8) |
(op) | psrldq |
compare | ||||
S4 | a≊0 (masked by b) | int _mm_testz_si128(__m128i,__m128i) | int __builtin_ia32_ptestz128(v2di,v2di) | ptest |
S4 | a≊1 (masked by b) | int _mm_testc_si128(__m128i,__m128i) | int __builtin_ia32_ptestc128(v2di,v2di) | ptest |
S4 | a≇0 ∧ a≇1 (masked by b) | int _mm_testnzc_si128(__m128i,__m128i) | int __builtin_ia32_ptestnzc128(v2di,v2di) | ptest |
S4 | a≊0 (masked by b) | int _mm_test_all_zeros(__m128i,__m128i mask) | int __builtin_ia32_ptestz128(v2di,v2di) | ptest |
S4 | a≇0 ∧ a≇1 (masked by b) | int _mm_test_mix_ones_zeros(__m128i,__m128i mask) | int __builtin_ia32_ptestnzc128(v2di,v2di) | ptest |
SSE(128 bit): float × 4
Fg | Function | Intel | Gnu | Asm |
---|---|---|---|---|
constant | ||||
S | const(0) | __m128 _mm_setzero_ps(void) | (cast) | xorps |
S | const(NaN) | __m128 _mm_undefined_ps(void) | (cast) | - |
load | ||||
S | load.a16 | __m128 _mm_load_ps(float const*mem_addr) | v4sf __builtin_ia32_loadaps(float*) | movaps |
S | load.u | __m128 _mm_loadu_ps(float const*mem_addr) | v4sf __builtin_ia32_loadups(float*) | movups |
S | load.u ⟨0,0,0,m0⟩ | __m128 _mm_load_ss(float const*mem_addr) | v4sf __builtin_ia32_loadss(float*) | movss |
A | load.u ⟨m0,m0,m0,m0⟩ | __m128 _mm_broadcast_ss(float const*mem_addr) | v4sf __builtin_ia32_vbroadcastss(pcfloat) | vbroadcastss |
S | load.u ⟨a3,a2,m1,m0⟩ | __m128 _mm_loadl_pi(__m128,__m64 const*mem_addr) | v4sf __builtin_ia32_loadlps(v4sf,const v2sf*) | movlps |
S | load.u ⟨m1,m0,a1,a0⟩ | __m128 _mm_loadh_pi(__m128,__m64 const*mem_addr) | v4sf __builtin_ia32_loadhps(v4sf,const v2sf*) | movhps |
A2 | gather ofs:i32 ⟨mofsi⟩ |
__m128 _mm_i32gather_ps(float const*base_addr,__m128i vindex,const int scale) | v4sf __builtin_ia32_gathersiv4sf(0,float const*,v4si,~0,int) | vgatherdps |
A2 | gather ofs:i32 ⟨di<0?mofsi:ai⟩ |
__m128 _mm_mask_i32gather_ps(__m128 src,float const*base_addr,__m128i vindex,__m128 mask,const int scale) | v4sf __builtin_ia32_gathersiv4sf(v4sf,float const*,v4si,v4si,int) | vgatherdps |
A2 | gather ofs:i64 ⟨mofsi⟩ |
__m128 _mm_i64gather_ps(float const*base_addr,__m128i vindex,const int scale) | v4sf __builtin_ia32_gatherdiv4sf(0,float const*,v2di,~0,int) | vgatherqps |
A2 | gather ofs:i64 ⟨di<0?mofsi:ai⟩ |
__m128 _mm_mask_i64gather_ps(__m128 src,float const*base_addr,__m128i vindex,__m128 mask,const int scale) | v4sf __builtin_ia32_gatherdiv4sf(v4sf,float const*,v2di,v4sf,int) | vgatherqps |
A | load ⟨bi<0 ? mi : 0⟩ | __m128 _mm_maskload_ps(float const*mem_addr,__m128i mask) | v4sf __builtin_ia32_maskloadps(pcv4sf,v4sf) | vmaskmovps |
store | ||||
S | store.a16 | void _mm_store_ps(float*mem_addr,__m128) | void __builtin_ia32_storeaps(float*,v4sf) | movaps |
S | store.a16.nt | void _mm_stream_ps(float*mem_addr,__m128) | void __builtin_ia32_movntps(float*,v4sf) | movntps |
S | store.u | void _mm_storeu_ps(float*mem_addr,__m128) | void __builtin_ia32_storeups(float*,v4sf) | movups |
S | store.u ⟨a0⟩ | void _mm_store_ss(float*mem_addr,__m128) | void __builtin_ia32_storess(float*,v4sf) | movss |
S | store.u ⟨a1,a0⟩ | void _mm_storel_pi(__m64*mem_addr,__m128) | void __builtin_ia32_storelps(v2sf*,v4sf) | movlps |
S | store.u ⟨a3,a2⟩ | void _mm_storeh_pi(__m64*mem_addr,__m128) | void __builtin_ia32_storehps(v2sf*,v4sf) | movhps |
A | store ⟨mi ≔ bi if ci<0⟩ | void _mm_maskstore_ps(float*mem_addr,__m128i mask,__m128) | void __builtin_ia32_maskstoreps(pv4sf,v4sf,v4sf) | vmaskmovps |
insert | ||||
S | insert a0 i32 | __m128 _mm_cvt_si2ss(__m128,int) __m128 _mm_cvtsi32_ss(__m128,int) |
v4sf __builtin_ia32_cvtsi2ss(v4sf,int) | cvtsi2ss |
S | insert a0 i64 | __m128 _mm_cvtsi64_ss(__m128,__int64) | v4sf __builtin_ia32_cvtsi642ss(v4sf,int64) | cvtsi2ss |
S | insert a1:0 i32[2] | __m128 _mm_cvt_pi2ps(__m128,__m64) __m128 _mm_cvtpi32_ps(__m128,__m64) |
v4sf __builtin_ia32_cvtpi2ps(v4sf,v2si) | cvtpi2ps |
S | insert a0 f32[0/4] | __m128 _mm_move_ss(__m128,__m128) | v4sf __builtin_ia32_movss(v4sf,v4sf) | movss |
S2 | insert a0 f64[0/2] | __m128 _mm_cvtsd_ss(__m128,__m128d) | v4sf __builtin_ia32_cvtsd2ss(v4sf,v2df) | cvtsd2ss |
S4 | insert ak≔bl, ai≔0 if imm8i | __m128 _mm_insert_ps(__m128,__m128,const int imm8) | v4sf __builtin_ia32_insertps128(v4sf,v4sf,int) | insertps |
extract | ||||
S | extract a0 i32 | int _mm_cvt_ss2si(__m128) int _mm_cvtss_si32(__m128) |
int __builtin_ia32_cvtss2si(v4sf) | cvtss2si |
S | extract a0 i64 | __int64 _mm_cvtss_si64(__m128) | (cast) | cvtss2si |
S | extract a0 f32 | float _mm_cvtss_f32(__m128) | (cast) | movss |
S4 | extract ai
f32 ⚠returns float bits |
int _mm_extract_ps(__m128,const int imm8) | int __builtin_ia32_vec_ext_v4sf(v4sf,int) | extractps |
S | extract a1:0 i32[2] | __m64 _mm_cvt_ps2pi(__m128) __m64 _mm_cvtps_pi32(__m128) |
v2si __builtin_ia32_cvtps2pi(v4sf) | cvtps2pi |
S | extract.t a0 i32 | int _mm_cvtt_ss2si(__m128) int _mm_cvttss_si32(__m128) |
int __builtin_ia32_cvttss2si(v4sf) | cvttss2si |
S | extract.t a0 i64 | __int64 _mm_cvttss_si64(__m128) | (cast) | cvttss2si |
S | extract.t a1:0 i32[2] | __m64 _mm_cvtt_ps2pi(__m128) __m64 _mm_cvttps_pi32(__m128) |
v2si __builtin_ia32_cvttps2pi(v4sf) | cvttps2pi |
S | extract signbits | int _mm_movemask_ps(__m128) | int __builtin_ia32_movmskps(v4sf) | movmskps |
add | ||||
S | add | __m128 _mm_add_ps(__m128,__m128) | v4sf __builtin_ia32_addps(v4sf,v4sf) | addps |
S | add[0] | __m128 _mm_add_ss(__m128,__m128) | v4sf __builtin_ia32_addss(v4sf,v4sf) | addss |
S3 | add ⟨b2+b3,b0+b1,a2+a3,a0+a1⟩ |
__m128 _mm_hadd_ps(__m128,__m128) | v4sf __builtin_ia32_haddps(v4sf,v4sf) | haddps |
sub | ||||
S | sub | __m128 _mm_sub_ps(__m128,__m128) | v4sf __builtin_ia32_subps(v4sf,v4sf) | subps |
S | sub[0] | __m128 _mm_sub_ss(__m128,__m128) | v4sf __builtin_ia32_subss(v4sf,v4sf) | subss |
S3 | sub ⟨b2–b3,b0–b1,a2–a3,a0–a1⟩ |
__m128 _mm_hsub_ps(__m128,__m128) | v4sf __builtin_ia32_hsubps(v4sf,v4sf) | hsubps |
S3 | sub @even, add @odd | __m128 _mm_addsub_ps(__m128,__m128) | v4sf __builtin_ia32_addsubps(v4sf,v4sf) | addsubps |
mul | ||||
S | mul | __m128 _mm_mul_ps(__m128,__m128) | v4sf __builtin_ia32_mulps(v4sf,v4sf) | mulps |
S | mul[0] | __m128 _mm_mul_ss(__m128,__m128) | v4sf __builtin_ia32_mulss(v4sf,v4sf) | mulss |
S4 | scalprod ∑(ai×bi×{1,0}),di∊{∑,0} | __m128 _mm_dp_ps(__m128,__m128,const int imm8) | v4sf __builtin_ia32_dpps(v4sf,v4sf,int) | dpps |
div | ||||
S | div | __m128 _mm_div_ps(__m128,__m128) | v4sf __builtin_ia32_divps(v4sf,v4sf) | divps |
S | div[0] | __m128 _mm_div_ss(__m128,__m128) | v4sf __builtin_ia32_divss(v4sf,v4sf) | divss |
math | ||||
S | min | __m128 _mm_min_ps(__m128,__m128) | v4sf __builtin_ia32_minps(v4sf,v4sf) | minps |
S | min [0] | __m128 _mm_min_ss(__m128,__m128) | v4sf __builtin_ia32_minss(v4sf,v4sf) | minss |
S | max | __m128 _mm_max_ps(__m128,__m128) | v4sf __builtin_ia32_maxps(v4sf,v4sf) | maxps |
S | max [0] | __m128 _mm_max_ss(__m128,__m128) | v4sf __builtin_ia32_maxss(v4sf,v4sf) | maxss |
S4 | floor() | __m128 _mm_floor_ps(__m128) | v4sf _mm_round_ps(v4sf,_MM_FROUND_FLOOR) | roundps |
S4 | floor() [0] | __m128 _mm_floor_ss(__m128,__m128) | v4sf _mm_round_ss(v4sf,_MM_FROUND_FLOOR) | roundss |
S4 | ceil() | __m128 _mm_ceil_ps(__m128) | v4sf _mm_round_ps(v4sf,_MM_FROUND_CEIL) | roundps |
S4 | ceil() [0] | __m128 _mm_ceil_ss(__m128,__m128) | v4sf _mm_round_ss(v4sf,_MM_FROUND_CEIL) | roundss |
S4 | round() | __m128 _mm_round_ps(__m128,int rounding) | v4sf _mm_round_ps(v4sf,int) | roundps |
S4 | round() [0] | __m128 _mm_round_ss(__m128,__m128,int rounding) | v4sf _mm_round_ss(v4sf,int) | roundss |
S | 1/x | __m128 _mm_rcp_ps(__m128) | v4sf __builtin_ia32_rcpps(v4sf) | rcpps |
S | 1/x [0] | __m128 _mm_rcp_ss(__m128) | v4sf __builtin_ia32_rcpss(v4sf) | rcpss |
S | √x | __m128 _mm_sqrt_ps(__m128) | v4sf __builtin_ia32_sqrtps(v4sf) | sqrtps |
S | √x [0] | __m128 _mm_sqrt_ss(__m128) | v4sf __builtin_ia32_sqrtss(v4sf) | sqrtss |
S | 1/√x | __m128 _mm_rsqrt_ps(__m128) | v4sf __builtin_ia32_rsqrtps(v4sf) | rsqrtps |
S | 1/√x [0] | __m128 _mm_rsqrt_ss(__m128) | v4sf __builtin_ia32_rsqrtss(v4sf) | rsqrtss |
logical | ||||
S | ∧ | __m128 _mm_and_ps(__m128,__m128) | v4sf __builtin_ia32_andps(v4sf,v4sf) | andps |
S | ¬∧ | __m128 _mm_andnot_ps(__m128,__m128) | v4sf __builtin_ia32_andnps(v4sf,v4sf) | andnps |
S | ∨ | __m128 _mm_or_ps(__m128,__m128) | v4sf __builtin_ia32_orps(v4sf,v4sf) | orps |
S | ⊕ | __m128 _mm_xor_ps(__m128,__m128) | v4sf __builtin_ia32_xorps(v4sf,v4sf) | xorps |
compare | ||||
S | = | __m128 _mm_cmpeq_ps(__m128,__m128) | v4si __builtin_ia32_cmpeqps(v4sf,v4sf) | cmpps |
S | = [0] | __m128 _mm_cmpeq_ss(__m128,__m128) | v4si __builtin_ia32_cmpeqss(v4sf,v4sf) | cmpss |
S | = [0] c | int _mm_comieq_ss(__m128,__m128) | int __builtin_ia32_comieq(v4sf,v4sf) | comiss |
S | = [0] u | int _mm_ucomieq_ss(__m128,__m128) | int __builtin_ia32_ucomieq(v4sf,v4sf) | ucomiss |
S | ≥ | __m128 _mm_cmpge_ps(__m128,__m128) | v4si __builtin_ia32_cmpgeps(v4sf,v4sf) | cmpps |
S | ≥ [0] | __m128 _mm_cmpge_ss(__m128,__m128) | (argswap) | cmpss |
S | ≥ [0] c | int _mm_comige_ss(__m128,__m128) | int __builtin_ia32_comige(v4sf,v4sf) | comiss |
S | ≥ [0] u | int _mm_ucomige_ss(__m128,__m128) | int __builtin_ia32_ucomige(v4sf,v4sf) | ucomiss |
S | > | __m128 _mm_cmpgt_ps(__m128,__m128) | v4si __builtin_ia32_cmpgtps(v4sf,v4sf) | cmpps |
S | > [0] | __m128 _mm_cmpgt_ss(__m128,__m128) | (argswap) | cmpss |
S | > [0] c | int _mm_comigt_ss(__m128,__m128) | int __builtin_ia32_comigt(v4sf,v4sf) | comiss |
S | > [0] u | int _mm_ucomigt_ss(__m128,__m128) | int __builtin_ia32_ucomigt(v4sf,v4sf) | ucomiss |
S | ≤ | __m128 _mm_cmple_ps(__m128,__m128) | v4si __builtin_ia32_cmpleps(v4sf,v4sf) | cmpps |
S | ≤ [0] | __m128 _mm_cmple_ss(__m128,__m128) | v4si __builtin_ia32_cmpless(v4sf,v4sf) | cmpss |
S | ≤ [0] c | int _mm_comile_ss(__m128,__m128) | int __builtin_ia32_comile(v4sf,v4sf) | comiss |
S | ≤ [0] u | int _mm_ucomile_ss(__m128,__m128) | int __builtin_ia32_ucomile(v4sf,v4sf) | ucomiss |
S | < | __m128 _mm_cmplt_ps(__m128,__m128) | v4si __builtin_ia32_cmpltps(v4sf,v4sf) | cmpps |
S | < [0] | __m128 _mm_cmplt_ss(__m128,__m128) | v4si __builtin_ia32_cmpltss(v4sf,v4sf) | cmpss |
S | < [0] c | int _mm_comilt_ss(__m128,__m128) | int __builtin_ia32_comilt(v4sf,v4sf) | comiss |
S | < [0] u | int _mm_ucomilt_ss(__m128,__m128) | int __builtin_ia32_ucomilt(v4sf,v4sf) | ucomiss |
S | ≠ | __m128 _mm_cmpneq_ps(__m128,__m128) | v4si __builtin_ia32_cmpneqps(v4sf,v4sf) | cmpps |
S | ≠ [0] | __m128 _mm_cmpneq_ss(__m128,__m128) | v4si __builtin_ia32_cmpneqss(v4sf,v4sf) | cmpss |
S | ≠ [0] c | int _mm_comineq_ss(__m128,__m128) | int __builtin_ia32_comineq(v4sf,v4sf) | comiss |
S | ≠ [0] u | int _mm_ucomineq_ss(__m128,__m128) | int __builtin_ia32_ucomineq(v4sf,v4sf) | ucomiss |
S | ≱ | __m128 _mm_cmpnge_ps(__m128,__m128) | v4si __builtin_ia32_cmpngeps(v4sf,v4sf) | cmpps |
S | ≱ [0] | __m128 _mm_cmpnge_ss(__m128,__m128) | (argswap) | cmpss |
S | ≯ | __m128 _mm_cmpngt_ps(__m128,__m128) | v4si __builtin_ia32_cmpngtps(v4sf,v4sf) | cmpps |
S | ≯ [0] | __m128 _mm_cmpngt_ss(__m128,__m128) | (argswap) | cmpss |
S | ≰ | __m128 _mm_cmpnle_ps(__m128,__m128) | v4si __builtin_ia32_cmpnleps(v4sf,v4sf) | cmpps |
S | ≰ [0] | __m128 _mm_cmpnle_ss(__m128,__m128) | v4si __builtin_ia32_cmpnless(v4sf,v4sf) | cmpss |
S | ≮ | __m128 _mm_cmpnlt_ps(__m128,__m128) | v4si __builtin_ia32_cmpnltps(v4sf,v4sf) | cmpps |
S | ≮ [0] | __m128 _mm_cmpnlt_ss(__m128,__m128) | v4si __builtin_ia32_cmpnltss(v4sf,v4sf) | cmpss |
S | ≶ | __m128 _mm_cmpord_ps(__m128,__m128) | v4si __builtin_ia32_cmpordps(v4sf,v4sf) | cmpps |
S | ≶ [0] | __m128 _mm_cmpord_ss(__m128,__m128) | v4si __builtin_ia32_cmpordss(v4sf,v4sf) | cmpss |
S | ≸ | __m128 _mm_cmpunord_ps(__m128,__m128) | v4si __builtin_ia32_cmpunordps(v4sf,v4sf) | cmpps |
S | ≸ [0] | __m128 _mm_cmpunord_ss(__m128,__m128) | v4si __builtin_ia32_cmpunordss(v4sf,v4sf) | cmpss |
A | compare(op) | __m128 _mm_cmp_ps(__m128,__m128,const int imm8) | v4sf __builtin_ia32_cmpps(v4sf,v4sf,int) | vcmpps |
A | compare(op) [0] | __m128 _mm_cmp_ss(__m128,__m128,const int imm8) | v4sf __builtin_ia32_cmpss(v4sf,v4sf,int) | vcmpss |
A | a≊0 (masked by b) | int _mm_testz_ps(__m128,__m128) | int __builtin_ia32_vtestzps(v4sf,v4sf) | vtestps |
A | a≊1 (masked by b) | int _mm_testc_ps(__m128,__m128) | int __builtin_ia32_vtestcps(v4sf,v4sf) | vtestps |
A | a≇0 ∧ a≇1 (masked by b) | int _mm_testnzc_ps(__m128,__m128) | int __builtin_ia32_vtestnzcps(v4sf,v4sf) | vtestps |
convert | ||||
S2 | convert ← i32[4] | __m128 _mm_cvtepi32_ps(__m128i) | v4sf __builtin_ia32_cvtdq2ps(v4si) | cvtdq2ps |
S2 | convert → i32[4] | __m128i _mm_cvtps_epi32(__m128) | v4si __builtin_ia32_cvtps2dq(v4sf) | cvtps2dq |
S2 | convert.t→ i32[4] | __m128i _mm_cvttps_epi32(__m128) | v4si __builtin_ia32_cvttps2dq(v4sf) | cvttps2dq |
shuffle | ||||
A2 | dup ⟨a0,…,a0⟩ | __m128 _mm_broadcastss_ps(__m128) | v4sf __builtin_ia32_vbroadcastss_ps(v4sf) | vbroadcastss |
S3 | dup ⟨a2,a2,a0,a0⟩ | __m128 _mm_moveldup_ps(__m128) | v4sf __builtin_ia32_movsldup(v4sf) | movsldup |
S3 | dup ⟨a3,a3,a1,a1⟩ | __m128 _mm_movehdup_ps(__m128) | v4sf __builtin_ia32_movshdup(v4sf) | movshdup |
S | interleave ⟨b1,a1,b0,a0⟩ | __m128 _mm_unpacklo_ps(__m128,__m128) | v4sf __builtin_ia32_unpcklps(v4sf,v4sf) | unpcklps |
S | interleave ⟨b3,a3,b2,a2⟩ | __m128 _mm_unpackhi_ps(__m128,__m128) | v4sf __builtin_ia32_unpckhps(v4sf,v4sf) | unpckhps |
S | interleave ⟨b1,b0,a1,a0⟩ | __m128 _mm_movelh_ps(__m128,__m128) | v4sf __builtin_ia32_movlhps(v4sf,v4sf) | movlhps |
S | interleave ⟨b3,b2,a3,a2⟩ | __m128 _mm_movehl_ps(__m128,__m128) | v4sf __builtin_ia32_movhlps(v4sf,v4sf) | movhlps |
S4 | blend ⟨imm8i ? bi : ai⟩ | __m128 _mm_blend_ps(__m128,__m128,const int imm8) | v4sf __builtin_ia32_blendps(v4sf,v4sf,int) | blendps |
S4 | blend ⟨ci<0 ? bi : ai⟩ | __m128 _mm_blendv_ps(__m128,__m128,__m128 mask) | v4sf __builtin_ia32_blendvps(v4sf,v4sf,v4sf) | blendvps |
S | shuffle ⟨bl,bk,aj,ai⟩ | __m128 _mm_shuffle_ps(__m128,__m128,unsigned int imm8) | v4sf __builtin_ia32_shufps(v4sf,v4sf,int) | shufps |
A | shuffle ⟨al,ak,aj,ai⟩ | __m128 _mm_permute_ps(__m128,int imm8) | v4sf __builtin_ia32_vpermilps(v4sf,int) | vpermilps |
A | shuffle ⟨ab3,ab2,ab1,ab0⟩ | __m128 _mm_permutevar_ps(__m128,__m128i) | v4sf __builtin_ia32_vpermilvarps(v4sf,v4si) | vpermilps |
SSE(128 bit): double × 2
Fg | Function | Intel | Gnu | Asm |
---|---|---|---|---|
constant | ||||
S2 | const(0) | __m128d _mm_setzero_pd(void) | (cast) | xorpd |
S2 | const(NaN) | __m128d _mm_undefined_pd(void) | (cast) | - |
load | ||||
S2 | load.a16 | __m128d _mm_load_pd(double const*mem_addr) | (assign) | movapd |
S2 | load.u | __m128d _mm_loadu_pd(double const*mem_addr) | v2df __builtin_ia32_loadupd(double*) | movupd |
S2 | load.u ⟨0,m0⟩ | __m128d _mm_load_sd(double const*mem_addr) | (assign) | movsd |
S3 | load ⟨m0,m0⟩ | __m128d _mm_loaddup_pd(double const*mem_addr) | v2df __builtin_ia32_loadddup(double const*) | movddup |
S2 | load.u ⟨a1,m0⟩ | __m128d _mm_loadl_pd(__m128d,double const*mem_addr) | v2df __builtin_ia32_loadlpd(v2df,double const*) | movlpd |
S2 | load.u ⟨m0,a0⟩ | __m128d _mm_loadh_pd(__m128d,double const*mem_addr) | v2df __builtin_ia32_loadhpd(v2df,double const*) | movhpd |
A2 | gather ofs:i32 ⟨mofsi⟩ |
__m128d _mm_i32gather_pd(double const*base_addr,__m128i vindex,const int scale) | v2df __builtin_ia32_gathersiv2df(0,double const*,v4si,~0,int) | vgatherdpd |
A2 | gather ofs:i32 ⟨di<0?mofsi:ai⟩ |
__m128d _mm_mask_i32gather_pd(__m128d src,double const*base_addr,__m128i vindex,__m128d mask,const int scale) | v2df __builtin_ia32_gathersiv2df(v2df,double const*,v4si,v2df,int) | vgatherdpd |
A2 | gather ofs:i64 ⟨mofsi⟩ |
__m128d _mm_i64gather_pd(double const*base_addr,__m128i vindex,const int scale) | v2df __builtin_ia32_gatherdiv2df(0,double const*,v2di,~0,int) | vgatherqpd |
A2 | gather ofs:i64 ⟨di<0?mofsi:ai⟩ |
__m128d _mm_mask_i64gather_pd(__m128d src,double const*base_addr,__m128i vindex,__m128d mask,const int scale) | v2df __builtin_ia32_gatherdiv2df(v2df,double const*,v2di,v2df,int) | vgatherqpd |
A | load ⟨bi<0 ? mi : 0⟩ | __m128d _mm_maskload_pd(double const*mem_addr,__m128i mask) | v2df __builtin_ia32_maskloadpd(pcv2df,v2df) | vmaskmovpd |
store | ||||
S2 | store.a16 | void _mm_store_pd(double*mem_addr,__m128d) | (assign) | movapd |
S2 | store.a16.nt | void _mm_stream_pd(double*mem_addr,__m128d) | void __builtin_ia32_movntpd(double*,v2df) void __builtin_ia32_movntdq(v2df*,v2df) |
movntpd |
S2 | store.u | void _mm_storeu_pd(double*mem_addr,__m128d) | void __builtin_ia32_storeupd(double*,v2df) | movupd |
S2 | store.u a0 | void _mm_store_sd(double*mem_addr,__m128d) | *(double*)=v2df[0] | movsd |
S2 | store.a a0 | void _mm_storel_pd(double*mem_addr,__m128d) | (assign) | movlpd |
S2 | store.a a1 | void _mm_storeh_pd(double*mem_addr,__m128d) | (assign) | movhpd |
A | store ⟨mi ≔ bi if ci<0⟩ | void _mm_maskstore_pd(double*mem_addr,__m128i mask,__m128d) | void __builtin_ia32_maskstorepd(pv2df,v2df,v2df) | vmaskmovpd |
convert | ||||
S2 | create ← i32[2] | __m128d _mm_cvtpi32_pd(__m64) | v2df __builtin_ia32_cvtpi2pd(v2si) | cvtpi2pd |
S2 | create ← i32[0:1/4] | __m128d _mm_cvtepi32_pd(__m128i) | v2df __builtin_ia32_cvtdq2pd(v4si) | cvtdq2pd |
S2 | create ← f32[0:1/4] | __m128d _mm_cvtps_pd(__m128) | v2df __builtin_ia32_cvtps2pd(v4sf) | cvtps2pd |
S2 | convert → i32[2] | __m64 _mm_cvtpd_pi32(__m128d) | v2si __builtin_ia32_cvtpd2pi(v2df) | cvtpd2pi |
S2 | convert.t → i32[2] | __m64 _mm_cvttpd_pi32(__m128d) | v2si __builtin_ia32_cvttpd2pi(v2df) | cvttpd2pi |
S2 | convert → ⟨0,0,a1,a0⟩ i32[4] | __m128i _mm_cvtpd_epi32(__m128d) | v4si __builtin_ia32_cvtpd2dq(v2df) | cvtpd2dq |
S2 | convert → ⟨0,0,a1,a0⟩ f32[4] | __m128 _mm_cvtpd_ps(__m128d) | v4sf __builtin_ia32_cvtpd2ps(v2df) | cvtpd2ps |
S2 | convert.t → ⟨0,0,a1,a0⟩ i32[4] | __m128i _mm_cvttpd_epi32(__m128d) | v4si __builtin_ia32_cvttpd2dq(v2df) | cvttpd2dq |
insert | ||||
S2 | insert a0 i32 | __m128d _mm_cvtsi32_sd(__m128d,int) | v2df __builtin_ia32_cvtsi2sd(v2df,int) | cvtsi2sd |
S2 | insert a0 i64 | __m128d _mm_cvtsi64_sd(__m128d,__int64) __m128d _mm_cvtsi64x_sd(__m128d,__int64) |
v2df __builtin_ia32_cvtsi642sd(v2df,long long) | cvtsi2sd |
S2 | insert a0 f32[0/4] | __m128d _mm_cvtss_sd(__m128d,__m128) | v2df __builtin_ia32_cvtss2sd(v2df,v4sf) | cvtss2sd |
S2 | insert a0 f64[0/2] | __m128d _mm_move_sd(__m128d,__m128d) | v2df __builtin_ia32_movsd(v2df,v2df) | movsd |
extract | ||||
S2 | extract a0 i32 | int _mm_cvtsd_si32(__m128d) |
int __builtin_ia32_cvtsd2si(v2df) | cvtsd2si |
S2 | extract a0 i64 | __int64 _mm_cvtsd_si64(__m128d) __int64 _mm_cvtsd_si64x(__m128d) |
long long __builtin_ia32_cvtsd2si64(v2df) | cvtsd2si |
S2 | extract.t a0 i32 | int _mm_cvttsd_si32(__m128d) | int __builtin_ia32_cvttsd2si(v2df) | cvttsd2si |
S2 | extract.t a0 i64 | __int64 _mm_cvttsd_si64(__m128d) __int64 _mm_cvttsd_si64x(__m128d) |
long long __builtin_ia32_cvttsd2si64(v2df) | cvttsd2si |
S2 | extract a0 f64 | double _mm_cvtsd_f64(__m128d) | (cast) | movsd |
S2 | extract signbits | int _mm_movemask_pd(__m128d) | int __builtin_ia32_movmskpd(v2df) | movmskpd |
add | ||||
S2 | add | __m128d _mm_add_pd(__m128d,__m128d) | v2df __builtin_ia32_addpd(v2df,v2df) | addpd |
S2 | add[0] | __m128d _mm_add_sd(__m128d,__m128d) | v2df __builtin_ia32_addsd(v2df,v2df) | addsd |
S3 | add ⟨b0+b1,a0+a1⟩ | __m128d _mm_hadd_pd(__m128d,__m128d) | v2df __builtin_ia32_haddpd(v2df,v2df) | haddpd |
sub | ||||
S2 | sub | __m128d _mm_sub_pd(__m128d,__m128d) | v2df __builtin_ia32_subpd(v2df,v2df) | subpd |
S2 | sub[0] | __m128d _mm_sub_sd(__m128d,__m128d) | v2df __builtin_ia32_subsd(v2df,v2df) | subsd |
S3 | sub ⟨b0–b1,a0–a1⟩ | __m128d _mm_hsub_pd(__m128d,__m128d) | v2df __builtin_ia32_hsubpd(v2df,v2df) | hsubpd |
S3 | sub @even, add @odd | __m128d _mm_addsub_pd(__m128d,__m128d) | v2df __builtin_ia32_addsubpd(v2df,v2df) | addsubpd |
mul | ||||
S2 | mul | __m128d _mm_mul_pd(__m128d,__m128d) | v2df __builtin_ia32_mulpd(v2df,v2df) | mulpd |
S2 | mul[0] | __m128d _mm_mul_sd(__m128d,__m128d) | v2df __builtin_ia32_mulsd(v2df,v2df) | mulsd |
div | ||||
S2 | div | __m128d _mm_div_pd(__m128d,__m128d) | v2df __builtin_ia32_divpd(v2df,v2df) | divpd |
S2 | div[0] | __m128d _mm_div_sd(__m128d,__m128d) | v2df __builtin_ia32_divsd(v2df,v2df) | divsd |
S4 | scalprod ∑(ai×bi×{1,0}),di∊{∑,0} | __m128d _mm_dp_pd(__m128d,__m128d,const int imm8) | v2df __builtin_ia32_dppd(v2df,v2df,int) | dppd |
math | ||||
S2 | min | __m128d _mm_min_pd(__m128d,__m128d) | v2df __builtin_ia32_minpd(v2df,v2df) | minpd |
S2 | min [0] | __m128d _mm_min_sd(__m128d,__m128d) | v2df __builtin_ia32_minsd(v2df,v2df) | minsd |
S2 | max | __m128d _mm_max_pd(__m128d,__m128d) | v2df __builtin_ia32_maxpd(v2df,v2df) | maxpd |
S2 | max [0] | __m128d _mm_max_sd(__m128d,__m128d) | v2df __builtin_ia32_maxsd(v2df,v2df) | maxsd |
S4 | floor() | __m128d _mm_floor_pd(__m128d) | v2df _mm_round_pd(v2df,_MM_FROUND_FLOOR) | roundpd |
S4 | floor() [0] | __m128d _mm_floor_sd(__m128d,__m128d) | v2df _mm_round_sd(v2df,_MM_FROUND_FLOOR) | roundsd |
S4 | ceil() | __m128d _mm_ceil_pd(__m128d) | v2df _mm_round_pd(v2df,_MM_FROUND_CEIL) | roundpd |
S4 | ceil() [0] | __m128d _mm_ceil_sd(__m128d,__m128d) | v2df _mm_round_sd(v2df,_MM_FROUND_CEIL) | roundsd |
S4 | round() | __m128d _mm_round_pd(__m128d,int rounding) | v2df _mm_round_pd(v2df,int) | roundpd |
S4 | round() [0] | __m128d _mm_round_sd(__m128d,__m128d,int rounding) | v2df _mm_round_sd(v2df,int) | roundsd |
S2 | √x | __m128d _mm_sqrt_pd(__m128d) | v2df __builtin_ia32_sqrtpd(v2df) | sqrtpd |
S2 | √x [0] | __m128d _mm_sqrt_sd(__m128d,__m128d) | v2df __builtin_ia32_sqrtsd(v2df) | sqrtsd |
logical | ||||
S2 | ∧ | __m128d _mm_and_pd(__m128d,__m128d) | v2df __builtin_ia32_andpd(v2df,v2df) | andpd |
S2 | ¬∧ | __m128d _mm_andnot_pd(__m128d,__m128d) | v2df __builtin_ia32_andnpd(v2df,v2df) | andnpd |
S2 | ∨ | __m128d _mm_or_pd(__m128d,__m128d) | v2df __builtin_ia32_orpd(v2df,v2df) | orpd |
S2 | ⊕ | __m128d _mm_xor_pd(__m128d,__m128d) | v2df __builtin_ia32_xorpd(v2df,v2df) | xorpd |
compare | ||||
S2 | = | __m128d _mm_cmpeq_pd(__m128d,__m128d) | v2df __builtin_ia32_cmpeqpd(v2df,v2df) | cmppd |
S2 | = [0] | __m128d _mm_cmpeq_sd(__m128d,__m128d) | v2df __builtin_ia32_cmpeqsd(v2df,v2df) | cmpsd |
S2 | = [0] c | int _mm_comieq_sd(__m128d,__m128d) | int __builtin_ia32_comisdeq(v2df,v2df) | comisd |
S2 | = [0] u | int _mm_ucomieq_sd(__m128d,__m128d) | int __builtin_ia32_ucomisdeq(v2df,v2df) | ucomisd |
S2 | ≥ | __m128d _mm_cmpge_pd(__m128d,__m128d) | v2df __builtin_ia32_cmpgepd(v2df,v2df) | cmppd |
S2 | ≥ [0] | __m128d _mm_cmpge_sd(__m128d,__m128d) | v2df __builtin_ia32_cmpgesd(v2df,v2df) | cmpsd |
S2 | ≥ [0] c | int _mm_comige_sd(__m128d,__m128d) | int __builtin_ia32_comisdge(v2df,v2df) | comisd |
S2 | ≥ [0] u | int _mm_ucomige_sd(__m128d,__m128d) | int __builtin_ia32_ucomisdge(v2df,v2df) | ucomisd |
S2 | > | __m128d _mm_cmpgt_pd(__m128d,__m128d) | v2df __builtin_ia32_cmpgtpd(v2df,v2df) | cmppd |
S2 | > [0] | __m128d _mm_cmpgt_sd(__m128d,__m128d) | v2df __builtin_ia32_cmpgtsd(v2df,v2df) | cmpsd |
S2 | > [0] c | int _mm_comigt_sd(__m128d,__m128d) | int __builtin_ia32_comisdgt(v2df,v2df) | comisd |
S2 | > [0] u | int _mm_ucomigt_sd(__m128d,__m128d) | int __builtin_ia32_ucomisdgt(v2df,v2df) | ucomisd |
S2 | ≤ | __m128d _mm_cmple_pd(__m128d,__m128d) | v2df __builtin_ia32_cmplepd(v2df,v2df) | cmppd |
S2 | ≤ [0] | __m128d _mm_cmple_sd(__m128d,__m128d) | v2df __builtin_ia32_cmplesd(v2df,v2df) | cmpsd |
S2 | ≤ [0] c | int _mm_comile_sd(__m128d,__m128d) | int __builtin_ia32_comisdle(v2df,v2df) | comisd |
S2 | ≤ [0] u | int _mm_ucomile_sd(__m128d,__m128d) | int __builtin_ia32_ucomisdle(v2df,v2df) | ucomisd |
S2 | < | __m128d _mm_cmplt_pd(__m128d,__m128d) | v2df __builtin_ia32_cmpltpd(v2df,v2df) | cmppd |
S2 | < [0] | __m128d _mm_cmplt_sd(__m128d,__m128d) | v2df __builtin_ia32_cmpltsd(v2df,v2df) | cmpsd |
S2 | < [0] c | int _mm_comilt_sd(__m128d,__m128d) | int __builtin_ia32_comisdlt(v2df,v2df) | comisd |
S2 | < [0] u | int _mm_ucomilt_sd(__m128d,__m128d) | int __builtin_ia32_ucomisdlt(v2df,v2df) | ucomisd |
S2 | ≠ | __m128d _mm_cmpneq_pd(__m128d,__m128d) | v2df __builtin_ia32_cmpneqpd(v2df,v2df) | cmppd |
S2 | ≠ [0] | __m128d _mm_cmpneq_sd(__m128d,__m128d) | v2df __builtin_ia32_cmpneqsd(v2df,v2df) | cmpsd |
S2 | ≠ [0] c | int _mm_comineq_sd(__m128d,__m128d) | int __builtin_ia32_comisdneq(v2df,v2df) | comisd |
S2 | ≠ [0] u | int _mm_ucomineq_sd(__m128d,__m128d) | int __builtin_ia32_ucomisdneq(v2df,v2df) | ucomisd |
S2 | ≱ | __m128d _mm_cmpnge_pd(__m128d,__m128d) | v2df __builtin_ia32_cmpngepd(v2df,v2df) | cmppd |
S2 | ≱ [0] | __m128d _mm_cmpnge_sd(__m128d,__m128d) | v2df __builtin_ia32_cmpngesd(v2df,v2df) | cmpsd |
S2 | ≯ | __m128d _mm_cmpngt_pd(__m128d,__m128d) | v2df __builtin_ia32_cmpngtpd(v2df,v2df) | cmppd |
S2 | ≯ [0] | __m128d _mm_cmpngt_sd(__m128d,__m128d) | v2df __builtin_ia32_cmpngtsd(v2df,v2df) | cmpsd |
S2 | ≰ | __m128d _mm_cmpnle_pd(__m128d,__m128d) | v2df __builtin_ia32_cmpnlepd(v2df,v2df) | cmppd |
S2 | ≰ [0] | __m128d _mm_cmpnle_sd(__m128d,__m128d) | v2df __builtin_ia32_cmpnlesd(v2df,v2df) | cmpsd |
S2 | ≮ | __m128d _mm_cmpnlt_pd(__m128d,__m128d) | v2df __builtin_ia32_cmpnltpd(v2df,v2df) | cmppd |
S2 | ≮ [0] | __m128d _mm_cmpnlt_sd(__m128d,__m128d) | v2df __builtin_ia32_cmpnltsd(v2df,v2df) | cmpsd |
S2 | ≶ | __m128d _mm_cmpord_pd(__m128d,__m128d) | v2df __builtin_ia32_cmpordpd(v2df,v2df) | cmppd |
S2 | ≶ [0] | __m128d _mm_cmpord_sd(__m128d,__m128d) | v2df __builtin_ia32_cmpordsd(v2df,v2df) | cmpsd |
S2 | ≸ | __m128d _mm_cmpunord_pd(__m128d,__m128d) | v2df __builtin_ia32_cmpunordpd(v2df,v2df) | cmppd |
S2 | ≸ [0] | __m128d _mm_cmpunord_sd(__m128d,__m128d) | v2df __builtin_ia32_cmpunordsd(v2df,v2df) | cmpsd |
A | compare(op) | __m128d _mm_cmp_pd(__m128d,__m128d,const int imm8) | v2df __builtin_ia32_cmppd(v2df,v2df,int) | vcmppd |
A | compare(op) [0] | __m128d _mm_cmp_sd(__m128d,__m128d,const int imm8) | v2df __builtin_ia32_cmpsd(v2df,v2df,int) | vcmpsd |
A | a≊0 (masked by b) | int _mm_testz_pd(__m128d,__m128d) | int __builtin_ia32_vtestzpd(v2df,v2df) | vtestpd |
A | a≊1 (masked by b) | int _mm_testc_pd(__m128d,__m128d) | int __builtin_ia32_vtestcpd(v2df,v2df) | vtestpd |
A | a≇0 ∧ a≇1 (masked by b) | int _mm_testnzc_pd(__m128d,__m128d) | int __builtin_ia32_vtestnzcpd(v2df,v2df) | vtestpd |
shuffle | ||||
S3 | dup ⟨a0,a0⟩ | __m128d _mm_movedup_pd(__m128d) | v2df __builtin_ia32_movddup(v2df) | movddup |
A2 | dup ⟨a0,a0⟩ (clr hi) | __m128d _mm_broadcastsd_pd(__m128d) | v2df __builtin_ia32_shufpd(v2df,v2df copy,0) | movddup |
S2 | interleave ⟨b0,a0⟩ | __m128d _mm_unpacklo_pd(__m128d,__m128d) | v2df __builtin_ia32_unpcklpd(v2df,v2df) | unpcklpd |
S2 | interleave ⟨b1,a1⟩ | __m128d _mm_unpackhi_pd(__m128d,__m128d) | v2df __builtin_ia32_unpckhpd(v2df,v2df) | unpckhpd |
S4 | blend ⟨imm8i ? bi : ai⟩ | __m128d _mm_blend_pd(__m128d,__m128d,const int imm8) | v2df __builtin_ia32_blendpd(v2df,v2df,int) | blendpd |
S4 | blend ⟨ci<0 ? bi : ai⟩ | __m128d _mm_blendv_pd(__m128d,__m128d,__m128d mask) | v2df __builtin_ia32_blendvpd(v2df,v2df,v2df) | blendvpd |
S2 | shuffle ⟨bj,ai⟩ | __m128d _mm_shuffle_pd(__m128d,__m128d,int imm8) | v2df __builtin_ia32_shufpd(v2df,v2df,int) | shufpd |
A | shuffle ⟨aj,ai⟩ | __m128d _mm_permute_pd(__m128d,int imm8) | v2df __builtin_ia32_vpermilpd(v2df,int) | vpermilpd |
A | shuffle ⟨abj,abi⟩ | __m128d _mm_permutevar_pd(__m128d,__m128i) | v2df __builtin_ia32_vpermilvarpd(v2df,v2di) | vpermilpd |
convert | ||||
A2 | convert.s ← i8[3:0/16] | __m256i _mm256_cvtepi8_epi64(__m128i) | v4di __builtin_ia32_pmovsxbq256(v16qi) | vpmovsxbq |
A2 | convert.u ← u8[3:0/16] | __m256i _mm256_cvtepu8_epi64(__m128i) | v4di __builtin_ia32_pmovzxbq256(v16qi) | vpmovzxbq |
A2 | convert.s ← i16[3:0/8] | __m256i _mm256_cvtepi16_epi64(__m128i) | v4di __builtin_ia32_pmovsxwq256(v8hi) | vpmovsxwq |
A2 | convert.u ← u16[3:0/8] | __m256i _mm256_cvtepu16_epi64(__m128i) | v4di __builtin_ia32_pmovzxwq256(v8hi) | vpmovzxwq |
A2 | convert.s ← i32[3:0/4] | __m256i _mm256_cvtepi32_epi64(__m128i) | v4di __builtin_ia32_pmovsxdq256(v4si) | vpmovsxdq |
A2 | convert.u ← u32[3:0/4] | __m256i _mm256_cvtepu32_epi64(__m128i) | v4di __builtin_ia32_pmovzxdq256(v4si) | vpmovzxdq |
arithmetic | ||||
A2 | add | __m256i _mm256_add_epi64(__m256i,__m256i) | (+) | vpaddq |
A2 | sub | __m256i _mm256_sub_epi64(__m256i,__m256i) | (-) | vpsubq |
shift | ||||
A2 | shift left | __m256i _mm256_sll_epi64(__m256i,__m128i count) | v4di __builtin_ia32_psllq256(v4di,v4di) | vpsllq |
A2 | shift left imm | __m256i _mm256_slli_epi64(__m256i,int imm8) | v4di __builtin_ia32_psllqi256(v4di,int) | vpsllq |
A2 | shift left variable | __m256i _mm256_sllv_epi64(__m256i,__m256i count) | v4di __builtin_ia32_psllv4di(v4di,v4di) | vpsllvq |
A2 | shift right | __m256i _mm256_srl_epi64(__m256i,__m128i count) | v4di __builtin_ia32_psrlq256(v4di,v4di) | vpsrlq |
A2 | shift right imm | __m256i _mm256_srli_epi64(__m256i,int imm8) | v4di __builtin_ia32_psrlqi256(v4di,int) | vpsrlq |
A2 | shift right variable | __m256i _mm256_srlv_epi64(__m256i,__m256i count) | v4di __builtin_ia32_psrlv4di(v4di,v4di) | vpsrlvq |
compare | ||||
A2 | = | __m256i _mm256_cmpeq_epi64(__m256i,__m256i) | (==) | vpcmpeqq |
A2 | > | __m256i _mm256_cmpgt_epi64(__m256i,__m256i) | (>) | vpcmpgtq |
shuffle | ||||
A2 | dup ⟨a0,a0,a0,a0⟩ | __m256i _mm256_broadcastq_epi64(__m128i) | v4di __builtin_ia32_pbroadcastq256(v2di) | vpbroadcastq |
A2 | interleave ⟨b2,a2,b0,a0⟩ | __m256i _mm256_unpacklo_epi64(__m256i,__m256i) | v4di __builtin_ia32_punpcklqdq256(v4di,v4di) | vpunpcklqdq |
A2 | interleave ⟨b3,a3,b1,a1⟩ | __m256i _mm256_unpackhi_epi64(__m256i,__m256i) | v4di __builtin_ia32_punpckhqdq256(v4di,v4di) | vpunpckhqdq |
A2 | shuffle ⟨al,ak,aj,ai⟩ | __m256i _mm256_permute4x64_epi64(__m256i,const int imm8) | v4di __builtin_ia32_permdi256(v4di,int) | vpermq |
AVX: 256 Bits
AVX(256 bit)
Fg | Function | Intel | Gnu | Asm |
---|---|---|---|---|
constant | ||||
A | const(0) | __m256i _mm256_setzero_si256(void) | (cast) | vpxor |
A | YMM[15:0]≔0 | void _mm256_zeroall(void) | void __builtin_ia32_vzeroall(void) | vzeroall |
A | YMM[15:0].hi≔0 | void _mm256_zeroupper(void) | void __builtin_ia32_vzeroupper(void) | vzeroupper |
cast | ||||
A | cast | __m128d _mm256_castpd256_pd128(__m256d) | ∅ | - |
A | cast | __m128i _mm256_castsi256_si128(__m256i) | ∅ | - |
A | cast | __m128 _mm256_castps256_ps128(__m256) | ∅ | - |
A | cast | __m256d _mm256_castpd128_pd256(__m128d) | ∅ | - |
A | cast | __m256d _mm256_castps_pd(__m256) | ∅ | - |
A | cast | __m256d _mm256_castsi256_pd(__m256i) | ∅ | - |
A | cast | __m256d _mm256_undefined_pd(void) | ∅ | - |
A | cast | __m256d _mm256_zextpd128_pd256(__m128d) | ∅ | - |
A | cast | __m256i _mm256_castpd_si256(__m256d) | ∅ | - |
A | cast | __m256i _mm256_castps_si256(__m256) | ∅ | - |
A | cast | __m256i _mm256_castsi128_si256(__m128i) | v8si __builtin_ia32_si256_si(v4si) | - |
A | cast | __m256i _mm256_undefined_si256(void) | ∅ | - |
A | cast | __m256i _mm256_zextsi128_si256(__m128i) | ∅ | - |
A | cast | __m256 _mm256_castpd_ps(__m256d) | ∅ | - |
A | cast | __m256 _mm256_castps128_ps256(__m128) | ∅ | - |
A | cast | __m256 _mm256_castsi256_ps(__m256i) | ∅ | - |
A | cast | __m256 _mm256_undefined_ps(void) | ∅ | - |
A | cast | __m256 _mm256_zextps128_ps256(__m128) | ∅ | - |
AVX(256 bit) int8 × 32
Fg | Function | Intel | Gnu | Asm |
---|---|---|---|---|
extract | ||||
A2 | extract signbits | int _mm256_movemask_epi8(__m256i) | int __builtin_ia32_pmovmskb256(v32qi) | vpmovmskb |
add | ||||
A2 | add | __m256i _mm256_add_epi8(__m256i,__m256i) | (+) | vpaddb |
A2 | adds.s | __m256i _mm256_adds_epi8(__m256i,__m256i) | v32qi __builtin_ia32_paddsb256(v32qi,v32qi) | vpaddsb |
A2 | adds.u | __m256i _mm256_adds_epu8(__m256i,__m256i) | v32qi __builtin_ia32_paddusb256(v32qi,v32qi) | vpaddusb |
A2 | average.u | __m256i _mm256_avg_epu8(__m256i,__m256i) | v32qi __builtin_ia32_pavgb256(v32qi,v32qi) | vpavgb |
sub | ||||
A2 | sub | __m256i _mm256_sub_epi8(__m256i,__m256i) | (-) | vpsubb |
A2 | subs.s | __m256i _mm256_subs_epi8(__m256i,__m256i) | v32qi __builtin_ia32_psubsb256(v32qi,v32qi) | vpsubsb |
A2 | subs.u | __m256i _mm256_subs_epu8(__m256i,__m256i) | v32qi __builtin_ia32_psubusb256(v32qi,v32qi) | vpsubusb |
A2 | i16 dj≔ ∑i∊{0…3}Δi
j∊{0..7} Δi=abs(ap+j+i-bq+i) p∊{0,4}, q∊{0,4,8,12} (2 lanes) |
__m256i _mm256_mpsadbw_epu8(__m256i,__m256i,const int imm8) | v16hi __builtin_ia32_mpsadbw256(v32qi,v32qi,int) | vmpsadbw |
A2 | i64 dj≔ ∑i∊{0…7}Δi
j∊{0..3} Δi=abs(a4j+i-b4j+i) |
__m256i _mm256_sad_epu8(__m256i,__m256i) | v4di __builtin_ia32_psadbw256(v32qi,v32qi) | vpsadbw |
mul | ||||
A2 | mul ai×sign(bi) | __m256i _mm256_sign_epi8(__m256i,__m256i) | v32qi __builtin_ia32_psignb256(v32qi,v32qi) | vpsignb |
A2 | 16×scalprod.u saturated→i16 | __m256i _mm256_maddubs_epi16(__m256i,__m256i) | v16hi __builtin_ia32_pmaddubsw256(v32qi,v32qi) | vpmaddubsw |
math | ||||
A2 | min.s | __m256i _mm256_min_epi8(__m256i,__m256i) | v32qi __builtin_ia32_pminsb256(v32qi,v32qi) | vpminsb |
A2 | min.u | __m256i _mm256_min_epu8(__m256i,__m256i) | v32qi __builtin_ia32_pminub256(v32qi,v32qi) | vpminub |
A2 | max.s | __m256i _mm256_max_epi8(__m256i,__m256i) | v32qi __builtin_ia32_pmaxsb256(v32qi,v32qi) | vpmaxsb |
A2 | max.u | __m256i _mm256_max_epu8(__m256i,__m256i) | v32qi __builtin_ia32_pmaxub256(v32qi,v32qi) | vpmaxub |
A2 | abs.s | __m256i _mm256_abs_epi8(__m256i) | v32qi __builtin_ia32_pabsb256(v32qi) | vpabsb |
shift | ||||
A2 | shift concat bytes right | __m256i _mm256_alignr_epi8(__m256i,__m256i,const int imm8) | v32qi __builtin_ia32_palignr256(v32qi,v32qi,int) | vpalignr |
compare | ||||
A2 | = | __m256i _mm256_cmpeq_epi8(__m256i,__m256i) | (==) | vpcmpeqb |
A2 | > | __m256i _mm256_cmpgt_epi8(__m256i,__m256i) | (>) | vpcmpgtb |
shuffle | ||||
A2 | dup ⟨a0,…,a0⟩ | __m256i _mm256_broadcastb_epi8(__m128i) | v32qi __builtin_ia32_pbroadcastb256(v16qi) | vpbroadcastb |
A2 | interleave ⟨b23,a23,…,b16,a16⟩ ⟨b7,a7,…,b0,a0⟩ |
__m256i _mm256_unpacklo_epi8(__m256i,__m256i) | v32qi __builtin_ia32_punpcklbw256(v32qi,v32qi) | vpunpcklbw |
A2 | interleave ⟨b31,a31,…,b24,a24⟩ ⟨b15,a15,…,b8,a8⟩ |
__m256i _mm256_unpackhi_epi8(__m256i,__m256i) | v32qi __builtin_ia32_punpckhbw256(v32qi,v32qi) | vpunpckhbw |
A2 | blend ⟨di ≔ ci<0 ? bi : ai⟩ | __m256i _mm256_blendv_epi8(__m256i,__m256i,__m256i mask) | v32qi __builtin_ia32_pblendvb256(v32qi,v32qi,v32qi) | vpblendvb |
A2 | shuffle ⟨bi≥0 ? abi
: 0⟩ (2 lanes) |
__m256i _mm256_shuffle_epi8(__m256i,__m256i) | v32qi __builtin_ia32_pshufb256(v32qi,v32qi) | vpshufb |
AVX(256 bit) int16 × 16
Fg | Function | Intel | Gnu | Asm |
---|---|---|---|---|
convert | ||||
A2 | convert.s ← i8[15:0/16] | __m256i _mm256_cvtepi8_epi16(__m128i) | v16hi __builtin_ia32_pmovsxbw256(v16qi) | vpmovsxbw |
A2 | convert.u ← u8[15:0/16] | __m256i _mm256_cvtepu8_epi16(__m128i) | v16hi __builtin_ia32_pmovzxbw256(v16qi) | vpmovzxbw |
add | ||||
A2 | add | __m256i _mm256_add_epi16(__m256i,__m256i) | (+) | vpaddw |
A2 | adds.s | __m256i _mm256_adds_epi16(__m256i,__m256i) | v16hi __builtin_ia32_paddsw256(v16hi,v16hi) | vpaddsw |
A2 | adds.u | __m256i _mm256_adds_epu16(__m256i,__m256i) | v16hi __builtin_ia32_paddusw256(v16hi,v16hi) | vpaddusw |
A2 | add ⟨b14+b15,b12+b13,b10+b11,b8+b9⟩ ⟨a14+a15,a12+a13,a10+a11,a8+a9⟩ ⟨b6+b7,b4+b5,b2+b3,b0+b1⟩ ⟨a6+a7,a4+a5,a2+a3,a0+a1⟩ |
__m256i _mm256_hadd_epi16(__m256i,__m256i) | v16hi __builtin_ia32_phaddw256(v16hi,v16hi) | vphaddw |
A2 | adds ⟨b14+b15,b12+b13,b10+b11,b8+b9⟩ ⟨a14+a15,a12+a13,a10+a11,a8+a9⟩ ⟨b6+b7,b4+b5,b2+b3,b0+b1⟩ ⟨a6+a7,a4+a5,a2+a3,a0+a1⟩ |
__m256i _mm256_hadds_epi16(__m256i,__m256i) | v16hi __builtin_ia32_phaddsw256(v16hi,v16hi) | vphaddsw |
A2 | average.u | __m256i _mm256_avg_epu16(__m256i,__m256i) | v16hi __builtin_ia32_pavgw256(v16hi,v16hi) | vpavgw |
sub | ||||
A2 | sub | __m256i _mm256_sub_epi16(__m256i,__m256i) | (-) | vpsubw |
A2 | subs.s | __m256i _mm256_subs_epi16(__m256i,__m256i) | v16hi __builtin_ia32_psubsw256(v16hi,v16hi) | vpsubsw |
A2 | subs.u | __m256i _mm256_subs_epu16(__m256i,__m256i) | v16hi __builtin_ia32_psubusw256(v16hi,v16hi) | vpsubusw |
A2 | sub ⟨b14–b15,b12–b13,b10–b11,b8–b9⟩ ⟨a14–a15,a12–a13,a10–a11,a8–a9⟩ ⟨b6–b7,b4–b5,b2–b3,b0–b1⟩ ⟨a6–a7,a4–a5,a2–a3,a0–a1⟩ |
__m256i _mm256_hsub_epi16(__m256i,__m256i) | v16hi __builtin_ia32_phsubw256(v16hi,v16hi) | vphsubw |
A2 | subs ⟨b14–b15,b12–b13,b10–b11,b8–b9⟩ ⟨a14–a15,a12–a13,a10–a11,a8–a9⟩ ⟨b6–b7,b4–b5,b2–b3,b0–b1⟩ ⟨a6–a7,a4–a5,a2–a3,a0–a1⟩ |
__m256i _mm256_hsubs_epi16(__m256i,__m256i) | v16hi __builtin_ia32_phsubsw256(v16hi,v16hi) | vphsubsw |
mul | ||||
A2 | mul.lo | __m256i _mm256_mullo_epi16(__m256i,__m256i) | (×) | vpmullw |
A2 | muls.hi | __m256i _mm256_mulhi_epi16(__m256i,__m256i) | v16hi __builtin_ia32_pmulhw256(v16hi,v16hi) | vpmulhw |
A2 | mulu.hi | __m256i _mm256_mulhi_epu16(__m256i,__m256i) | v16hi __builtin_ia32_pmulhuw256(v16hi,v16hi) | vpmulhuw |
A2 | muls(≫ 14) | __m256i _mm256_mulhrs_epi16(__m256i,__m256i) | v16hi __builtin_ia32_pmulhrsw256(v16hi,v16hi) | vpmulhrsw |
A2 | mul ai×sign(bi) | __m256i _mm256_sign_epi16(__m256i,__m256i) | v16hi __builtin_ia32_psignw256(v16hi,v16hi) | vpsignw |
A2 | 8×scalprod.s unsaturated→i32 | __m256i _mm256_madd_epi16(__m256i,__m256i) | v8si __builtin_ia32_pmaddwd256(v16hi,v16hi) | vpmaddwd |
math | ||||
A2 | min.s | __m256i _mm256_min_epi16(__m256i,__m256i) | v16hi __builtin_ia32_pminsw256(v16hi,v16hi) | vpminsw |
A2 | min.u | __m256i _mm256_min_epu16(__m256i,__m256i) | v16hi __builtin_ia32_pminuw256(v16hi,v16hi) | vpminuw |
A2 | max.s | __m256i _mm256_max_epi16(__m256i,__m256i) | v16hi __builtin_ia32_pmaxsw256(v16hi,v16hi) | vpmaxsw |
A2 | max.u | __m256i _mm256_max_epu16(__m256i,__m256i) | v16hi __builtin_ia32_pmaxuw256(v16hi,v16hi) | vpmaxuw |
A2 | abs.s | __m256i _mm256_abs_epi16(__m256i) | v16hi __builtin_ia32_pabsw256(v16hi) | vpabsw |
shift | ||||
A2 | shift left | __m256i _mm256_sll_epi16(__m256i,__m128i count) | v16hi __builtin_ia32_psllw256(v16hi,v8hi) | vpsllw |
A2 | shift left | __m256i _mm256_slli_epi16(__m256i,int imm8) | v16hi __builtin_ia32_psllwi256(v16hi,int) | vpsllw |
A2 | shift right.s | __m256i _mm256_sra_epi16(__m256i,__m128i count) | v16hi __builtin_ia32_psraw256(v16hi,v8hi) | vpsraw |
A2 | shift right.s | __m256i _mm256_srai_epi16(__m256i,int imm8) | v16hi __builtin_ia32_psrawi256(v16hi,int) | vpsraw |
A2 | shift right.u | __m256i _mm256_srl_epi16(__m256i,__m128i count) | v16hi __builtin_ia32_psrlw256(v16hi,v8hi) | vpsrlw |
A2 | shift right.u | __m256i _mm256_srli_epi16(__m256i,int imm8) | v16hi __builtin_ia32_psrlwi256(v16hi,int) | vpsrlw |
compare | ||||
A2 | = | __m256i _mm256_cmpeq_epi16(__m256i,__m256i) | (==) | vpcmpeqw |
A2 | > | __m256i _mm256_cmpgt_epi16(__m256i,__m256i) | (>) | vpcmpgtw |
reduce | ||||
A2 | reduce+saturate.s→i8 | __m256i _mm256_packs_epi16(__m256i,__m256i) | v32qi __builtin_ia32_packsswb256(v16hi,v16hi) | vpacksswb |
A2 | reduce+saturate.u→u8 | __m256i _mm256_packus_epi16(__m256i,__m256i) | v32qi __builtin_ia32_packuswb256(v16hi,v16hi) | vpackuswb |
shuffle | ||||
A2 | dup ⟨a0,…,a0⟩ | __m256i _mm256_broadcastw_epi16(__m128i) | v16hi __builtin_ia32_pbroadcastw256(v8hi) | vpbroadcastw |
A2 | interleave ⟨b11,a11,…,b8,a8⟩ ⟨b3,a3,…,b0,a0⟩ |
__m256i _mm256_unpacklo_epi16(__m256i,__m256i) | v16hi __builtin_ia32_punpcklwd256(v16hi,v16hi) | vpunpcklwd |
A2 | interleave ⟨b15,a15,…,b12,a12⟩ ⟨b7,a7,…,b4,a4⟩ |
__m256i _mm256_unpackhi_epi16(__m256i,__m256i) | v16hi __builtin_ia32_punpckhwd256(v16hi,v16hi) | vpunpckhwd |
A2 | blend ⟨di ≔ immi ? bi : ai⟩ | __m256i _mm256_blend_epi16(__m256i,__m256i,const int imm8) | v16hi __builtin_ia32_pblendw256(v16hi,v16hi,int) | vpblendw |
A2 | shuffle ⟨a7,a6,a5,a4⟩ ⟨al,ak,aj,ai⟩ ijkl∊{0…3} (2 lanes) |
__m256i _mm256_shufflelo_epi16(__m256i,const int imm8) | v16hi __builtin_ia32_pshuflw256(v16hi,int) | vpshuflw |
A2 | shuffle ⟨al,ak,aj,ai⟩ ijkl∊{4…7} ⟨a3,a2,a1,a0⟩ (2 lanes) |
__m256i _mm256_shufflehi_epi16(__m256i,const int imm8) | v16hi __builtin_ia32_pshufhw256(v16hi,int) | vpshufhw |
AVX(256 bit) int32 × 8
Fg | Function | Intel | Gnu | Asm |
---|---|---|---|---|
load | ||||
A2 | gather ofs:i32 ⟨mofsi⟩ |
__m256i _mm256_i32gather_epi32(int const*base_addr,__m256i vindex,const int scale) | v8si __builtin_ia32_gathersiv8si(0,int const*,v8si,~0,int) | vpgatherdd |
A2 | gather ofs:i32 ⟨di<0?mofsi:ai⟩ |
__m256i _mm256_mask_i32gather_epi32(__m256i src,int const*base_addr,__m256i vindex,__m256i mask,const int scale) | v8si __builtin_ia32_gathersiv8si(v8si,int const*,v8si,v8si,int) | vpgatherdd |
A2 | gather ofs:i64 ⟨mofsi⟩ |
__m128i _mm256_i64gather_epi32(int const*base_addr,__m256i vindex,const int scale) | v8si __builtin_ia32_gatherdiv4si256(0,int const*,v4di,~0,int) | vpgatherqd |
A2 | gather ofs:i64 ⟨di<0?mofsi:ai⟩ |
__m128i _mm256_mask_i64gather_epi32(__m128i src,int const*base_addr,__m256i vindex,__m128i mask,const int scale) | v8si __builtin_ia32_gatherdiv4si256(v4si,int const*,v4di,v4si,int) | vpgatherqd |
A2 | load ⟨bi<0 ? mi : 0⟩ | __m256i _mm256_maskload_epi32(int const*mem_addr,__m256i mask) | v8si __builtin_ia32_maskloadd256(const v8si*,v8si) | vpmaskmovd |
store | ||||
A2 | store ⟨mi ≔ bi if ci<0⟩ | void _mm256_maskstore_epi32(int*mem_addr,__m256i mask,__m256i) | void __builtin_ia32_maskstored256(v8si*,v8si,v8si) | vpmaskmovd |
convert | ||||
A | create ⟨?,?,?,?,a3:0⟩ i32[4] | __m256i _mm256_castsi128_si256(__m128i) | v8si __builtin_ia32_si256_si(v4si) | - |
A2 | convert.s ← i8[7:0/16] | __m256i _mm256_cvtepi8_epi32(__m128i) | v8si __builtin_ia32_pmovsxbd256(v16qi) | vpmovsxbd |
A2 | convert.u ← u8[7:0/16] | __m256i _mm256_cvtepu8_epi32(__m128i) | v8si __builtin_ia32_pmovzxbd256(v16qi) | vpmovzxbd |
A2 | convert.s ← i16[7:0/8] | __m256i _mm256_cvtepi16_epi32(__m128i) | v8si __builtin_ia32_pmovsxwd256(v8hi) | vpmovsxwd |
A2 | convert.u ← u16[7:0/8] | __m256i _mm256_cvtepu16_epi32(__m128i) | v8si __builtin_ia32_pmovzxwd256(v8hi) | vpmovzxwd |
extract | ||||
A | extract a0 i32 | int _mm256_cvtsi256_si32(__m256i) | (cast) | vmovd |
A | extract a3:0 i32[4] | __m128i _mm256_castsi256_si128(__m256i) | v4si __builtin_ia32_si_si256(v8si) | - |
add | ||||
A2 | add | __m256i _mm256_add_epi32(__m256i,__m256i) | (+) | vpaddd |
A2 | add ⟨b6+b7,b4+b5,a6+a7,a4+a5⟩ ⟨b2+b3,b0+b1,a2+a3,a0+a1⟩ |
__m256i _mm256_hadd_epi32(__m256i,__m256i) | v8si __builtin_ia32_phaddd256(v8si,v8si) | vphaddd |
sub | ||||
A2 | sub | __m256i _mm256_sub_epi32(__m256i,__m256i) | (-) | vpsubd |
A2 | sub ⟨b6–b7,b4–b5,a6–a7,a4–a5⟩ ⟨b2–b3,b0–b1,a2–a3,a0–a1⟩ |
__m256i _mm256_hsub_epi32(__m256i,__m256i) | v8si __builtin_ia32_phsubd256(v8si,v8si) | vphsubd |
mul | ||||
A2 | mul(lo) | __m256i _mm256_mullo_epi32(__m256i,__m256i) | (×) | vpmulld |
A2 | mul.s ⟨a.even×b.even⟩→i64[4] | __m256i _mm256_mul_epi32(__m256i,__m256i) | v4di __builtin_ia32_pmuldq256(v8si,v8si) | vpmuldq |
A2 | mul.u ⟨a.even×b.even⟩→i64[4] | __m256i _mm256_mul_epu32(__m256i,__m256i) | v4di __builtin_ia32_pmuludq256(v8si,v8si) | vpmuludq |
A2 | mul ai×sign(bi) | __m256i _mm256_sign_epi32(__m256i,__m256i) | v8si __builtin_ia32_psignd256(v8si,v8si) | vpsignd |
math | ||||
A2 | min.s | __m256i _mm256_min_epi32(__m256i,__m256i) | v8si __builtin_ia32_pminsd256(v8si,v8si) | vpminsd |
A2 | min.u | __m256i _mm256_min_epu32(__m256i,__m256i) | v8si __builtin_ia32_pminud256(v8si,v8si) | vpminud |
A2 | max.s | __m256i _mm256_max_epi32(__m256i,__m256i) | v8si __builtin_ia32_pmaxsd256(v8si,v8si) | vpmaxsd |
A2 | max.u | __m256i _mm256_max_epu32(__m256i,__m256i) | v8si __builtin_ia32_pmaxud256(v8si,v8si) | vpmaxud |
A2 | abs.s | __m256i _mm256_abs_epi32(__m256i) | v8si __builtin_ia32_pabsd256(v8si) | vpabsd |
shift | ||||
A2 | shift left | __m256i _mm256_sll_epi32(__m256i,__m128i count) | v8si __builtin_ia32_pslld256(v8si,v4si) | vpslld |
A2 | shift left imm | __m256i _mm256_slli_epi32(__m256i,int imm8) | v8si __builtin_ia32_pslldi256(v8si,int) | vpslld |
A2 | shift left variable | __m256i _mm256_sllv_epi32(__m256i,__m256i count) | v8si __builtin_ia32_psllv8si(v8si,v8si) | vpsllvd |
A2 | shift right.s | __m256i _mm256_sra_epi32(__m256i,__m128i count) | v8si __builtin_ia32_psrad256(v8si,v4si) | vpsrad |
A2 | shift right.s imm | __m256i _mm256_srai_epi32(__m256i,int imm8) | v8si __builtin_ia32_psradi256(v8si,int) | vpsrad |
A2 | shift right.s variable | __m256i _mm256_srav_epi32(__m256i,__m256i count) | v8si __builtin_ia32_psrav8si(v8si,v8si) | vpsravd |
A2 | shift right.u | __m256i _mm256_srl_epi32(__m256i,__m128i count) | v8si __builtin_ia32_psrld256(v8si,v4si) | vpsrld |
A2 | shift right.u imm | __m256i _mm256_srli_epi32(__m256i,int imm8) | v8si __builtin_ia32_psrldi256(v8si,int) | vpsrld |
A2 | shift right.u variable | __m256i _mm256_srlv_epi32(__m256i,__m256i count) | v8si __builtin_ia32_psrlv8si(v8si,v8si) | vpsrlvd |
compare | ||||
A2 | = | __m256i _mm256_cmpeq_epi32(__m256i,__m256i) | (==) | vpcmpeqd |
A2 | > | __m256i _mm256_cmpgt_epi32(__m256i,__m256i) | (>) | vpcmpgtd |
reduce | ||||
A2 | reduce+saturate.s→i16 | __m256i _mm256_packs_epi32(__m256i,__m256i) | v16hi __builtin_ia32_packssdw256(v8si,v8si) | vpackssdw |
A2 | reduce+saturate.u→u16 | __m256i _mm256_packus_epi32(__m256i,__m256i) | v16hi __builtin_ia32_packusdw256(v8si,v8si) | vpackusdw |
shuffle | ||||
A2 | dup ⟨a0,…,a0⟩ | __m256i _mm256_broadcastd_epi32(__m128i) | v8si __builtin_ia32_pbroadcastd256(v4si) | vpbroadcastd |
A2 | interleave ⟨b5,a5,b4,a4⟩ ⟨b1,a1,b0,a0⟩ |
__m256i _mm256_unpacklo_epi32(__m256i,__m256i) | v8si __builtin_ia32_punpckldq256(v8si,v8si) | vpunpckldq |
A2 | interleave ⟨b7,a7,b6,a6⟩ ⟨b3,a3,b2,a2⟩ |
__m256i _mm256_unpackhi_epi32(__m256i,__m256i) | v8si __builtin_ia32_punpckhdq256(v8si,v8si) | vpunpckhdq |
A2 | blend ⟨di ≔ immi ? bi : ai⟩ | __m256i _mm256_blend_epi32(__m256i,__m256i,const int imm8) | v8si __builtin_ia32_pblendd256(v8si,v8si,int) | vpblendd |
A2 | shuffle ⟨di ≔ abi⟩ | __m256i _mm256_permutevar8x32_epi32(__m256i,__m256i idx) | v8si __builtin_ia32_permvarsi256(v8si,v8si) | vpermd |
A2 | shuffle ⟨al,ak,aj,ai⟩ (2 lanes) | __m256i _mm256_shuffle_epi32(__m256i,const int imm8) | v8si __builtin_ia32_pshufd256(v8si,int) | vpshufd |
AVX(256 bit) int64 × 4
Fg | Function | Intel | Gnu | Asm |
---|---|---|---|---|
load | ||||
A2 | gather ofs:i32 ⟨mofsi⟩ ofs:i32 |
__m256i _mm256_i32gather_epi64(__int64 const*base_addr,__m128i vindex,const int scale) | v4di __builtin_ia32_gathersiv4di(0,int64 const*,v4si,~0,int) | vpgatherdq |
A2 | gather ofs:i32 ⟨di<0?mofsi:ai⟩ i32 |
__m256i _mm256_mask_i32gather_epi64(__m256i src,__int64 const*base_addr,__m128i vindex,__m256i mask,const int scale) | v4di __builtin_ia32_gathersiv4di(v4di,int64 const*,v4si,v4di,int) | vpgatherdq |
A2 | gather ofs:i64 ⟨mofsi⟩ ofs:i64 |
__m256i _mm256_i64gather_epi64(__int64 const*base_addr,__m256i vindex,const int scale) | v4di __builtin_ia32_gatherdiv4di(0,int64 const*,v4di,~0,int) | vpgatherqq |
A2 | gather ofs:i64 ⟨di<0?mofsi:ai⟩ ofs:i64 |
__m256i _mm256_mask_i64gather_epi64(__m256i src,__int64 const*base_addr,__m256i vindex,__m256i mask,const int scale) | v4di __builtin_ia32_gatherdiv4di(v4di,int64 const*,v4di,v4di,int) | vpgatherqq |
A2 | load ⟨bi<0 ? mi : 0⟩ | __m256i _mm256_maskload_epi64(__int64 const*mem_addr,__m256i mask) | v4di __builtin_ia32_maskloadq256(const v4di*,v4di) | vpmaskmovq |
store | ||||
A2 | store ⟨mi ≔ bi if ci<0⟩ | void _mm256_maskstore_epi64(__int64*mem_addr,__m256i mask,__m256i) | void __builtin_ia32_maskstoreq256(v4di*,v4di,v4di) | vpmaskmovq |
AVX(256 bit) int128 × 2
Fg | Function | Intel | Gnu | Asm |
---|---|---|---|---|
insert | ||||
A | insert ai ≔ b0 i128 | __m256i _mm256_insertf128_si256(__m256i,__m128i,int imm8) | v8si __builtin_ia32_vinsertf128_si256(v8si,v4si,int) | vinsertf128 |
A2 | insert ai ≔ b0 i128 | __m256i _mm256_inserti128_si256(__m256i,__m128i,const int imm8) | v2ti __builtin_ia32_insert128i256(v4di,v2di,int) | vinserti128 |
extract | ||||
A | extract ai i128 | __m128i _mm256_extractf128_si256(__m256i,const int imm8) | v4si __builtin_ia32_vextractf128_si256(v8si,int) | vextractf128 |
A2 | extract ai i128 | __m128i _mm256_extracti128_si256(__m256i,const int imm8) | v2di __builtin_ia32_extract128i256(v4di,int) | vextracti128 |
shift | ||||
A2 | shift bytes left 128 bit lanes |
__m256i _mm256_bslli_epi128(__m256i,const int imm8) __m256i _mm256_slli_si256(__m256i,const int imm8) |
v2ti __builtin_ia32_pslldqi256(v2ti,int bits) | vpslldq |
A2 | shift bytes right.u 128 bit lanes |
__m256i _mm256_bsrli_epi128(__m256i,const int imm8) __m256i _mm256_srli_si256(__m256i,const int imm8) |
v2ti __builtin_ia32_psrldqi256(v2ti,int bits) | vpsrldq |
shuffle | ||||
A2 | dup ⟨a0,a0⟩ | __m256i _mm256_broadcastsi128_si256(__m128i) __m256i _mm_broadcastsi128_si256(__m128i) |
v2ti __builtin_ia32_vbroadcastsi256(v2di) | vbroadcasti128 |
A | 2×sel{0,b1,a1,b0,a0} | __m256i _mm256_permute2f128_si256(__m256i,__m256i,int imm8) | v8si __builtin_ia32_vperm2f128_si256(v8si,v8si,int) | vperm2f128 |
A2 | 2×sel{0,b1,a1,b0,a0} | __m256i _mm256_permute2x128_si256(__m256i,__m256i,const int imm8) | v2ti __builtin_ia32_permti256(v2ti,v2ti,int) | vperm2i128 |
AVX(256 bit) int256
Fg | Function | Intel | Gnu | Asm |
---|---|---|---|---|
load | ||||
A | load.a32 | __m256i _mm256_load_si256(__m256i const*mem_addr) | (assign) | vmovdqa |
A2 | load.a32.nt | __m256i _mm256_stream_load_si256(__m256i const*mem_addr) | v4di __builtin_ia32_movntdqa256(const v4di*) | vmovntdqa |
A | load.u | __m256i _mm256_loadu_si256(__m256i const*mem_addr) | v32qi __builtin_ia32_loaddqu256(pcchar) | vmovdqu |
A | load.u.fast | __m256i _mm256_lddqu_si256(__m256i const*mem_addr) | v32qi __builtin_ia32_lddqu256(pcchar) | vlddqu |
store | ||||
A | store.a32 | void _mm256_store_si256(__m256i*mem_addr,__m256i) | (assign) | vmovdqa |
A | store.a32.nt | void _mm256_stream_si256(__m256i*mem_addr,__m256i) | void __builtin_ia32_movntdq256(v4di*,v4di) | vmovntdq |
A | store.u | void _mm256_storeu_si256(__m256i*mem_addr,__m256i) | void __builtin_ia32_storedqu256(pchar,v32qi) | vmovdqu |
convert | ||||
A | create ⟨a,b⟩ | __m256i _mm256_set_m128i(__m128i hi,__m128i lo) | v8si __builtin_ia32_vinsertf128_si256( |
vinsertf128 |
A | create ⟨b,a⟩ | __m256i _mm256_setr_m128i(__m128i lo,__m128i hi) | v8si __builtin_ia32_vinsertf128_si256( |
vinsertf128 |
logical | ||||
A2 | ∧ | __m256i _mm256_and_si256(__m256i,__m256i) | (v4du&v4du) | vpand |
A2 | ¬∧ | __m256i _mm256_andnot_si256(__m256i,__m256i) | (~v4du&v4du) | vpandn |
A2 | ∨ | __m256i _mm256_or_si256(__m256i,__m256i) | (v4du|v4du) | vpor |
A2 | ⊕ | __m256i _mm256_xor_si256(__m256i,__m256i) | (v4du^v4du) | vpxor |
compare | ||||
A | a≊0 (masked by b) | int _mm256_testz_si256(__m256i,__m256i) | int __builtin_ia32_ptestz256(v4di,v4di) | vptest |
A | a≊1 (masked by b) | int _mm256_testc_si256(__m256i,__m256i) | int __builtin_ia32_ptestc256(v4di,v4di) | vptest |
A | a≇0 ∧ a≇1 (masked by b) | int _mm256_testnzc_si256(__m256i,__m256i) | int __builtin_ia32_ptestnzc256(v4di,v4di) | vptest |
AVX(256 bit) float × 8
Fg | Function | Intel | Gnu | Asm |
---|---|---|---|---|
constant | ||||
A | const(0) | __m256 _mm256_setzero_ps(void) | (cast) | vxorps |
load | ||||
A | load.a32 | __m256 _mm256_load_ps(float const*mem_addr) | (assign) | vmovaps |
A | load.u | __m256 _mm256_loadu_ps(float const*mem_addr) | v8sf __builtin_ia32_loadups256(pcfloat) | vmovups |
A | load ⟨m0,…,m0⟩ | __m256 _mm256_broadcast_ss(float const*mem_addr) | v8sf __builtin_ia32_vbroadcastss256(pcfloat) | vbroadcastss |
A2 | gather ofs:i32 ⟨mofsi⟩ |
__m256 _mm256_i32gather_ps(float const*base_addr,__m256i vindex,const int scale) | v8sf __builtin_ia32_gathersiv8sf(0,float const*,v8si,~0,int) | vgatherdps |
A2 | gather ofs:i32 ⟨di<0?mofsi:ai⟩ |
__m256 _mm256_mask_i32gather_ps(__m256 src,float const*base_addr,__m256i vindex,__m256 mask,const int scale) | v8sf __builtin_ia32_gathersiv8sf(v8sf,float const*,v8si,v8sf,int) | vgatherdps |
A2 | gather ofs:i64 ⟨mofsi⟩ |
__m128 _mm256_i64gather_ps(float const*base_addr,__m256i vindex,const int scale) | v8sf __builtin_ia32_gatherdiv4sf256(0,float const*,v4di,~0,int) | vgatherqps |
A2 | gather ofs:i64 ⟨di<0?mofsi:ai⟩ |
__m128 _mm256_mask_i64gather_ps(__m128 src,float const*base_addr,__m256i vindex,__m128 mask,const int scale) | v8sf __builtin_ia32_gatherdiv4sf256(v4sf,float const*,v4di,v4sf,int) | vgatherqps |
A | load ⟨bi<0 ? mi : 0⟩ | __m256 _mm256_maskload_ps(float const*mem_addr,__m256i mask) | v8sf __builtin_ia32_maskloadps256(pcv8sf,v8sf) | vmaskmovps |
A | load ⟨m3…m0,m3…m0⟩ | __m256 _mm256_broadcast_ps(__m128 const*mem_addr) | v8sf __builtin_ia32_vbroadcastf128_ps256(pcv4sf) | vbroadcastf128 |
store | ||||
A | store.a32 | void _mm256_store_ps(float*mem_addr,__m256) | (assign) | vmovaps |
A | store.a32.nt | void _mm256_stream_ps(float*mem_addr,__m256) | void __builtin_ia32_movntps256(float*,v8sf) | vmovntps |
A | store.u | void _mm256_storeu_ps(float*mem_addr,__m256) | void __builtin_ia32_storeups256(pfloat,v8sf) | vmovups |
A | store ⟨mi ≔ bi if ci<0⟩ | void _mm256_maskstore_ps(float*mem_addr,__m256i mask,__m256) | void __builtin_ia32_maskstoreps256(pv8sf,v8sf,v8sf) | vmaskmovps |
convert | ||||
A | create ⟨?,?,?,?,a3:0⟩ | __m256 _mm256_castps128_ps256(__m128) | v8sf __builtin_ia32_ps256_ps(v4sf) | - |
A | create ⟨a3:0,b3:0⟩ | __m256 _mm256_set_m128(__m128 hi,__m128 lo) | v8sf __builtin_ia32_vinsertf128_ps256( |
vinsertf128 |
A | create ⟨b3:0,a3:0⟩ | __m256 _mm256_setr_m128(__m128 lo,__m128 hi) | v8sf __builtin_ia32_vinsertf128_ps256( |
vinsertf128 |
A | convert← i32[8] | __m256 _mm256_cvtepi32_ps(__m256i) | v8sf __builtin_ia32_cvtdq2ps256(v8si) | vcvtdq2ps |
A | convert→ i32[8] | __m256i _mm256_cvtps_epi32(__m256) | v8si __builtin_ia32_cvtps2dq256(v8sf) | vcvtps2dq |
A | convert.t→ i32[8] | __m256i _mm256_cvttps_epi32(__m256) | v8si __builtin_ia32_cvttps2dq256(v8sf) | vcvttps2dq |
insert | ||||
A | a4i+3:4i ≔ b3:0 | __m256 _mm256_insertf128_ps(__m256,__m128,int imm8) | v8sf __builtin_ia32_vinsertf128_ps256(v8sf,v4sf,int) | vinsertf128 |
extract | ||||
A | extract a0 f32 | float _mm256_cvtss_f32(__m256) | (cast) | vmovss |
A | extract a3:0 f32[4] | __m128 _mm256_castps256_ps128(__m256) | v4sf __builtin_ia32_ps_ps256(v8sf) | - |
A | extract a4i+3:4i f32[4] | __m128 _mm256_extractf128_ps(__m256,const int imm8) | v4sf __builtin_ia32_vextractf128_ps256(v8sf,int) | vextractf128 |
A | extract signbits | int _mm256_movemask_ps(__m256) | int __builtin_ia32_movmskps256(v8sf) | vmovmskps |
add | ||||
A | add | __m256 _mm256_add_ps(__m256,__m256) | v8sf __builtin_ia32_addps256(v8sf,v8sf) | vaddps |
A | add ⟨b6+b7,b4+b5,b2+b3,b0+b1, a6+a7,a4+a5,a2+a3,a0+a1⟩ |
__m256 _mm256_hadd_ps(__m256,__m256) | v8sf __builtin_ia32_haddps256(v8sf,v8sf) | vhaddps |
sub | ||||
A | sub | __m256 _mm256_sub_ps(__m256,__m256) | v8sf __builtin_ia32_subps256(v8sf,v8sf) | vsubps |
A | sub ⟨b6–b7,b4–b5,b2–b3,b0–b1, a6–a7,a4–a5,a2–a3,a0–a1⟩ |
__m256 _mm256_hsub_ps(__m256,__m256) | v8sf __builtin_ia32_hsubps256(v8sf,v8sf) | vhsubps |
A | sub @even, add @odd | __m256 _mm256_addsub_ps(__m256,__m256) | v8sf __builtin_ia32_addsubps256(v8sf,v8sf) | vaddsubps |
mul | ||||
A | mul | __m256 _mm256_mul_ps(__m256,__m256) | v8sf __builtin_ia32_mulps256(v8sf,v8sf) | vmulps |
A | dj∊{∑(ai×bi×{1,0}),0} 128 bit lanes |
__m256 _mm256_dp_ps(__m256,__m256,const int imm8) | v8sf __builtin_ia32_dpps256(v8sf,v8sf,int) | vdpps |
div | ||||
A | div | __m256 _mm256_div_ps(__m256,__m256) | v8sf __builtin_ia32_divps256(v8sf,v8sf) | vdivps |
math | ||||
A | min | __m256 _mm256_min_ps(__m256,__m256) | v8sf __builtin_ia32_minps256(v8sf,v8sf) | vminps |
A | max | __m256 _mm256_max_ps(__m256,__m256) | v8sf __builtin_ia32_maxps256(v8sf,v8sf) | vmaxps |
A | floor() | __m256 _mm256_floor_ps(__m256) | v8sf __builtin_ia32_roundps256(v8sf,int) | vroundps |
A | ceil() | __m256 _mm256_ceil_ps(__m256) | v8sf __builtin_ia32_roundps256(v8sf,int) | vroundps |
A | round() | __m256 _mm256_round_ps(__m256,int rounding) | v8sf __builtin_ia32_roundps256(v8sf,int) | vroundps |
A | 1/x | __m256 _mm256_rcp_ps(__m256) | v8sf __builtin_ia32_rcpps256(v8sf) | vrcpps |
A | √x | __m256 _mm256_sqrt_ps(__m256) | v8sf __builtin_ia32_sqrtps256(v8sf) v8sf __builtin_ia32_sqrtps_nr256(v8sf) |
vsqrtps |
A | 1/√x | __m256 _mm256_rsqrt_ps(__m256) | v8sf __builtin_ia32_rsqrtps256(v8sf) v8sf __builtin_ia32_rsqrtps_nr256(v8sf) |
vrsqrtps |
logical | ||||
A | ∧ | __m256 _mm256_and_ps(__m256,__m256) | v8sf __builtin_ia32_andps256(v8sf,v8sf) | vandps |
A | ¬∧ | __m256 _mm256_andnot_ps(__m256,__m256) | v8sf __builtin_ia32_andnps256(v8sf,v8sf) | vandnps |
A | ∨ | __m256 _mm256_or_ps(__m256,__m256) | v8sf __builtin_ia32_orps256(v8sf,v8sf) | vorps |
A | ⊕ | __m256 _mm256_xor_ps(__m256,__m256) | v8sf __builtin_ia32_xorps256(v8sf,v8sf) | vxorps |
compare | ||||
A | compare(op) | __m256 _mm256_cmp_ps(__m256,__m256,const int imm8) | v8sf __builtin_ia32_cmpps256(v8sf,v8sf,int) | vcmpps |
shuffle | ||||
A2 | dup ⟨a0,…,a0⟩ | __m256 _mm256_broadcastss_ps(__m128) | v8sf __builtin_ia32_vbroadcastss_ps256(v4sf) | vbroadcastss |
A | dup ⟨a6,a6,a4,a4,a2,a2,a0,a0⟩ | __m256 _mm256_moveldup_ps(__m256) | v8sf __builtin_ia32_movsldup256(v8sf) | vmovsldup |
A | dup ⟨a7,a7,a5,a5,a3,a3,a1,a1⟩ | __m256 _mm256_movehdup_ps(__m256) | v8sf __builtin_ia32_movshdup256(v8sf) | vmovshdup |
A | interleave ⟨b3,a3,b2,a2,b1,a1,b0,a0⟩ |
__m256 _mm256_unpacklo_ps(__m256,__m256) | v8sf __builtin_ia32_unpcklps256(v8sf,v8sf) | vunpcklps |
A | interleave ⟨b7,a7,b6,a6,b5,a5,b4,a4⟩ |
__m256 _mm256_unpackhi_ps(__m256,__m256) | v8sf __builtin_ia32_unpckhps256(v8sf,v8sf) | vunpckhps |
A | blend ⟨imm8i ? bi : ai⟩ | __m256 _mm256_blend_ps(__m256,__m256,const int imm8) | v8sf __builtin_ia32_blendps256(v8sf,v8sf,int) | vblendps |
A | blend ⟨ci<0 ? bi : ai⟩ | __m256 _mm256_blendv_ps(__m256,__m256,__m256 mask) | v8sf __builtin_ia32_blendvps256(v8sf,v8sf,v8sf) | vblendvps |
A | shuffle ⟨a.hil,a.hik,a.hij,a.hii⟩ ⟨a.lol,a.lok,a.loj,a.loi⟩ TODO replace some a by b ⚠ |
__m256 _mm256_shuffle_ps(__m256,__m256,const int imm8) | v8sf __builtin_ia32_shufps256(v8sf,v8sf,int) | vshufps |
A | shuffle ⟨a.hil,a.hik,a.hij,a.hii⟩ ⟨a.lol,a.lok,a.loj,a.loi⟩ |
__m256 _mm256_permute_ps(__m256,int imm8) | v8sf __builtin_ia32_vpermilps256(v8sf,int) | vpermilps |
A | shuffle ⟨a.hib7,a.hib6,a.hib5,a.hib4⟩ ⟨a.lob3,a.lob2,a.lob1,a.lob0⟩ |
__m256 _mm256_permutevar_ps(__m256,__m256i) | v8sf __builtin_ia32_vpermilvarps256(v8sf,v8si) | vpermilps |
A | shuffle 128 bit with zero ⟨xj,xi⟩, x∊{0,b.hi,b.lo,a.hi,a.lo} |
__m256 _mm256_permute2f128_ps(__m256,__m256,int imm8) | v8sf __builtin_ia32_vperm2f128_ps256(v8sf,v8sf,int) | vperm2f128 |
A2 | shuffle ⟨di ≔ abi⟩ | __m256 _mm256_permutevar8x32_ps(__m256,__m256i idx) | v8sf __builtin_ia32_permvarsf256(v8sf,v8si) | vpermps |
compare | ||||
A | a≊0 (masked by b) | int _mm256_testz_ps(__m256,__m256) | int __builtin_ia32_vtestzps256(v8sf,v8sf) | vtestps |
A | a≊1 (masked by b) | int _mm256_testc_ps(__m256,__m256) | int __builtin_ia32_vtestcps256(v8sf,v8sf) | vtestps |
A | a≇0 ∧ a≇1 (masked by b) | int _mm256_testnzc_ps(__m256,__m256) | int __builtin_ia32_vtestnzcps256(v8sf,v8sf) | vtestps |
AVX(256 bit) double × 4
Fg | Function | Intel | Gnu | Asm |
---|---|---|---|---|
constant | ||||
A | const(0) | __m256d _mm256_setzero_pd(void) | (cast) | vxorpd |
load | ||||
A | load.a32 | __m256d _mm256_load_pd(double const*mem_addr) | (assign) | vmovapd |
A | load.u | __m256d _mm256_loadu_pd(double const*mem_addr) | v4df __builtin_ia32_loadupd256(pcdouble) | vmovupd |
A | load ⟨m0,m0,m0,m0⟩ | __m256d _mm256_broadcast_sd(double const*mem_addr) | v4df __builtin_ia32_vbroadcastsd256(pcdouble) | vbroadcastsd |
A | load ⟨m1,m0,m1,m0⟩ | __m256d _mm256_broadcast_pd(__m128d const*mem_addr) | v4df __builtin_ia32_vbroadcastf128_pd256(pcv2df) | vbroadcastf128 |
A2 | gather ofs:i32 ⟨mofsi⟩ |
__m256d _mm256_i32gather_pd(double const*base_addr,__m128i vindex,const int scale) | v4df __builtin_ia32_gathersiv4df(0,double const*,v4si,~0,int) | vgatherdpd |
A2 | gather ofs:i32 ⟨di<0?mofsi:ai⟩ |
__m256d _mm256_mask_i32gather_pd(__m256d src,double const*base_addr,__m128i vindex,__m256d mask,const int scale) | v4df __builtin_ia32_gathersiv4df(v4df,double const*,v4si,v4df,int) | vgatherdpd |
A2 | gather ofs:i64 ⟨mofsi⟩ |
__m256d _mm256_i64gather_pd(double const*base_addr,__m256i vindex,const int scale) | v4df __builtin_ia32_gatherdiv4df(0,double const*,v4di,~0,int) | vgatherqpd |
A2 | gather ofs:i64 ⟨di<0?mofsi:ai⟩ |
__m256d _mm256_mask_i64gather_pd(__m256d src,double const*base_addr,__m256i vindex,__m256d mask,const int scale) | v4df __builtin_ia32_gatherdiv4df(v4df,double const*,v4di,v4df,int) | vgatherqpd |
A | load ⟨bi<0 ? mi : 0⟩ | __m256d _mm256_maskload_pd(double const*mem_addr,__m256i mask) | v4df __builtin_ia32_maskloadpd256(pcv4df,v4df) | vmaskmovpd |
store | ||||
A | store.a32 | void _mm256_store_pd(double*mem_addr,__m256d) | (assign) | vmovapd |
A | store.a32.nt | void _mm256_stream_pd(double*mem_addr,__m256d) | void __builtin_ia32_movntpd256(double*,v4df) | vmovntpd |
A | store.u | void _mm256_storeu_pd(double*mem_addr,__m256d) | void __builtin_ia32_storeupd256(pdouble,v4df) | vmovupd |
A | store ⟨mi ≔ bi if ci<0⟩ | void _mm256_maskstore_pd(double*mem_addr,__m256i mask,__m256d) | void __builtin_ia32_maskstorepd256(pv4df,v4df,v4df) | vmaskmovpd |
convert | ||||
A | create ⟨?,?,a1,a0⟩ | __m256d _mm256_castpd128_pd256(__m128d) | v4df __builtin_ia32_pd256_pd(v2df) | - |
A | create ⟨a1,a0,b1,b0⟩ | __m256d _mm256_set_m128d(__m128d hi,__m128d lo) | v4df __builtin_ia32_vinsertf128_pd256(v4df,v2df,int) |
vinsertf128 |
A | create ⟨b1,b0,a1,a0⟩ | __m256d _mm256_setr_m128d(__m128d lo,__m128d hi) | v4df __builtin_ia32_vinsertf128_pd256(v4df,v2df,int) |
vinsertf128 |
A | convert← i32[4] | __m256d _mm256_cvtepi32_pd(__m128i) | v4df __builtin_ia32_cvtdq2pd256(v4si) | vcvtdq2pd |
A | convert← f32[4] | __m256d _mm256_cvtps_pd(__m128) | v4df __builtin_ia32_cvtps2pd256(v4sf) | vcvtps2pd |
A | convert→ i32[4] | __m128i _mm256_cvtpd_epi32(__m256d) | v4si __builtin_ia32_cvtpd2dq256(v4df) | vcvtpd2dq |
A | convert→ i32[4] trunc | __m128i _mm256_cvttpd_epi32(__m256d) | v4si __builtin_ia32_cvttpd2dq256(v4df) | vcvttpd2dq |
A | convert→ f32[4] | __m128 _mm256_cvtpd_ps(__m256d) | v4sf __builtin_ia32_cvtpd2ps256(v4df) | vcvtpd2ps |
insert | ||||
A | a2i+1:2i ≔ b1:0 | __m256d _mm256_insertf128_pd(__m256d,__m128d,int imm8) | v4df __builtin_ia32_vinsertf128_pd256(v4df,v2df,int) | vinsertf128 |
extract | ||||
A | extract a0 f64 | double _mm256_cvtsd_f64(__m256d) | v4df[0] | vmovsd |
A | extract a1:0 f64[2] | __m128d _mm256_castpd256_pd128(__m256d) | v2df __builtin_ia32_pd_pd256(v4df) | - |
A | extract a2i+1:2i f64[2] | __m128d _mm256_extractf128_pd(__m256d,const int imm8) | v2df __builtin_ia32_vextractf128_pd256(v4df,int) | vextractf128 |
A | extract signbits | int _mm256_movemask_pd(__m256d) | int __builtin_ia32_movmskpd256(v4df) | vmovmskpd |
add | ||||
A | add | __m256d _mm256_add_pd(__m256d,__m256d) | v4df __builtin_ia32_addpd256(v4df,v4df) | vaddpd |
A | add ⟨b2+b3,a2+a3,b0+b1,a0+a1⟩ |
__m256d _mm256_hadd_pd(__m256d,__m256d) | v4df __builtin_ia32_haddpd256(v4df,v4df) | vhaddpd |
sub | ||||
A | sub | __m256d _mm256_sub_pd(__m256d,__m256d) | v4df __builtin_ia32_subpd256(v4df,v4df) | vsubpd |
A | sub ⟨b2–b3,a2–a3,b0–b1,a0–a1⟩ |
__m256d _mm256_hsub_pd(__m256d,__m256d) | v4df __builtin_ia32_hsubpd256(v4df,v4df) | vhsubpd |
A | sub @even, add @odd | __m256d _mm256_addsub_pd(__m256d,__m256d) | v4df __builtin_ia32_addsubpd256(v4df,v4df) | vaddsubpd |
mul | ||||
A | mul | __m256d _mm256_mul_pd(__m256d,__m256d) | v4df __builtin_ia32_mulpd256(v4df,v4df) | vmulpd |
div | ||||
A | div | __m256d _mm256_div_pd(__m256d,__m256d) | v4df __builtin_ia32_divpd256(v4df,v4df) | vdivpd |
math | ||||
A | min | __m256d _mm256_min_pd(__m256d,__m256d) | v4df __builtin_ia32_minpd256(v4df,v4df) | vminpd |
A | max | __m256d _mm256_max_pd(__m256d,__m256d) | v4df __builtin_ia32_maxpd256(v4df,v4df) | vmaxpd |
A | floor() | __m256d _mm256_floor_pd(__m256d) | v4df __builtin_ia32_roundpd256(v4df,int) | vroundpd |
A | ceil() | __m256d _mm256_ceil_pd(__m256d) | v4df __builtin_ia32_roundpd256(v4df,int) | vroundpd |
A | round() | __m256d _mm256_round_pd(__m256d,int rounding) | v4df __builtin_ia32_roundpd256(v4df,int) | vroundpd |
A | √x | __m256d _mm256_sqrt_pd(__m256d) | v4df __builtin_ia32_sqrtpd256(v4df) | vsqrtpd |
logical | ||||
A | ∧ | __m256d _mm256_and_pd(__m256d,__m256d) | v4df __builtin_ia32_andpd256(v4df,v4df) | vandpd |
A | ¬∧ | __m256d _mm256_andnot_pd(__m256d,__m256d) | v4df __builtin_ia32_andnpd256(v4df,v4df) | vandnpd |
A | ∨ | __m256d _mm256_or_pd(__m256d,__m256d) | v4df __builtin_ia32_orpd256(v4df,v4df) | vorpd |
A | ⊕ | __m256d _mm256_xor_pd(__m256d,__m256d) | v4df __builtin_ia32_xorpd256(v4df,v4df) | vxorpd |
compare | ||||
A | compare(op) | __m256d _mm256_cmp_pd(__m256d,__m256d,const int imm8) | v4df __builtin_ia32_cmppd256(v4df,v4df,int) | vcmppd |
A | a≊0 (masked by b) | int _mm256_testz_pd(__m256d,__m256d) | int __builtin_ia32_vtestzpd256(v4df,v4df) | vtestpd |
A | a≊1 (masked by b) | int _mm256_testc_pd(__m256d,__m256d) | int __builtin_ia32_vtestcpd256(v4df,v4df) | vtestpd |
A | a≇0 ∧ a≇1 (masked by b) | int _mm256_testnzc_pd(__m256d,__m256d) | int __builtin_ia32_vtestnzcpd256(v4df,v4df) | vtestpd |
shuffle | ||||
A2 | dup ⟨a0,a0,a0,a0⟩ | __m256d _mm256_broadcastsd_pd(__m128d) | v4df __builtin_ia32_vbroadcastsd_pd256(v2df) | vbroadcastsd |
A | dup ⟨a2,a2,a0,a0⟩ | __m256d _mm256_movedup_pd(__m256d) | v4df __builtin_ia32_movddup256(v4df) | vmovddup |
A | interleave ⟨b2,a2,b0,a0⟩ | __m256d _mm256_unpacklo_pd(__m256d,__m256d) | v4df __builtin_ia32_unpcklpd256(v4df,v4df) | vunpcklpd |
A | interleave ⟨b3,a3,b1,a1⟩ | __m256d _mm256_unpackhi_pd(__m256d,__m256d) | v4df __builtin_ia32_unpckhpd256(v4df,v4df) | vunpckhpd |
A | blend ⟨imm8i ? bi : ai⟩ | __m256d _mm256_blend_pd(__m256d,__m256d,const int imm8) | v4df __builtin_ia32_blendpd256(v4df,v4df,int) | vblendpd |
A | blend ⟨ci<0 ? bi : ai⟩ | __m256d _mm256_blendv_pd(__m256d,__m256d,__m256d mask) | v4df __builtin_ia32_blendvpd256(v4df,v4df,v4df) | vblendvpd |
A2 | shuffle ⟨al,ak,aj,ai⟩ | __m256d _mm256_permute4x64_pd(__m256d,const int imm8) | v4df __builtin_ia32_permdf256(v4df,int) | vpermpd |
A | shuffle ⟨a.hil,a.hik,a.loj,a.loi⟩ | __m256d _mm256_permute_pd(__m256d,int imm8) | v4df __builtin_ia32_vpermilpd256(v4df,int) | vpermilpd |
A | shuffle ⟨b.hil,a.hik,b.loj,a.loi⟩ | __m256d _mm256_shuffle_pd(__m256d,__m256d,const int imm8) | v4df __builtin_ia32_shufpd256(v4df,v4df,int) | vshufpd |
A | shuffle ⟨a.hib3,a.hib2,a.lob1,a.lob0⟩ | __m256d _mm256_permutevar_pd(__m256d,__m256i) | v4df __builtin_ia32_vpermilvarpd256(v4df,v4di) | vpermilpd |
A | shuffle 128 bit with zero ⟨xj,xi⟩, x∊{0,b.hi,b.lo,a.hi,a.lo} |
__m256d _mm256_permute2f128_pd(__m256d,__m256d,int imm8) | v4df __builtin_ia32_vperm2f128_pd256(v4df,v4df,int) | vperm2f128 |
Constants:
Predicate | Value | Operator | Ordered | Signals |
---|---|---|---|---|
_CMP_EQ_OQ | 0x00 | Equal | ordered | non-signaling |
_CMP_LT_OS | 0x01 | Less-than | ordered | signaling |
_CMP_LE_OS | 0x02 | Less-than-or-equal | ordered | signaling |
_CMP_UNORD_Q | 0x03 | Unordered | Unordered | non-signaling |
_CMP_NEQ_UQ | 0x04 | Not-equal | unordered | non-signaling |
_CMP_NLT_US | 0x05 | Not-less-than | unordered | signaling |
_CMP_NLE_US | 0x06 | Not-less-than-or-equal | unordered | signaling |
_CMP_ORD_Q | 0x07 | Ordered | Ordered | non-signaling |
_CMP_EQ_UQ | 0x08 | Equal | unordered | non-signaling |
_CMP_NGE_US | 0x09 | Not-greater-than-or-equal | unordered | signaling |
_CMP_NGT_US | 0x0a | Not-greater-than | unordered | signaling |
_CMP_FALSE_OQ | 0x0b | False | ordered | non-signaling |
_CMP_NEQ_OQ | 0x0c | Not-equal | ordered | non-signaling |
_CMP_GE_OS | 0x0d | Greater-than-or-equal | ordered | signaling |
_CMP_GT_OS | 0x0e | Greater-than | ordered | signaling |
_CMP_TRUE_UQ | 0x0f | True | unordered | non-signaling |
_CMP_EQ_OS | 0x10 | Equal | ordered | signaling |
_CMP_LT_OQ | 0x11 | Less-than | ordered | non-signaling |
_CMP_LE_OQ | 0x12 | Less-than-or-equal | ordered | non-signaling |
_CMP_UNORD_S | 0x13 | Unordered | Unordered | signaling |
_CMP_NEQ_US | 0x14 | Not-equal | unordered | signaling |
_CMP_NLT_UQ | 0x15 | Not-less-than | unordered | non-signaling |
_CMP_NLE_UQ | 0x16 | Not-less-than-or-equal | unordered | non-signaling |
_CMP_ORD_S | 0x17 | Ordered | Ordered | signaling |
_CMP_EQ_US | 0x18 | Equal | unordered | signaling |
_CMP_NGE_UQ | 0x19 | Not-greater-than-or-equal | unordered | non-signaling |
_CMP_NGT_UQ | 0x1a | Not-greater-than | unordered | non-signaling |
_CMP_FALSE_OS | 0x1b | False | ordered | signaling |
_CMP_NEQ_OS | 0x1c | Not-equal | ordered | signaling |
_CMP_GE_OQ | 0x1d | Greater-than-or-equal | ordered | non-signaling |
_CMP_GT_OQ | 0x1e | Greater-than | ordered | non-signaling |
_CMP_TRUE_US | 0x1f | True | unordered | signaling |
Predicate | Value |
---|---|
_MM_FROUND_TO_NEAREST_INT | 0x00 |
_MM_FROUND_TO_NEG_INF | 0x01 |
_MM_FROUND_TO_POS_INF | 0x02 |
_MM_FROUND_TO_ZERO | 0x03 |
_MM_FROUND_CUR_DIRECTION | 0x04 |
_MM_FROUND_RAISE_EXC | 0x00 |
_MM_FROUND_NO_EXC | 0x08 |
_MM_FROUND_NINT | 0x00 |
_MM_FROUND_FLOOR | 0x01 |
_MM_FROUND_CEIL | 0x02 |
_MM_FROUND_TRUNC | 0x03 |
_MM_FROUND_RINT | 0x04 |
Predicate | Value |
---|---|
_MM_HINT_ET0 | 7 |
_MM_HINT_ET1 | 6 |
_MM_HINT_T0 | 3 |
_MM_HINT_T1 | 2 |
_MM_HINT_T2 | 1 |
_MM_HINT_NTA | 0 |
References:
Flag | Includes | Comment | |
---|---|---|---|
Local (v9) | Github (v10) | ||
MMX | mmintrin.h | mmintrin.h | 64 bit integer |
SSE | xmmintrin.h | xmmintrin.h | 128 bit float |
SSE2 | emmintrin.h | emmintrin.h | 128 bit integer/double |
SSE3 | pmmintrin.h | pmmintrin.h | |
SSSE3 | tmmintrin.h | tmmintrin.h | |
SSE4_1 | smmintrin.h | smmintrin.h | Math extensions |
SSE4_2 | CRC, String compare | ||
AVX | avxintrin.h | avxintrin.h | 256 bit float |
AVX2 | avx2intrin.h | avx2intrin.h | 256 bit integer/double |
BMI(1) | bmiintrin.h | bmiintrin.h | Extension: 1st group bit manipulation extension |
BMI2 | bmi2intrin.h | bmi2intrin.h | Extension: 2nd group bit manipulation extension |
FP16C | f16cintrin.h | f16cintrin.h | float 16 conversions |
FMA(+AVX512F) | fmaintrin.h | fmaintrin.h | Fused multiply-add |
AES(+SSE2) | wmmintrin.h | wmmintrin.h | AES instructions |
LZCNT | lzcntintrin.h | lzcntintrin.h | Count leading zeros |
RTM | rtmintrin.h | rtmintrin.h | Restricted Transactional Memory |
RTM | xtestintrin.h | xtestintrin.h | Restricted Transactional Memory |
SHA | shaintrin.h | shaintrin.h | Intel SHA extensions |