The OpenD Programming Language

inteli.emmintrin

Public Imports

inteli.types
public import inteli.types;
Undocumented in source.
inteli.xmmintrin
public import inteli.xmmintrin;
Undocumented in source.

Members

Aliases

_mm_bslli_si128
alias _mm_bslli_si128 = _mm_slli_si128

Shift a left by bytes bytes while shifting in zeros.

_mm_bsrli_si128
alias _mm_bsrli_si128 = _mm_srli_si128

Shift v right by bytes bytes while shifting in zeros.

Functions

_mm_add_epi16
__m128i _mm_add_epi16(__m128i a, __m128i b)

Add packed 16-bit integers in a and b.

_mm_add_epi32
__m128i _mm_add_epi32(__m128i a, __m128i b)

Add packed 32-bit integers in a and b.

_mm_add_epi64
__m128i _mm_add_epi64(__m128i a, __m128i b)

Add packed 64-bit integers in a and b.

_mm_add_epi8
__m128i _mm_add_epi8(__m128i a, __m128i b)

Add packed 8-bit integers in a and b.

_mm_add_pd
__m128d _mm_add_pd(__m128d a, __m128d b)

Add packed double-precision (64-bit) floating-point elements in a and b.

_mm_add_sd
__m128d _mm_add_sd(__m128d a, __m128d b)

Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper element from a to the upper element of destination.

_mm_add_si64
__m64 _mm_add_si64(__m64 a, __m64 b)

Add 64-bit integers a and b.

_mm_adds_epi16
__m128i _mm_adds_epi16(__m128i a, __m128i b)

Add packed 16-bit integers in a and b using signed saturation.

_mm_adds_epi8
__m128i _mm_adds_epi8(__m128i a, __m128i b)

Add packed 8-bit signed integers in a and b using signed saturation.

_mm_adds_epu16
__m128i _mm_adds_epu16(__m128i a, __m128i b)

Add packed unsigned 16-bit integers in a and b using unsigned saturation.

_mm_adds_epu8
__m128i _mm_adds_epu8(__m128i a, __m128i b)

Add packed 8-bit unsigned integers in a and b using unsigned saturation.

_mm_and_pd
__m128d _mm_and_pd(__m128d a, __m128d b)

Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in a and b.

_mm_and_si128
__m128i _mm_and_si128(__m128i a, __m128i b)

Compute the bitwise AND of 128 bits (representing integer data) in a and b.

_mm_andnot_pd
__m128d _mm_andnot_pd(__m128d a, __m128d b)

Compute the bitwise NOT of packed double-precision (64-bit) floating-point elements in a and then AND with b.

_mm_andnot_si128
__m128i _mm_andnot_si128(__m128i a, __m128i b)

Compute the bitwise NOT of 128 bits (representing integer data) in a and then AND with b.

_mm_avg_epu16
__m128i _mm_avg_epu16(__m128i a, __m128i b)

Average packed unsigned 16-bit integers in a and b.

_mm_avg_epu8
__m128i _mm_avg_epu8(__m128i a, __m128i b)

Average packed unsigned 8-bit integers in a and b.

_mm_castpd_ps
__m128 _mm_castpd_ps(__m128d a)

Cast vector of type __m128d to type __m128. Note: Also possible with a regular cast(__m128)(a).

_mm_castpd_si128
__m128i _mm_castpd_si128(__m128d a)

Cast vector of type __m128d to type __m128i. Note: Also possible with a regular cast(__m128i)(a).

_mm_castps_pd
__m128d _mm_castps_pd(__m128 a)

Cast vector of type __m128 to type __m128d. Note: Also possible with a regular cast(__m128d)(a).

_mm_castps_si128
__m128i _mm_castps_si128(__m128 a)

Cast vector of type __m128 to type __m128i. Note: Also possible with a regular cast(__m128i)(a).

_mm_castsi128_pd
__m128d _mm_castsi128_pd(__m128i a)

Cast vector of type __m128i to type __m128d. Note: Also possible with a regular cast(__m128d)(a).

_mm_castsi128_ps
__m128 _mm_castsi128_ps(__m128i a)

Cast vector of type __m128i to type __m128. Note: Also possible with a regular cast(__m128)(a).

_mm_clflush
void _mm_clflush(const(void)* p)

Invalidate and flush the cache line that contains p from all levels of the cache hierarchy.

_mm_cmpeq_epi16
__m128i _mm_cmpeq_epi16(__m128i a, __m128i b)

Compare packed 16-bit integers in a and b for equality.

_mm_cmpeq_epi32
__m128i _mm_cmpeq_epi32(__m128i a, __m128i b)

Compare packed 32-bit integers in a and b for equality.

_mm_cmpeq_epi8
__m128i _mm_cmpeq_epi8(__m128i a, __m128i b)

Compare packed 8-bit integers in a and b for equality.

_mm_cmpeq_pd
__m128d _mm_cmpeq_pd(__m128d a, __m128d b)

Compare packed double-precision (64-bit) floating-point elements in a and b for equality.

_mm_cmpeq_sd
__m128d _mm_cmpeq_sd(__m128d a, __m128d b)

Compare the lower double-precision (64-bit) floating-point elements in a and b for equality, store the result in the lower element, and copy the upper element from a.

_mm_cmpge_epi16
__m128i _mm_cmpge_epi16(__m128i a, __m128i b)

Compare packed 16-bit integers in a and b for greater-than-or-equal. #BONUS

_mm_cmpge_pd
__m128d _mm_cmpge_pd(__m128d a, __m128d b)

Compare packed double-precision (64-bit) floating-point elements in a and b for greater-than-or-equal.

_mm_cmpge_sd
__m128d _mm_cmpge_sd(__m128d a, __m128d b)

Compare the lower double-precision (64-bit) floating-point elements in a and b for greater-than-or-equal, store the result in the lower element, and copy the upper element from a.

_mm_cmpgt_epi16
__m128i _mm_cmpgt_epi16(__m128i a, __m128i b)

Compare packed 16-bit integers in a and b for greater-than.

_mm_cmpgt_epi32
__m128i _mm_cmpgt_epi32(__m128i a, __m128i b)

Compare packed 32-bit integers in a and b for greater-than.

_mm_cmpgt_epi8
__m128i _mm_cmpgt_epi8(__m128i a, __m128i b)

Compare packed 8-bit integers in a and b for greater-than.

_mm_cmpgt_pd
__m128d _mm_cmpgt_pd(__m128d a, __m128d b)

Compare packed double-precision (64-bit) floating-point elements in a and b for greater-than.

_mm_cmpgt_sd
__m128d _mm_cmpgt_sd(__m128d a, __m128d b)

Compare the lower double-precision (64-bit) floating-point elements in a and b for greater-than, store the result in the lower element, and copy the upper element from a.

_mm_cmple_epi16
__m128i _mm_cmple_epi16(__m128i a, __m128i b)

Compare packed 16-bit integers in a and b for less-than-or-equal. #BONUS

_mm_cmple_pd
__m128d _mm_cmple_pd(__m128d a, __m128d b)

Compare packed double-precision (64-bit) floating-point elements in a and b for less-than-or-equal.

_mm_cmple_sd
__m128d _mm_cmple_sd(__m128d a, __m128d b)

Compare the lower double-precision (64-bit) floating-point elements in a and b for less-than-or-equal, store the result in the lower element, and copy the upper element from a.

_mm_cmplt_epi16
__m128i _mm_cmplt_epi16(__m128i a, __m128i b)

Compare packed 16-bit integers in a and b for less-than.

_mm_cmplt_epi32
__m128i _mm_cmplt_epi32(__m128i a, __m128i b)

Compare packed 32-bit integers in a and b for less-than.

_mm_cmplt_epi8
__m128i _mm_cmplt_epi8(__m128i a, __m128i b)

Compare packed 8-bit integers in a and b for less-than.

_mm_cmplt_pd
__m128d _mm_cmplt_pd(__m128d a, __m128d b)

Compare packed double-precision (64-bit) floating-point elements in a and b for less-than.

_mm_cmplt_sd
__m128d _mm_cmplt_sd(__m128d a, __m128d b)

Compare the lower double-precision (64-bit) floating-point elements in a and b for less-than, store the result in the lower element, and copy the upper element from a.

_mm_cmpneq_pd
__m128d _mm_cmpneq_pd(__m128d a, __m128d b)

Compare packed double-precision (64-bit) floating-point elements in a and b for not-equal.

_mm_cmpneq_sd
__m128d _mm_cmpneq_sd(__m128d a, __m128d b)

Compare the lower double-precision (64-bit) floating-point elements in a and b for not-equal, store the result in the lower element, and copy the upper element from a.

_mm_cmpnge_pd
__m128d _mm_cmpnge_pd(__m128d a, __m128d b)

Compare packed double-precision (64-bit) floating-point elements in a and b for not-greater-than-or-equal.

_mm_cmpnge_sd
__m128d _mm_cmpnge_sd(__m128d a, __m128d b)

Compare the lower double-precision (64-bit) floating-point elements in a and b for not-greater-than-or-equal, store the result in the lower element, and copy the upper element from a.

_mm_cmpngt_pd
__m128d _mm_cmpngt_pd(__m128d a, __m128d b)

Compare packed double-precision (64-bit) floating-point elements in a and b for not-greater-than.

_mm_cmpngt_sd
__m128d _mm_cmpngt_sd(__m128d a, __m128d b)

Compare the lower double-precision (64-bit) floating-point elements in a and b for not-greater-than, store the result in the lower element, and copy the upper element from a.

_mm_cmpnle_pd
__m128d _mm_cmpnle_pd(__m128d a, __m128d b)

Compare packed double-precision (64-bit) floating-point elements in a and b for not-less-than-or-equal.

_mm_cmpnle_sd
__m128d _mm_cmpnle_sd(__m128d a, __m128d b)

Compare the lower double-precision (64-bit) floating-point elements in a and b for not-less-than-or-equal, store the result in the lower element, and copy the upper element from a.

_mm_cmpnlt_pd
__m128d _mm_cmpnlt_pd(__m128d a, __m128d b)

Compare packed double-precision (64-bit) floating-point elements in a and b for not-less-than.

_mm_cmpnlt_sd
__m128d _mm_cmpnlt_sd(__m128d a, __m128d b)

Compare the lower double-precision (64-bit) floating-point elements in a and b for not-less-than, store the result in the lower element, and copy the upper element from a.

_mm_cmpord_pd
__m128d _mm_cmpord_pd(__m128d a, __m128d b)

Compare packed double-precision (64-bit) floating-point elements in a and b to see if neither is NaN.

_mm_cmpord_sd
__m128d _mm_cmpord_sd(__m128d a, __m128d b)

Compare the lower double-precision (64-bit) floating-point elements in a and b to see if neither is NaN, store the result in the lower element, and copy the upper element from a to the upper element.

_mm_cmpunord_pd
__m128d _mm_cmpunord_pd(__m128d a, __m128d b)

Compare packed double-precision (64-bit) floating-point elements in a and b to see if either is NaN.

_mm_cmpunord_sd
__m128d _mm_cmpunord_sd(__m128d a, __m128d b)

Compare the lower double-precision (64-bit) floating-point elements in a and b to see if either is NaN, store the result in the lower element, and copy the upper element from a to the upper element.

_mm_comieq_sd
int _mm_comieq_sd(__m128d a, __m128d b)

Compare the lower double-precision (64-bit) floating-point element in a and b for equality, and return the boolean result (0 or 1).

_mm_comige_sd
int _mm_comige_sd(__m128d a, __m128d b)

Compare the lower double-precision (64-bit) floating-point element in a and b for greater-than-or-equal, and return the boolean result (0 or 1).

_mm_comigt_sd
int _mm_comigt_sd(__m128d a, __m128d b)

Compare the lower double-precision (64-bit) floating-point element in a and b for greater-than, and return the boolean result (0 or 1).

_mm_comile_sd
int _mm_comile_sd(__m128d a, __m128d b)

Compare the lower double-precision (64-bit) floating-point element in a and b for less-than-or-equal, and return the boolean result (0 or 1).

_mm_comilt_sd
int _mm_comilt_sd(__m128d a, __m128d b)

Compare the lower double-precision (64-bit) floating-point element in a and b for less-than, and return the boolean result (0 or 1).

_mm_comineq_sd
int _mm_comineq_sd(__m128d a, __m128d b)

Compare the lower double-precision (64-bit) floating-point element in a and b for not-equal, and return the boolean result (0 or 1).

_mm_cvtepi32_pd
__m128d _mm_cvtepi32_pd(__m128i a)

Convert packed 32-bit integers in a to packed double-precision (64-bit) floating-point elements.

_mm_cvtepi32_ps
__m128 _mm_cvtepi32_ps(__m128i a)

Convert packed 32-bit integers in a to packed single-precision (32-bit) floating-point elements.

_mm_cvtpd_epi32
__m128i _mm_cvtpd_epi32(__m128d a)

Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers.

_mm_cvtpd_pi32
__m64 _mm_cvtpd_pi32(__m128d v)

Convert packed double-precision (64-bit) floating-point elements in v to packed 32-bit integers.

_mm_cvtpd_ps
__m128 _mm_cvtpd_ps(__m128d a)

Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements.

_mm_cvtpi32_pd
__m128d _mm_cvtpi32_pd(__m64 v)

Convert packed 32-bit integers in v to packed double-precision (64-bit) floating-point elements.

_mm_cvtps_epi32
__m128i _mm_cvtps_epi32(__m128 a)

Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers.

_mm_cvtps_pd
__m128d _mm_cvtps_pd(__m128 a)

Convert packed single-precision (32-bit) floating-point elements in a to packed double-precision (64-bit) floating-point elements.

_mm_cvtsd_f64
double _mm_cvtsd_f64(__m128d a)

Copy the lower double-precision (64-bit) floating-point element of a.

_mm_cvtsd_si32
int _mm_cvtsd_si32(__m128d a)

Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer.

_mm_cvtsd_si64
long _mm_cvtsd_si64(__m128d a)

Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer.

_mm_cvtsd_ss
__m128 _mm_cvtsd_ss(__m128 a, __m128d b)

Convert the lower double-precision (64-bit) floating-point element in b to a single-precision (32-bit) floating-point element, store that in the lower element of result, and copy the upper 3 packed elements from a to the upper elements of result.

_mm_cvtsi128_si32
int _mm_cvtsi128_si32(__m128i a)

Get the lower 32-bit integer in a.

_mm_cvtsi128_si64
long _mm_cvtsi128_si64(__m128i a)

Get the lower 64-bit integer in a.

_mm_cvtsi32_sd
__m128d _mm_cvtsi32_sd(__m128d a, int b)

Convert the signed 32-bit integer b to a double-precision (64-bit) floating-point element, store that in the lower element of result, and copy the upper element from a to the upper element of result.

_mm_cvtsi32_si128
__m128i _mm_cvtsi32_si128(int a)

Copy 32-bit integer a to the lower element of result, and zero the upper elements.

_mm_cvtsi64_sd
__m128d _mm_cvtsi64_sd(__m128d a, long b)

Convert the signed 64-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of result, and copy the upper element from a to the upper element of result.

_mm_cvtsi64_si128
__m128i _mm_cvtsi64_si128(long a)

Copy 64-bit integer a to the lower element of result, and zero the upper element.

_mm_cvtss_sd
double2 _mm_cvtss_sd(double2 a, float4 b)

Convert the lower single-precision (32-bit) floating-point element in b to a double-precision (64-bit) floating-point element, store that in the lower element of result, and copy the upper element from a to the upper element of result.

_mm_cvttpd_epi32
__m128i _mm_cvttpd_epi32(__m128d a)

Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation. Put zeroes in the upper elements of result.

_mm_cvttpd_pi32
__m64 _mm_cvttpd_pi32(__m128d v)

Convert packed double-precision (64-bit) floating-point elements in v to packed 32-bit integers with truncation.

_mm_cvttps_epi32
__m128i _mm_cvttps_epi32(__m128 a)

Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation.

_mm_cvttsd_si32
int _mm_cvttsd_si32(__m128d a)

Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer with truncation.

_mm_cvttsd_si64
long _mm_cvttsd_si64(__m128d a)

Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer with truncation.

_mm_cvttss_si64
long _mm_cvttss_si64(__m128 a)

Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer with truncation.

_mm_div_pd
__m128d _mm_div_pd(__m128d a, __m128d b)

Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b.

_mm_extract_epi16
int _mm_extract_epi16(__m128i v, int index)

Extract a 16-bit integer from v, selected with index. Warning: the returned value is zero-extended to 32-bits.

_mm_insert_epi16
__m128i _mm_insert_epi16(__m128i v, int i, int index)

Copy v, and insert the 16-bit integer i at the location specified by index.

_mm_lfence
void _mm_lfence()

Perform a serializing operation on all load-from-memory instructions that were issued prior to this instruction. Guarantees that every load instruction that precedes, in program order, is globally visible before any load instruction which follows the fence in program order.

_mm_load_pd
__m128d _mm_load_pd(const(double)* mem_addr)

Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.

_mm_load_pd1
__m128d _mm_load_pd1(const(double)* mem_addr)

Load a double-precision (64-bit) floating-point element from memory into both elements of dst. mem_addr does not need to be aligned on any particular boundary.

_mm_load_sd
__m128d _mm_load_sd(const(double)* mem_addr)

Load a double-precision (64-bit) floating-point element from memory into the lower of result, and zero the upper element. mem_addr does not need to be aligned on any particular boundary.

_mm_load_si128
__m128i _mm_load_si128(const(__m128i)* mem_addr)

Load 128-bits of integer data from memory into dst. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.

_mm_loadh_pd
__m128d _mm_loadh_pd(__m128d a, const(double)* mem_addr)

Load a double-precision (64-bit) floating-point element from memory into the upper element of result, and copy the lower element from a to result. mem_addr does not need to be aligned on any particular boundary.

_mm_loadl_epi64
__m128i _mm_loadl_epi64(const(__m128i)* mem_addr)

Load 64-bit integer from memory into the first element of result. Zero out the other. Note: strange signature since the memory doesn't have to be aligned, and should point to addressable 64-bit, not 128-bit. You may use _mm_loadu_si64 instead.

_mm_loadl_pd
__m128d _mm_loadl_pd(__m128d a, const(double)* mem_addr)

Load a double-precision (64-bit) floating-point element from memory into the lower element of result, and copy the upper element from a to result. mem_addr does not need to be aligned on any particular boundary.

_mm_loadr_pd
__m128d _mm_loadr_pd(const(double)* mem_addr)

Load 2 double-precision (64-bit) floating-point elements from memory into result in reverse order. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.

_mm_loadu_pd
__m128d _mm_loadu_pd(const(double)* mem_addr)

Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory. mem_addr does not need to be aligned on any particular boundary.

_mm_loadu_si128
__m128i _mm_loadu_si128(const(__m128i)* mem_addr)

Load 128-bits of integer data from memory. mem_addr does not need to be aligned on any particular boundary.

_mm_loadu_si16
__m128i _mm_loadu_si16(const(void)* mem_addr)

Load unaligned 16-bit integer from memory into the first element, fill with zeroes otherwise.

_mm_loadu_si32
__m128i _mm_loadu_si32(const(void)* mem_addr)

Load unaligned 32-bit integer from memory into the first element of result.

_mm_loadu_si64
__m128i _mm_loadu_si64(const(void)* mem_addr)

Load unaligned 64-bit integer from memory into the first element of result. Upper 64-bit is zeroed.

_mm_madd_epi16
__m128i _mm_madd_epi16(__m128i a, __m128i b)

Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in destination.

_mm_maskmoveu_si128
void _mm_maskmoveu_si128(__m128i a, __m128i mask, void* mem_addr)

Conditionally store 8-bit integer elements from a into memory using mask (elements are not stored when the highest bit is not set in the corresponding element) and a non-temporal memory hint. mem_addr does not need to be aligned on any particular boundary.

_mm_max_epi16
__m128i _mm_max_epi16(__m128i a, __m128i b)

Compare packed signed 16-bit integers in a and b, and return packed maximum values.

_mm_max_epu8
__m128i _mm_max_epu8(__m128i a, __m128i b)

Compare packed unsigned 8-bit integers in a and b, and return packed maximum values.

_mm_max_pd
__m128d _mm_max_pd(__m128d a, __m128d b)

Compare packed double-precision (64-bit) floating-point elements in a and b, and return packed maximum values.

_mm_max_sd
__m128d _mm_max_sd(__m128d a, __m128d b)

Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of result, and copy the upper element from a to the upper element of result.

_mm_mfence
void _mm_mfence()

Perform a serializing operation on all load-from-memory and store-to-memory instructions that were issued prior to this instruction. Guarantees that every memory access that precedes, in program order, the memory fence instruction is globally visible before any memory instruction which follows the fence in program order.

_mm_min_epi16
__m128i _mm_min_epi16(__m128i a, __m128i b)

Compare packed signed 16-bit integers in a and b, and return packed minimum values.

_mm_min_epu8
__m128i _mm_min_epu8(__m128i a, __m128i b)

Compare packed unsigned 8-bit integers in a and b, and return packed minimum values.

_mm_min_pd
__m128d _mm_min_pd(__m128d a, __m128d b)

Compare packed double-precision (64-bit) floating-point elements in a and b, and return packed minimum values.

_mm_min_sd
__m128d _mm_min_sd(__m128d a, __m128d b)

Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of result, and copy the upper element from a to the upper element of result.

_mm_move_epi64
__m128i _mm_move_epi64(__m128i a)

Copy the lower 64-bit integer in a to the lower element of result, and zero the upper element.

_mm_move_sd
__m128d _mm_move_sd(__m128d a, __m128d b)

Move the lower double-precision (64-bit) floating-point element from b to the lower element of result, and copy the upper element from a to the upper element of dst.

_mm_movemask_epi16
int _mm_movemask_epi16(__m128i a)

Create mask from the most significant bit of each 16-bit element in a. #BONUS

_mm_movemask_epi8
int _mm_movemask_epi8(__m128i a)

Create mask from the most significant bit of each 8-bit element in a.

_mm_movemask_pd
int _mm_movemask_pd(__m128d v)

Set each bit of mask result based on the most significant bit of the corresponding packed double-precision (64-bit) floating-point element in v.

_mm_movepi64_pi64
__m64 _mm_movepi64_pi64(__m128i v)

Copy the lower 64-bit integer in v.

_mm_movpi64_epi64
__m128i _mm_movpi64_epi64(__m64 a)

Copy the 64-bit integer a to the lower element of dest, and zero the upper element.

_mm_mul_epu32
__m128i _mm_mul_epu32(__m128i a, __m128i b)

Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results.

_mm_mul_pd
__m128d _mm_mul_pd(__m128d a, __m128d b)

Multiply packed double-precision (64-bit) floating-point elements in a and b, and return the results.

_mm_mul_sd
__m128d _mm_mul_sd(__m128d a, __m128d b)

Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of result, and copy the upper element from a to the upper element of result.

_mm_mul_su32
__m64 _mm_mul_su32(__m64 a, __m64 b)

Multiply the low unsigned 32-bit integers from a and b, and get an unsigned 64-bit result.

_mm_mulhi_epi16
__m128i _mm_mulhi_epi16(__m128i a, __m128i b)

Multiply the packed signed 16-bit integers in a and b, producing intermediate 32-bit integers, and return the high 16 bits of the intermediate integers.

_mm_mulhi_epu16
__m128i _mm_mulhi_epu16(__m128i a, __m128i b)

Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and return the high 16 bits of the intermediate integers.

_mm_mullo_epi16
__m128i _mm_mullo_epi16(__m128i a, __m128i b)

Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and return the low 16 bits of the intermediate integers.

_mm_not_si128
__m128i _mm_not_si128(__m128i a)

Compute the bitwise NOT of 128 bits in a. #BONUS

_mm_or_pd
__m128d _mm_or_pd(__m128d a, __m128d b)

Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in a and b.

_mm_or_si128
__m128i _mm_or_si128(__m128i a, __m128i b)

Compute the bitwise OR of 128 bits (representing integer data) in a and b.

_mm_packs_epi16
__m128i _mm_packs_epi16(__m128i a, __m128i b)

Convert packed signed 16-bit integers from a and b to packed 8-bit integers using signed saturation.

_mm_packs_epi32
__m128i _mm_packs_epi32(__m128i a, __m128i b)

Convert packed signed 32-bit integers from a and b to packed 16-bit integers using signed saturation.

_mm_packus_epi16
__m128i _mm_packus_epi16(__m128i a, __m128i b)

Convert packed signed 16-bit integers from a and b to packed 8-bit integers using unsigned saturation.

_mm_pause
void _mm_pause()

Provide a hint to the processor that the code sequence is a spin-wait loop. This can help improve the performance and power consumption of spin-wait loops.

_mm_sad_epu8
__m128i _mm_sad_epu8(__m128i a, __m128i b)

Compute the absolute differences of packed unsigned 8-bit integers in a and b, then horizontally sum each consecutive 8 differences to produce two unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low 16 bits of 64-bit elements in result.

_mm_set1_epi16
__m128i _mm_set1_epi16(short a)

Broadcast 16-bit integer a to all elements of dst.

_mm_set1_epi32
__m128i _mm_set1_epi32(int a)

Broadcast 32-bit integer a to all elements.

_mm_set1_epi64
__m128i _mm_set1_epi64(__m64 a)

Broadcast 64-bit integer a to all elements.

_mm_set1_epi64x
__m128i _mm_set1_epi64x(long a)

Broadcast 64-bit integer a to all elements.

_mm_set1_epi8
__m128i _mm_set1_epi8(byte a)

Broadcast 8-bit integer a to all elements.

_mm_set_epi16
__m128i _mm_set_epi16(short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0)

Set packed 16-bit integers with the supplied values.

_mm_set_epi32
__m128i _mm_set_epi32(int e3, int e2, int e1, int e0)

Set packed 32-bit integers with the supplied values.

_mm_set_epi64
__m128i _mm_set_epi64(__m64 e1, __m64 e0)

Set packed 64-bit integers with the supplied values.

_mm_set_epi64x
__m128i _mm_set_epi64x(long e1, long e0)

Set packed 64-bit integers with the supplied values.

_mm_set_epi8
__m128i _mm_set_epi8(byte e15, byte e14, byte e13, byte e12, byte e11, byte e10, byte e9, byte e8, byte e7, byte e6, byte e5, byte e4, byte e3, byte e2, byte e1, byte e0)

Set packed 8-bit integers with the supplied values.

_mm_set_pd
__m128d _mm_set_pd(double e1, double e0)

Set packed double-precision (64-bit) floating-point elements with the supplied values.

_mm_set_pd1
__m128d _mm_set_pd1(double a)

Broadcast double-precision (64-bit) floating-point value a to all elements.

_mm_set_sd
__m128d _mm_set_sd(double a)

Copy double-precision (64-bit) floating-point element a to the lower element of result, and zero the upper element.

_mm_setr_epi16
__m128i _mm_setr_epi16(short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0)

Set packed 16-bit integers with the supplied values in reverse order.

_mm_setr_epi32
__m128i _mm_setr_epi32(int e3, int e2, int e1, int e0)

Set packed 32-bit integers with the supplied values in reverse order.

_mm_setr_epi64
__m128i _mm_setr_epi64(long e1, long e0)

Set packed 64-bit integers with the supplied values in reverse order.

_mm_setr_epi8
__m128i _mm_setr_epi8(byte e15, byte e14, byte e13, byte e12, byte e11, byte e10, byte e9, byte e8, byte e7, byte e6, byte e5, byte e4, byte e3, byte e2, byte e1, byte e0)

Set packed 8-bit integers with the supplied values in reverse order.

_mm_setr_pd
__m128d _mm_setr_pd(double e1, double e0)

Set packed double-precision (64-bit) floating-point elements with the supplied values in reverse order.

_mm_setzero_pd
__m128d _mm_setzero_pd()

Return vector of type __m128d with all elements set to zero.

_mm_setzero_si128
__m128i _mm_setzero_si128()

Return vector of type __m128i with all elements set to zero.

_mm_shuffle_epi32
__m128i _mm_shuffle_epi32(__m128i a)

Shuffle 32-bit integers in a using the control in imm8.

_mm_shuffle_pd
__m128d _mm_shuffle_pd(__m128d a, __m128d b)

Shuffle double-precision (64-bit) floating-point elements using the control in imm8.

_mm_shufflehi_epi16
__m128i _mm_shufflehi_epi16(__m128i a)

Shuffle 16-bit integers in the high 64 bits of a using the control in imm8. Store the results in the high 64 bits of result, with the low 64 bits being copied from a to result. See also: _MM_SHUFFLE.

_mm_shufflelo_epi16
__m128i _mm_shufflelo_epi16(__m128i a)

Shuffle 16-bit integers in the low 64 bits of a using the control in imm8. Store the results in the low 64 bits of result, with the high 64 bits being copied from a to result.

_mm_sll_epi16
deprecated __m128i _mm_sll_epi16(__m128i a, __m128i count)

Shift packed 16-bit integers in a left by count while shifting in zeros.

_mm_sll_epi32
deprecated __m128i _mm_sll_epi32(__m128i a, __m128i count)

Shift packed 32-bit integers in a left by count while shifting in zeros.

_mm_sll_epi64
deprecated __m128i _mm_sll_epi64(__m128i a, __m128i count)

Shift packed 64-bit integers in a left by count while shifting in zeros.

_mm_slli_epi16
__m128i _mm_slli_epi16(__m128i a, int imm8)

Shift packed 16-bit integers in a left by imm8 while shifting in zeros.

_mm_slli_epi32
__m128i _mm_slli_epi32(__m128i a, int imm8)

Shift packed 32-bit integers in a left by imm8 while shifting in zeros.

_mm_slli_epi64
__m128i _mm_slli_epi64(__m128i a, int imm8)

Shift packed 64-bit integers in a left by imm8 while shifting in zeros.

_mm_slli_si128
__m128i _mm_slli_si128(__m128i op)

Shift op left by bytes bytes while shifting in zeros.

_mm_sqrt_pd
__m128d _mm_sqrt_pd(__m128d vec)

Compute the square root of packed double-precision (64-bit) floating-point elements in vec.

_mm_sqrt_sd
__m128d _mm_sqrt_sd(__m128d a, __m128d b)

Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of result, and copy the upper element from a to the upper element of result.

_mm_sra_epi16
deprecated __m128i _mm_sra_epi16(__m128i a, __m128i count)

Shift packed 16-bit integers in a right by count while shifting in sign bits.

_mm_sra_epi32
deprecated __m128i _mm_sra_epi32(__m128i a, __m128i count)

Shift packed 32-bit integers in a right by count while shifting in sign bits.

_mm_srai_epi16
__m128i _mm_srai_epi16(__m128i a, int imm8)

Shift packed 16-bit integers in a right by imm8 while shifting in sign bits.

_mm_srai_epi32
__m128i _mm_srai_epi32(__m128i a, int imm8)

Shift packed 32-bit integers in a right by imm8 while shifting in sign bits.

_mm_srli_epi16
__m128i _mm_srli_epi16(__m128i a, int imm8)

Shift packed 16-bit integers in a right by imm8 while shifting in zeros.

_mm_srli_epi32
__m128i _mm_srli_epi32(__m128i a, int imm8)

Shift packed 32-bit integers in a right by imm8 while shifting in zeros.

_mm_srli_epi64
__m128i _mm_srli_epi64(__m128i a, int imm8)

Shift packed 64-bit integers in a right by imm8 while shifting in zeros.

_mm_srli_pd
__m128d _mm_srli_pd(__m128d v)

Shift v right by bytes bytes while shifting in zeros. #BONUS

_mm_srli_ps
__m128 _mm_srli_ps(__m128 v)

Shift v right by bytes bytes while shifting in zeros. #BONUS

_mm_srli_si128
__m128i _mm_srli_si128(__m128i v)

Shift v right by bytes bytes while shifting in zeros.

_mm_store_pd
void _mm_store_pd(double* mem_addr, __m128d a)

Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from a into memory. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.

_mm_store_pd1
void _mm_store_pd1(double* mem_addr, __m128d a)

Store the lower double-precision (64-bit) floating-point element from a into 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.

_mm_store_sd
void _mm_store_sd(double* mem_addr, __m128d a)

Store the lower double-precision (64-bit) floating-point element from a into memory. mem_addr does not need to be aligned on any particular boundary.

_mm_store_si128
void _mm_store_si128(__m128i* mem_addr, __m128i a)

Store 128-bits of integer data from a into memory. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.

_mm_storeh_pd
void _mm_storeh_pd(double* mem_addr, __m128d a)

Store the upper double-precision (64-bit) floating-point element from a into memory.

_mm_storel_pd
void _mm_storel_pd(double* mem_addr, __m128d a)

Store the lower double-precision (64-bit) floating-point element from a into memory.

_mm_storer_pd
void _mm_storer_pd(double* mem_addr, __m128d a)

Store 2 double-precision (64-bit) floating-point elements from a into memory in reverse order. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.

_mm_storeu_pd
void _mm_storeu_pd(double* mem_addr, __m128d a)

Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from a into memory. mem_addr does not need to be aligned on any particular boundary.

_mm_storeu_si128
void _mm_storeu_si128(__m128i* mem_addr, __m128i a)

Store 128-bits of integer data from a into memory. mem_addr does not need to be aligned on any particular boundary.

_mm_storeu_si16
void _mm_storeu_si16(void* mem_addr, __m128i a)

Store 16-bit integer from the first element of a into memory. mem_addr does not need to be aligned on any particular boundary.

_mm_storeu_si32
void _mm_storeu_si32(void* mem_addr, __m128i a)

Store 32-bit integer from the first element of a into memory. mem_addr does not need to be aligned on any particular boundary.

_mm_storeu_si64
void _mm_storeu_si64(void* mem_addr, __m128i a)

Store 64-bit integer from the first element of a into memory. mem_addr does not need to be aligned on any particular boundary.

_mm_stream_pd
void _mm_stream_pd(double* mem_addr, __m128d a)

Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. Note: follow non-temporal stores with _mm_sfence() when other threads will read the stored data.

_mm_stream_si128
void _mm_stream_si128(__m128i* mem_addr, __m128i a)

Store 128-bits of integer data from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated. Note: follow non-temporal stores with _mm_sfence() when other threads will read the stored data.

_mm_stream_si32
void _mm_stream_si32(int* mem_addr, int a)

Store 32-bit integer a into memory using a non-temporal hint to minimize cache pollution. If the cache line containing address mem_addr is already in the cache, the cache will be updated. Note: follow non-temporal stores with _mm_sfence() when other threads will read the stored data.

_mm_stream_si64
void _mm_stream_si64(long* mem_addr, long a)

Store 64-bit integer a into memory using a non-temporal hint to minimize cache pollution. If the cache line containing address mem_addr is already in the cache, the cache will be updated. Note: follow non-temporal stores with _mm_sfence() when other threads will read the stored data.

_mm_sub_epi16
__m128i _mm_sub_epi16(__m128i a, __m128i b)

Subtract packed 16-bit integers in b from packed 16-bit integers in a.

_mm_sub_epi32
__m128i _mm_sub_epi32(__m128i a, __m128i b)

Subtract packed 32-bit integers in b from packed 32-bit integers in a.

_mm_sub_epi64
__m128i _mm_sub_epi64(__m128i a, __m128i b)

Subtract packed 64-bit integers in b from packed 64-bit integers in a.

_mm_sub_epi8
__m128i _mm_sub_epi8(__m128i a, __m128i b)

Subtract packed 8-bit integers in b from packed 8-bit integers in a.

_mm_sub_pd
__m128d _mm_sub_pd(__m128d a, __m128d b)

Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a.

_mm_sub_sd
__m128d _mm_sub_sd(__m128d a, __m128d b)

Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store that in the lower element of result, and copy the upper element from a to the upper element of result.

_mm_sub_si64
__m64 _mm_sub_si64(__m64 a, __m64 b)

Subtract 64-bit integer b from 64-bit integer a.

_mm_subs_epi16
__m128i _mm_subs_epi16(__m128i a, __m128i b)

Subtract packed signed 16-bit integers in b from packed signed 16-bit integers in a using signed saturation.

_mm_subs_epi8
__m128i _mm_subs_epi8(__m128i a, __m128i b)

Subtract packed signed 8-bit integers in b from packed signed 8-bit integers in a using signed saturation.

_mm_subs_epu16
__m128i _mm_subs_epu16(__m128i a, __m128i b)

Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using unsigned saturation.

_mm_subs_epu8
__m128i _mm_subs_epu8(__m128i a, __m128i b)

Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using unsigned saturation.

_mm_undefined_pd
__m128d _mm_undefined_pd()

Return vector of type __m128d with undefined elements.

_mm_undefined_si128
__m128i _mm_undefined_si128()

Return vector of type __m128i with undefined elements.

_mm_unpackhi_epi16
__m128i _mm_unpackhi_epi16(__m128i a, __m128i b)

Unpack and interleave 16-bit integers from the high half of a and b.

_mm_unpackhi_epi32
__m128i _mm_unpackhi_epi32(__m128i a, __m128i b)

Unpack and interleave 32-bit integers from the high half of a and b.

_mm_unpackhi_epi64
__m128i _mm_unpackhi_epi64(__m128i a, __m128i b)

Unpack and interleave 64-bit integers from the high half of a and b.

_mm_unpackhi_epi8
__m128i _mm_unpackhi_epi8(__m128i a, __m128i b)

Unpack and interleave 8-bit integers from the high half of a and b.

_mm_unpackhi_pd
__m128d _mm_unpackhi_pd(__m128d a, __m128d b)

Unpack and interleave double-precision (64-bit) floating-point elements from the high half of a and b.

_mm_unpacklo_epi16
__m128i _mm_unpacklo_epi16(__m128i a, __m128i b)

Unpack and interleave 16-bit integers from the low half of a and b.

_mm_unpacklo_epi32
__m128i _mm_unpacklo_epi32(__m128i a, __m128i b)

Unpack and interleave 32-bit integers from the low half of a and b.

_mm_unpacklo_epi64
__m128i _mm_unpacklo_epi64(__m128i a, __m128i b)

Unpack and interleave 64-bit integers from the low half of a and b.

_mm_unpacklo_epi8
__m128i _mm_unpacklo_epi8(__m128i a, __m128i b)

Unpack and interleave 8-bit integers from the low half of a and b.

_mm_unpacklo_pd
__m128d _mm_unpacklo_pd(__m128d a, __m128d b)

Unpack and interleave double-precision (64-bit) floating-point elements from the low half of a and b.

_mm_xor_pd
__m128d _mm_xor_pd(__m128d a, __m128d b)

Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in a and b.

_mm_xor_si128
__m128i _mm_xor_si128(__m128i a, __m128i b)

Compute the bitwise XOR of 128 bits (representing integer data) in a and b.

Meta