- 浏览: 964310 次
文章分类
最新评论
-
l67721363:
感谢分享,要是有各个函数性能比较就好了。
SQL优化 数据库优化 -
hanmiao:
此图片来自QQ空间,未经允许不可引用。
Hacking QQ空间
SSE3和SSSE3 Intrinsics各函数介绍
SIMD相关头文件包括:
//#include <ivec.h>//MMX //#include <fvec.h>//SSE(also include ivec.h) //#include <dvec.h>//SSE2(also include fvec.h) #include <mmintrin.h> //MMX #include <xmmintrin.h> //SSE(include mmintrin.h) #include <emmintrin.h> //SSE2(include xmmintrin.h) #include <pmmintrin.h> //SSE3(include emmintrin.h) #include <tmmintrin.h>//SSSE3(include pmmintrin.h) #include <smmintrin.h>//SSE4.1(include tmmintrin.h) #include <nmmintrin.h>//SSE4.2(include smmintrin.h) #include <wmmintrin.h>//AES(include nmmintrin.h) #include <immintrin.h>//AVX(include wmmintrin.h) #include <intrin.h>//(include immintrin.h)
mmintrin.h为MMX 头文件,其中__m64的定义为:
/*
 * __m64: the MMX vector type. A union, so every member below is a
 * different lane view of the same storage (1 x 64-bit, 2 x 32-bit,
 * 4 x 16-bit or 8 x 8-bit). _CRT_ALIGN(8) presumably requests 8-byte
 * alignment -- the macro is defined outside this excerpt.
 */
typedef union __declspec(intrin_type) _CRT_ALIGN(8) __m64
{
    unsigned __int64    m64_u64;        /* whole value, unsigned        */
    float               m64_f32[2];     /* two float lanes              */
    __int8              m64_i8[8];      /* eight signed byte lanes      */
    __int16             m64_i16[4];     /* four signed 16-bit lanes     */
    __int32             m64_i32[2];     /* two signed 32-bit lanes      */
    __int64             m64_i64;        /* whole value, signed          */
    unsigned __int8     m64_u8[8];      /* eight unsigned byte lanes    */
    unsigned __int16    m64_u16[4];     /* four unsigned 16-bit lanes   */
    unsigned __int32    m64_u32[2];     /* two unsigned 32-bit lanes    */
} __m64;
xmmintrin.h为SSE 头文件,此头文件里包含MMX头文件,其中__m128的定义为:
/*
 * __m128: the SSE vector type. A union, so every member below is a
 * different lane view of the same storage; the primary view is four
 * floats. _CRT_ALIGN(16) presumably requests 16-byte alignment -- the
 * macro is defined outside this excerpt.
 */
typedef union __declspec(intrin_type) _CRT_ALIGN(16) __m128
{
    float               m128_f32[4];    /* four float lanes             */
    unsigned __int64    m128_u64[2];    /* two unsigned 64-bit lanes    */
    __int8              m128_i8[16];    /* sixteen signed byte lanes    */
    __int16             m128_i16[8];    /* eight signed 16-bit lanes    */
    __int32             m128_i32[4];    /* four signed 32-bit lanes     */
    __int64             m128_i64[2];    /* two signed 64-bit lanes      */
    unsigned __int8     m128_u8[16];    /* sixteen unsigned byte lanes  */
    unsigned __int16    m128_u16[8];    /* eight unsigned 16-bit lanes  */
    unsigned __int32    m128_u32[4];    /* four unsigned 32-bit lanes   */
} __m128;
emmintrin.h为SSE2头文件,此头文件里包含SSE头文件,其中__m128i和__m128d的定义为:
/*
 * __m128i: the SSE2 integer vector type. A union, so every member below
 * is a different integer lane view of the same storage. _CRT_ALIGN(16)
 * presumably requests 16-byte alignment -- the macro is defined outside
 * this excerpt.
 */
typedef union __declspec(intrin_type) _CRT_ALIGN(16) __m128i
{
    __int8              m128i_i8[16];   /* sixteen signed byte lanes    */
    __int16             m128i_i16[8];   /* eight signed 16-bit lanes    */
    __int32             m128i_i32[4];   /* four signed 32-bit lanes     */
    __int64             m128i_i64[2];   /* two signed 64-bit lanes      */
    unsigned __int8     m128i_u8[16];   /* sixteen unsigned byte lanes  */
    unsigned __int16    m128i_u16[8];   /* eight unsigned 16-bit lanes  */
    unsigned __int32    m128i_u32[4];   /* four unsigned 32-bit lanes   */
    unsigned __int64    m128i_u64[2];   /* two unsigned 64-bit lanes    */
} __m128i;

/*
 * __m128d: the SSE2 double-precision vector type. A struct (not a union)
 * with a single view: two double lanes.
 */
typedef struct __declspec(intrin_type) _CRT_ALIGN(16) __m128d
{
    double              m128d_f64[2];   /* two double lanes             */
} __m128d;
pmmintrin.h为SSE3头文件,其文件中各函数的介绍:
/* SSE3 intrinsic prototypes. In each comment a/b are the input vectors and r is the returned vector, with elements numbered from 0. NOTE(review): line breaks were lost in this listing, so each // comment runs into the declaration that follows it. */
/*New Single precision vector instructions*/ //a=(a0, a1, a2, a3), b=(b0, b1, b2, b3) //then r0=a0-b0, r1=a1+b1, r2=a2-b2, r3=a3+b3 extern __m128 _mm_addsub_ps(__m128 a, __m128 b); //a=(a0, a1, a2, a3), b=(b0, b1, b2, b3) //then r0=a0+a1, r1=a2+a3, r2=b0+b1, r3=b2+b3 extern __m128 _mm_hadd_ps(__m128 a, __m128 b); //a=(a0, a1, a2, a3), b=(b0, b1, b2, b3) //then r0=a0-a1, r1=a2-a3, r2=b0-b1, r3=b2-b3 extern __m128 _mm_hsub_ps(__m128 a, __m128 b); //a=(a0, a1, a2, a3), then r0=a1, r1=a1, r2=a3, r3=a3 extern __m128 _mm_movehdup_ps(__m128 a); //a=(a0, a1, a2, a3), then r0=a0, r1=a0, r2=a2, r3=a2 extern __m128 _mm_moveldup_ps(__m128 a); /*New double precision vector instructions*/ //a=(a0, a1), b=(b0, b1), then r0=a0-b0, r1=a1+b1 extern __m128d _mm_addsub_pd(__m128d a, __m128d b); //a=(a0, a1), b=(b0, b1), then r0=a0+a1, r1=b0+b1 extern __m128d _mm_hadd_pd(__m128d a, __m128d b); //a=(a0, a1), b=(b0, b1), then r0=a0-a1, r1=b0-b1 extern __m128d _mm_hsub_pd(__m128d a, __m128d b); //r0=r1=dp[0] extern __m128d _mm_loaddup_pd(double const * dp); //a=(a0, a1), then r0=r1=a0 extern __m128d _mm_movedup_pd(__m128d a); /*New unaligned integer vector load instruction*/ //load unaligned data using _mm_lddqu_si128 for best performance //If the address is not 16-byte aligned, the load begins at the //highest 16-byte-aligned address less than the address p extern __m128i _mm_lddqu_si128(__m128i const *p); /*Miscellaneous new instructions, For _mm_monitor p goes in eax, extensions goes in ecx, hints goes in edx*/ //The monitor instruction sets up an address range for hardware monitoring. //The values of extensions and hints correspond to the values in ECX and EDX //used by the monitor instruction. They are reserved for future use and should //be zero for the SSE3-enabled processor. For more information, //see the Intel or AMD documentation as appropriate. 
extern void _mm_monitor(void const *p, unsigned extensions, unsigned hints); /*Miscellaneous new instructions, For _mm_mwait, extensions goes in ecx, hints goes in eax*/ //The mwait instruction instructs the processor to enter a wait state in which the //processor waits for an event or a store to the address range previously set up //by the monitor instruction. The values of extensions //and hints are loaded into the ECX and EAX registers. For more information, //see the Intel or AMD documentation as appropriate. extern void _mm_mwait(unsigned extensions, unsigned hints);
tmmintrin.h为SSSE3头文件,其文件中各函数的介绍:
/* SSSE3 intrinsic prototypes. In each comment a/b are the input vectors and r is the returned vector, with elements numbered from 0. NOTE(review): line breaks were lost in this listing, so each // comment runs into the declaration that follows it. */
/*Add horizontally packed [saturated] words, double words, {X,}MM2/m{128,64} (b) to {X,}MM1 (a).*/ //a=(a0, a1, a2, a3, a4, a5, a6, a7), b=(b0, b1, b2, b3, b4, b5, b6, b7) //then r0=a0+a1, r1=a2+a3, r2=a4+a5, r3=a6+a7, r4=b0+b1, r5=b2+b3, r6=b4+b5, r7=b6+b7 extern __m128i _mm_hadd_epi16 (__m128i a, __m128i b); //a=(a0, a1, a2, a3), b=(b0, b1, b2, b3) //then r0=a0+a1, r1=a2+a3, r2=b0+b1, r3=b2+b3 extern __m128i _mm_hadd_epi32 (__m128i a, __m128i b); //SATURATE_16(x) is ((x > 32767) ? 32767 : ((x < -32768) ? -32768 : x)) //a=(a0, a1, a2, a3, a4, a5, a6, a7), b=(b0, b1, b2, b3, b4, b5, b6, b7) //then r0=SATURATE_16(a0+a1), ..., r3=SATURATE_16(a6+a7), //r4=SATURATE_16(b0+b1), ..., r7=SATURATE_16(b6+b7) extern __m128i _mm_hadds_epi16 (__m128i a, __m128i b); //a=(a0, a1, a2, a3), b=(b0, b1, b2, b3) //then r0=a0+a1, r1=a2+a3, r2=b0+b1, r3=b2+b3 extern __m64 _mm_hadd_pi16 (__m64 a, __m64 b); //a=(a0, a1), b=(b0, b1), then r0=a0+a1, r1=b0+b1 extern __m64 _mm_hadd_pi32 (__m64 a, __m64 b); //SATURATE_16(x) is ((x > 32767) ? 32767 : ((x < -32768) ? -32768 : x)) //a=(a0, a1, a2, a3), b=(b0, b1, b2, b3) //then r0=SATURATE_16(a0+a1), r1=SATURATE_16(a2+a3), //r2=SATURATE_16(b0+b1), r3=SATURATE_16(b2+b3) extern __m64 _mm_hadds_pi16 (__m64 a, __m64 b); /*Subtract horizontally packed [saturated] words, double words, {X,}MM2/m{128,64} (b) from {X,}MM1 (a).*/ //a=(a0, a1, a2, a3, a4, a5, a6, a7), b=(b0, b1, b2, b3, b4, b5, b6, b7) //then r0=a0-a1, r1=a2-a3, r2=a4-a5, r3=a6-a7, r4=b0-b1, r5=b2-b3, r6=b4-b5, r7=b6-b7 extern __m128i _mm_hsub_epi16 (__m128i a, __m128i b); //a=(a0, a1, a2, a3), b=(b0, b1, b2, b3) //then r0=a0-a1, r1=a2-a3, r2=b0-b1, r3=b2-b3 extern __m128i _mm_hsub_epi32 (__m128i a, __m128i b); //SATURATE_16(x) is ((x > 32767) ? 32767 : ((x < -32768) ? 
-32768 : x)) //a=(a0, a1, a2, a3, a4, a5, a6, a7), b=(b0, b1, b2, b3, b4, b5, b6, b7) //then r0=SATURATE_16(a0-a1), ..., r3=SATURATE_16(a6-a7), //r4=SATURATE_16(b0-b1), ..., r7=SATURATE_16(b6-b7) extern __m128i _mm_hsubs_epi16 (__m128i a, __m128i b); //a=(a0, a1, a2, a3), b=(b0, b1, b2, b3) //then r0=a0-a1, r1=a2-a3, r2=b0-b1, r3=b2-b3 extern __m64 _mm_hsub_pi16 (__m64 a, __m64 b); //a=(a0, a1), b=(b0, b1), then r0=a0-a1, r1=b0-b1 extern __m64 _mm_hsub_pi32 (__m64 a, __m64 b); //SATURATE_16(x) is ((x > 32767) ? 32767 : ((x < -32768) ? -32768 : x)) //a=(a0, a1, a2, a3), b=(b0, b1, b2, b3) //then r0=SATURATE_16(a0-a1), r1=SATURATE_16(a2-a3), //r2=SATURATE_16(b0-b1), r3=SATURATE_16(b2-b3) extern __m64 _mm_hsubs_pi16 (__m64 a, __m64 b); /*Multiply and add packed words, {X,}MM2/m{128,64} (b) to {X,}MM1 (a).*/ //SATURATE_16(x) is ((x > 32767) ? 32767 : ((x < -32768) ? -32768 : x)) //a=(a0, a1, a2, ..., a13, a14, a15), b=(b0, b1, b2, ..., b13, b14, b15) //then r0=SATURATE_16((a0*b0)+(a1*b1)), ..., r7=SATURATE_16((a14*b14)+(a15*b15)) //Parameter a contains unsigned bytes. Parameter b contains signed bytes. extern __m128i _mm_maddubs_epi16 (__m128i a, __m128i b); //SATURATE_16(x) is ((x > 32767) ? 32767 : ((x < -32768) ? -32768 : x)) //a=(a0, a1, a2, a3, a4, a5, a6, a7), b=(b0, b1, b2, b3, b4, b5, b6, b7) //then r0=SATURATE_16((a0*b0)+(a1*b1)), ..., r3=SATURATE_16((a6*b6)+(a7*b7)) //Parameter a contains unsigned bytes. Parameter b contains signed bytes. 
extern __m64 _mm_maddubs_pi16 (__m64 a, __m64 b); /*Packed multiply high integers with round and scaling, {X,}MM2/m{128,64} (b) to {X,}MM1 (a).*/ //a=(a0, a1, a2, a3, a4, a5, a6, a7), b=(b0, b1, b2, b3, b4, b5, b6, b7) //then r0=INT16(((a0*b0)+0x4000) >> 15), ..., r7=INT16(((a7*b7)+0x4000) >> 15) extern __m128i _mm_mulhrs_epi16 (__m128i a, __m128i b); //a=(a0, a1, a2, a3), b=(b0, b1, b2, b3) //then r0=INT16(((a0*b0)+0x4000) >> 15), ..., r3=INT16(((a3*b3)+0x4000) >> 15) extern __m64 _mm_mulhrs_pi16 (__m64 a, __m64 b); /*Packed shuffle bytes {X,}MM2/m{128,64} (b) by {X,}MM1 (a).*/ //SELECT(a, n) extracts the nth 8-bit parameter from a. The 0th 8-bit parameter //is the least significant 8-bits, b=(b0, b1, b2, ..., b13, b14, b15), b is mask //then r0 = (b0 & 0x80) ? 0 : SELECT(a, b0 & 0x0f), ..., //r15 = (b15 & 0x80) ? 0 : SELECT(a, b15 & 0x0f) extern __m128i _mm_shuffle_epi8 (__m128i a, __m128i b); //SELECT(a, n) extracts the nth 8-bit parameter from a. The 0th 8-bit parameter //is the least significant 8-bits, b=(b0, b1, ..., b7), b is mask //then r0= (b0 & 0x80) ? 0 : SELECT(a, b0 & 0x07),..., //r7=(b7 & 0x80) ? 0 : SELECT(a, b7 & 0x07) extern __m64 _mm_shuffle_pi8 (__m64 a, __m64 b); /*Packed byte, word, double word sign, {X,}MM2/m{128,64} (b) to {X,}MM1 (a).*/ //a=(a0, a1, a2, ..., a13, a14, a15), b=(b0, b1, b2, ..., b13, b14, b15) //then r0=(b0 < 0) ? -a0 : ((b0 == 0) ? 0 : a0), ..., //r15= (b15 < 0) ? -a15 : ((b15 == 0) ? 0 : a15) extern __m128i _mm_sign_epi8 (__m128i a, __m128i b); //a=(a0, a1, a2, a3, a4, a5, a6, a7), b=(b0, b1, b2, b3, b4, b5, b6, b7) //then r0=(b0 < 0) ? -a0 : ((b0 == 0) ? 0 : a0), ..., //r7= (b7 < 0) ? -a7 : ((b7 == 0) ? 0 : a7) extern __m128i _mm_sign_epi16 (__m128i a, __m128i b); //a=(a0, a1, a2, a3), b=(b0, b1, b2, b3) //then r0=(b0 < 0) ? -a0 : ((b0 == 0) ? 0 : a0), ..., //r3= (b3 < 0) ? -a3 : ((b3 == 0) ? 0 : a3) extern __m128i _mm_sign_epi32 (__m128i a, __m128i b); //a=(a0, a1, a2, a3, a4, a5, a6, a7), b=(b0, b1, b2, b3, b4, b5, b6, b7) //then r0=(b0 < 0) ? 
-a0 : ((b0 == 0) ? 0 : a0), ..., //r7= (b7 < 0) ? -a7 : ((b7 == 0) ? 0 : a7) extern __m64 _mm_sign_pi8 (__m64 a, __m64 b); //a=(a0, a1, a2, a3), b=(b0, b1, b2, b3) //then r0=(b0 < 0) ? -a0 : ((b0 == 0) ? 0 : a0), ..., //r3= (b3 < 0) ? -a3 : ((b3 == 0) ? 0 : a3) extern __m64 _mm_sign_pi16 (__m64 a, __m64 b); //a=(a0, a1), b=(b0, b1), then r0=(b0 < 0) ? -a0 : ((b0 == 0) ? 0 : a0), //r1= (b1 < 0) ? -a1 : ((b1 == 0) ? 0 : a1) extern __m64 _mm_sign_pi32 (__m64 a, __m64 b); /*Packed align and shift right by n*8 bits, {X,}MM2/m{128,64} (b) to {X,}MM1 (a).*/ //n: A constant that specifies how many bytes the interim result will be //shifted to the right. If n >= 32, the result value is zero //CONCAT(a, b) is the 256-bit unsigned intermediate value that is a concatenation of //parameters a and b. The result is this intermediate value shifted right by n bytes. //then r = low 128 bits of (CONCAT(a, b) >> (n * 8)) extern __m128i _mm_alignr_epi8 (__m128i a, __m128i b, int n); //n: An integer constant that specifies how many bytes to shift the interim //result to the right. If n >= 16, the result value is zero //CONCAT(a, b) is the 128-bit unsigned intermediate value that is formed by //concatenating parameters a and b. The result value is the rightmost 64 bits after //shifting this intermediate result right by n bytes //then r = (CONCAT(a, b) >> (n * 8)) & 0xffffffffffffffff extern __m64 _mm_alignr_pi8 (__m64 a, __m64 b, int n); /*Packed byte, word, double word absolute value, {X,}MM2/m{128,64} (b) to {X,}MM1 (a).*/ //a=(a0, a1, a2, ..., a13, a14, a15) //then r0 = (a0 < 0) ? -a0 : a0, ..., r15 = (a15 < 0) ? -a15 : a15 extern __m128i _mm_abs_epi8 (__m128i a); //a=(a0, a1, a2, a3, a4, a5, a6, a7) //then r0 = (a0 < 0) ? -a0 : a0, ..., r7 = (a7 < 0) ? -a7 : a7 extern __m128i _mm_abs_epi16 (__m128i a); //a=(a0, a1, a2, a3) //then r0 = (a0 < 0) ? -a0 : a0, ..., r3 = (a3 < 0) ? -a3 : a3 extern __m128i _mm_abs_epi32 (__m128i a); //a=(a0, a1, a2, a3, a4, a5, a6, a7) //then r0 = (a0 < 0) ? -a0 : a0, ..., r7 = (a7 < 0) ? 
-a7 : a7 extern __m64 _mm_abs_pi8 (__m64 a); //a=(a0, a1, a2, a3) //then r0 = (a0 < 0) ? -a0 : a0, ..., r3 = (a3 < 0) ? -a3 : a3 extern __m64 _mm_abs_pi16 (__m64 a); //a=(a0, a1), then r0 = (a0 < 0) ? -a0 : a0, r1 = (a1 < 0) ? -a1 : a1 extern __m64 _mm_abs_pi32 (__m64 a);
相关推荐
微软不知出没出支持SSE3、SSSE3、SSE4.1指令集的汇编编译器,我用的ml.exe版本是6.15.8803是不支持。因为实际要用到这些指令,所以写了个宏,能在源程序中自由用这些指令。对喜欢用汇编,用SSE3、SSSE3、SSE4.1指令...
使用SIMD的C ++图像处理库:SSE,SSE2,SSE3,SSSE3,SSE4.1,SSE4.2,AVX,AVX2,AVX-512,VMX(Altivec)和VSX(Power7),NEON for ARM。 简介Simd库是一个免费的开源图像处理和机器学习库,专为C和C ++程序员...
Simd:使用以下SIMD的C ++图像处理和机器学习库:SSE,SSE2,SSE3,SSSE3,SSE4.1,SSE4.2,AVX,AVX2,AVX-512,VMX(Altivec)和VSX(Power7),NEON臂
软件介绍: c-play v2.0b39官网最新版,这是一款非常不错的播放器,适当设置一下,...最新的Intel CPU(酷睿双核处理器等)支持SSSE3和SSE4.1。具体请使用CPU-Z来确定CPU指令集支持(SSE2,SSE3,SSSE3,SSE4)。
3 应用编程全面而完整:既有通用编程,又有利用X86处理器的浮点和数学运算编程以及利用MMX技术和SSE2 SSE3,SSSE3的多媒体和科学计算应用编程。 4 本书包括了应用编程的详尽内容,也包含了系统编程的主要内容,具有...
3 应用编程全面而完整:既有通用编程,又有利用X86处理器的浮点和数学运算编程以及利用MMX技术和SSE2 SSE3,SSSE3的多媒体和科学计算应用编程。 4 本书包括了应用编程的详尽内容,也包含了系统编程的主要内容,具有...
Support for most SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AES, AVX, and AVX2 intrinsics (X64 only). Integrated source code editor with call tips and symbol browsing. Integrated source-level debugger. ...
(MMX, SSE, SSE2, SSE3, SSSE3 andSSE4)Priya Periaswamyy yComputer Architecture (CSE5302)Overview• MMX (MultiMedia eXtention) Architecture MMX I t ti• MMX Instructions• SSE (Streaming SIMD...
SSSE32动态库说明.pdf,详细介绍了SSSE32的调方接口及入参和出参。
适用于版本为6.15.8803汇编编译器ml.exe; 定义Unicode字符串的宏; 编译SSE3、SSSE3、SSE4.1指令的宏; 定义接口和调用接口中方法的宏。
个人整理的x86汇编指令列表,包括的指令集有:FPU, MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, VMX, CLMUL, AES
社保卡二次开发库,主要用于社会保障卡系统的软件的二次开发
With this plugin, you can now debug FPU, MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2 and VMX without problems. OllyDBG里有些特殊指令,默認的反匯編引擎無法識別.. 用這個插件就可以搞定啦..下面是例子: ...
它对应于“arm_neon.h”头文件中定义的ARM NEON内部函数和对应的x86编译器头文件中定义的x86 SSE(直到SSE4.2)内部函数的对应关系(或实际移植)。 要利用此文件,只需将其包含在使用ARM NEON intrinsics而不是“ ...
C image processing library with using of SIMD: SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2, AVX-512, VMX(Altivec) and VSX(Power7), NEON for ARM.
Xbyak 5.991; C ++的用于x86(IA32),x64(AMD64,x86-64)的JIT汇编器 抽象的 Xbyak是一个C ++头文件库,可动态组装x86(IA32),x64(AMD64,x86-64)助记符。 Xbyak的发音是kəi-bja-k 。 它是用日语单词“ 来...
FCML 这是用于IA-32和Intel 64体系结构的通用机器代码处理库。 该库支持类似UNIX的系统以及Windows,...,SSE,SSE2,SSE3,SSSE3,SSE4.1,SSE4.2,SSE4A,AVX,AVX2,AES,TBM,BMI1,BMI2,HLE,ADX,CLMUL,RDRA
cPlay_2_0b39_sse2支持WAV、FLAC、CUE,不支持APE,可修改cMP配置文件而调用foobar 2000等其他播放程序
特里顿Triton是软件包管理器的软件包集合。 linux发行版源代码位于nixos/文件夹中。讨论频道社区: +triton:matrix.org旧版文档支持的... SSE3 , SSSE3 , SSE4 , SSE4.1 , SSE4.2 , AES (又名至少是Intel Westme
详解段错误的定义及列举最常用的调试方法