Difference between revisions of "Tests of a SIMD version of DVector3"
From GlueXWiki
Line 22: | Line 22: | ||
make orthogonal v 0.142 0.273 1.92 | make orthogonal v 0.142 0.273 1.92 | ||
v3 = v1 x v2 0.107 0.193 1.80 | v3 = v1 x v2 0.107 0.193 1.80 | ||
+ | </pre> | ||
+ | |||
+ | * The basic unit is a union: | ||
+ | |||
+ | <pre> | ||
+ | union dvec{ | ||
+ | __m128d v; | ||
+ | double d[2]; | ||
+ | }; | ||
+ | </pre> | ||
+ | |||
+ | * A simple example: | ||
+ | |||
+ | <pre> | ||
+ | // Add two vectors | ||
+ | inline DVector3 operator+(const DVector3 &v1,const DVector3 &v2){ | ||
+ | return DVector3(_mm_add_pd(v1.GetVxy(),v2.GetVxy()), | ||
+ | _mm_add_pd(v1.GetVzx(),v2.GetVzx())); | ||
+ | }; | ||
+ | </pre> | ||
+ | |||
+ | * A more complicated example: | ||
+ | |||
+ | <pre> | ||
+ | |||
+ | // Rotate by angle a about the axis specified by v. The following ugly mess | ||
+ | // of SIMD instructions does the following: | ||
+ | // sa = sin(a), ca= cos(a) | ||
+ | // dx=vx/|v|, dy=vy/|v|, dz=vz/|v| | ||
+ | // |x'| |ca+(1-ca)*dx*dx (1-ca)*dx*dy-sa*dz (1-ca)*dx*dz+sa*dy||x| | ||
+ | // |y'|=| (1-ca)*dy*dx+sa*dz ca+(1-ca)*dy*dy (1-ca)*dy*dz-sa*dx||y| | ||
+ | // |z'| | (1-ca)*dz*dx-sa*dy (1-ca)*dz*dy+sa*dx ca+(1-ca)*dz*dz ||z| | ||
+ | // | ||
+ | DVector3 Rotate(const double a,const DVector3 &v){ | ||
+ | if (a!=0){ | ||
+ | double vmag=v.Mag(); | ||
+ | if (vmag==0.0){ | ||
+ | //cerr << "axis vector has zero magnitude." <<endl; | ||
+ | } | ||
+ | else{ | ||
+ | __m128d xx=_mm_set1_pd(x()); | ||
+ | __m128d yy=_mm_set1_pd(y()); | ||
+ | __m128d zz=_mm_set1_pd(z()); | ||
+ | double sa=sin(a); | ||
+ | double ca=cos(a); | ||
+ | __m128d one_minus_ca=_mm_set1_pd(1.-ca); | ||
+ | __m128d sa_zero=_mm_setr_pd(sa,0.); | ||
+ | __m128d ca_zero=_mm_setr_pd(ca,0.); | ||
+ | __m128d zero_sa=_mm_setr_pd(0.,sa); | ||
+ | __m128d zero_ca=_mm_setr_pd(0.,ca); | ||
+ | __m128d scale=_mm_set1_pd(1./vmag); | ||
+ | __m128d dxdy=_mm_mul_pd(scale,v.GetVxy()); | ||
+ | __m128d dzdx=_mm_mul_pd(scale,v.GetVzx()); | ||
+ | __m128d dxdx=_mm_mul_pd(scale,_mm_set1_pd(v.x())); | ||
+ | __m128d dydy=_mm_mul_pd(scale,_mm_set1_pd(v.y())); | ||
+ | __m128d dzdz=_mm_mul_pd(scale,_mm_set1_pd(v.z())); | ||
+ | __m128d AD=_mm_add_pd(_mm_add_pd(ca_zero, | ||
+ | _mm_mul_pd(one_minus_ca, | ||
+ | _mm_mul_pd(dxdy,dxdx))), | ||
+ | _mm_mul_pd(zero_sa,dzdz)); | ||
+ | __m128d BE=_mm_sub_pd(_mm_add_pd(zero_ca, | ||
+ | _mm_mul_pd(one_minus_ca, | ||
+ | _mm_mul_pd(dxdy,dydy))), | ||
+ | _mm_mul_pd(sa_zero,dzdz)); | ||
+ | __m128d CF=_mm_sub_pd(_mm_add_pd(_mm_mul_pd(one_minus_ca, | ||
+ | _mm_mul_pd(dxdy,dzdz)), | ||
+ | _mm_mul_pd(sa_zero,dydy)), | ||
+ | _mm_mul_pd(zero_sa,dxdx)); | ||
+ | __m128d IA=_mm_sub_pd(_mm_add_pd(zero_ca, | ||
+ | _mm_mul_pd(one_minus_ca, | ||
+ | _mm_mul_pd(dzdx,dxdx))), | ||
+ | _mm_mul_pd(sa_zero,dydy)); | ||
+ | __m128d JB=_mm_sub_pd(_mm_add_pd(_mm_mul_pd(sa_zero,dxdx), | ||
+ | _mm_mul_pd(one_minus_ca, | ||
+ | _mm_mul_pd(dzdx,dydy))), | ||
+ | _mm_mul_pd(zero_sa,dzdz)); | ||
+ | __m128d KC=_mm_add_pd(_mm_add_pd(ca_zero, | ||
+ | _mm_mul_pd(one_minus_ca, | ||
+ | _mm_mul_pd(dzdx,dzdz))), | ||
+ | _mm_mul_pd(zero_sa,dydy)); | ||
+ | |||
+ | xy.v=_mm_add_pd(_mm_add_pd(_mm_mul_pd(AD,xx), _mm_mul_pd(BE,yy)), | ||
+ | _mm_mul_pd(CF,zz)); | ||
+ | zx.v=_mm_add_pd(_mm_add_pd(_mm_mul_pd(IA,xx), _mm_mul_pd(JB,yy)), | ||
+ | _mm_mul_pd(KC,zz)); | ||
+ | yz.v=_mm_setr_pd(y(),z()); | ||
+ | |||
+ | } | ||
+ | } | ||
+ | return *this; | ||
+ | } | ||
</pre> | </pre> |
Latest revision as of 17:45, 4 May 2010
- SIMD (Single-Instruction Multiple-Data) instructions provide means of doing multiple calculations with or manipulations of several pieces of data with a single operation
- Size of special registers is 128 bits: can do 4 single-precision floating point calculations (SSE) or 2 double-precision floating point calculations (SSE2) at the same time
- Basic arithmetic and logical operations are supported, as are square roots
- Implemented DVector3 with SSE2 instructions
- Three pairs of coordinates: xy,yz,zx
- gcc compiler flags: -msse2 -mfpmath=sse -O2
- Compare to TVector3 on ifarml6: 10 million events with randomly generated vectors
Operation DVector3 time (s) TVector3 time (s) T3/D3 ----------------------------------------------------------------- v2=-v1 0.054 0.171 3.17 v3=v1+v2 0.092 0.191 2.08 v3+=v1 0.056 0.048 0.86 <- v3=v1-v2 0.092 0.194 2.11 v2=k v1 0.054 0.174 3.24 v2*=k 0.030 0.018 0.60 <- SetMag 0.198 0.198 1.00 SetXYZ 0.007 0.008 1.14 SetMagThetaPhi 0.680 0.890 1.31 Expensive! Rotate(a,v) 0.625 1.528 2.44 " RotateZ(a) 0.353 0.402 1.14 make orthogonal v 0.142 0.273 1.92 v3 = v1 x v2 0.107 0.193 1.80
- The basic unit is a union:
union dvec{ __m128d v; double d[2]; };
- A simple example:
// Add two vectors inline DVector3 operator+(const DVector3 &v1,const DVector3 &v2){ return DVector3(_mm_add_pd(v1.GetVxy(),v2.GetVxy()), _mm_add_pd(v1.GetVzx(),v2.GetVzx())); };
- A more complicated example:
// Rotate by angle a about the axis specified by v. The following ugly mess // of SIMD instructions does the following: // sa = sin(a), ca= cos(a) // dx=vx/|v|, dy=vy/|v|, dz=vz/|v| // |x'| |ca+(1-ca)*dx*dx (1-ca)*dx*dy-sa*dz (1-ca)*dx*dz+sa*dy||x| // |y'|=| (1-ca)*dy*dx+sa*dz ca+(1-ca)*dy*dy (1-ca)*dy*dz-sa*dx||y| // |z'| | (1-ca)*dz*dx-sa*dy (1-ca)*dz*dy+sa*dx ca+(1-ca)*dz*dz ||z| // DVector3 Rotate(const double a,const DVector3 &v){ if (a!=0){ double vmag=v.Mag(); if (vmag==0.0){ //cerr << "axis vector has zero magnitude." <<endl; } else{ __m128d xx=_mm_set1_pd(x()); __m128d yy=_mm_set1_pd(y()); __m128d zz=_mm_set1_pd(z()); double sa=sin(a); double ca=cos(a); __m128d one_minus_ca=_mm_set1_pd(1.-ca); __m128d sa_zero=_mm_setr_pd(sa,0.); __m128d ca_zero=_mm_setr_pd(ca,0.); __m128d zero_sa=_mm_setr_pd(0.,sa); __m128d zero_ca=_mm_setr_pd(0.,ca); __m128d scale=_mm_set1_pd(1./vmag); __m128d dxdy=_mm_mul_pd(scale,v.GetVxy()); __m128d dzdx=_mm_mul_pd(scale,v.GetVzx()); __m128d dxdx=_mm_mul_pd(scale,_mm_set1_pd(v.x())); __m128d dydy=_mm_mul_pd(scale,_mm_set1_pd(v.y())); __m128d dzdz=_mm_mul_pd(scale,_mm_set1_pd(v.z())); __m128d AD=_mm_add_pd(_mm_add_pd(ca_zero, _mm_mul_pd(one_minus_ca, _mm_mul_pd(dxdy,dxdx))), _mm_mul_pd(zero_sa,dzdz)); __m128d BE=_mm_sub_pd(_mm_add_pd(zero_ca, _mm_mul_pd(one_minus_ca, _mm_mul_pd(dxdy,dydy))), _mm_mul_pd(sa_zero,dzdz)); __m128d CF=_mm_sub_pd(_mm_add_pd(_mm_mul_pd(one_minus_ca, _mm_mul_pd(dxdy,dzdz)), _mm_mul_pd(sa_zero,dydy)), _mm_mul_pd(zero_sa,dxdx)); __m128d IA=_mm_sub_pd(_mm_add_pd(zero_ca, _mm_mul_pd(one_minus_ca, _mm_mul_pd(dzdx,dxdx))), _mm_mul_pd(sa_zero,dydy)); __m128d JB=_mm_sub_pd(_mm_add_pd(_mm_mul_pd(sa_zero,dxdx), _mm_mul_pd(one_minus_ca, _mm_mul_pd(dzdx,dydy))), _mm_mul_pd(zero_sa,dzdz)); __m128d KC=_mm_add_pd(_mm_add_pd(ca_zero, _mm_mul_pd(one_minus_ca, _mm_mul_pd(dzdx,dzdz))), _mm_mul_pd(zero_sa,dydy)); xy.v=_mm_add_pd(_mm_add_pd(_mm_mul_pd(AD,xx), _mm_mul_pd(BE,yy)), _mm_mul_pd(CF,zz)); zx.v=_mm_add_pd(_mm_add_pd(_mm_mul_pd(IA,xx), _mm_mul_pd(JB,yy)), _mm_mul_pd(KC,zz)); yz.v=_mm_setr_pd(y(),z()); } } return *this; }