10 #include <immintrin.h>
22 for(
int n=0;
n<v.size();
n++) _mm_free(v[
n]);
23 v.clear(); std::vector<float*>().swap(v);
26 float x[4]; _mm_storeu_ps(x,y);
27 return x[0]+x[1]+x[2]+x[3];
31 float** u,
float **
v,
int I) {
33 NETX(__m128* _p0 = (__m128*)p[0]; __m128* _q0 = (__m128*)q[0]; ,
34 __m128* _p1 = (__m128*)p[1]; __m128* _q1 = (__m128*)q[1]; ,
35 __m128* _p2 = (__m128*)p[2]; __m128* _q2 = (__m128*)q[2]; ,
36 __m128* _p3 = (__m128*)p[3]; __m128* _q3 = (__m128*)q[3]; ,
37 __m128* _p4 = (__m128*)p[4]; __m128* _q4 = (__m128*)q[4]; ,
38 __m128* _p5 = (__m128*)p[5]; __m128* _q5 = (__m128*)q[5]; ,
39 __m128* _p6 = (__m128*)p[6]; __m128* _q6 = (__m128*)q[6]; ,
40 __m128* _p7 = (__m128*)p[7]; __m128* _q7 = (__m128*)q[7]; )
42 NETX(__m128* _u0 = (__m128*)u[0]; __m128* _v0 = (__m128*)v[0]; ,
43 __m128* _u1 = (__m128*)u[1]; __m128* _v1 = (__m128*)v[1]; ,
44 __m128* _u2 = (__m128*)u[2]; __m128* _v2 = (__m128*)v[2]; ,
45 __m128* _u3 = (__m128*)u[3]; __m128* _v3 = (__m128*)v[3]; ,
46 __m128* _u4 = (__m128*)u[4]; __m128* _v4 = (__m128*)v[4]; ,
47 __m128* _u5 = (__m128*)u[5]; __m128* _v5 = (__m128*)v[5]; ,
48 __m128* _u6 = (__m128*)u[6]; __m128* _v6 = (__m128*)v[6]; ,
49 __m128* _u7 = (__m128*)u[7]; __m128* _v7 = (__m128*)v[7]; )
51 for(
int i=0;
i<I;
i+=4) {
52 NETX(*_u0++ = *_p0++; *_v0++ = *_q0++; ,
53 *_u1++ = *_p1++; *_v1++ = *_q1++; ,
54 *_u2++ = *_p2++; *_v2++ = *_q2++; ,
55 *_u3++ = *_p3++; *_v3++ = *_q3++; ,
56 *_u4++ = *_p4++; *_v4++ = *_q4++; ,
57 *_u5++ = *_p5++; *_v5++ = *_q5++; ,
58 *_u6++ = *_p6++; *_v6++ = *_q6++; ,
59 *_u7++ = *_p7++; *_v7++ = *_q7++; )
66 std::vector<float*> &pAPN,
67 std::vector<float*> &pAVX,
int I) {
75 __m128* _fp = (__m128*)pAVX[2];
76 __m128* _fx = (__m128*)pAVX[3];
77 __m128* _si = (__m128*)pAVX[4];
78 __m128* _co = (__m128*)pAVX[5];
79 __m128* _ni = (__m128*)pAVX[18];
80 __m128 _ff,_FF,_fF,_AP,_cc,_ss,_nn,_cn,_sn;
81 static const __m128 _0 = _mm_set1_ps(0);
82 static const __m128 _1 = _mm_set1_ps(1);
83 static const __m128 _2 = _mm_set1_ps(2);
84 static const __m128 _o = _mm_set1_ps(0.0001);
85 static const __m128
sm = _mm_set1_ps(-0.
f);
86 __m128 _NI= _mm_setzero_ps();
87 __m128 _NN= _mm_setzero_ps();
89 NETX(__m128* _f0=(__m128*)pAPN[0]; __m128* _F0=(__m128*)(pAPN[0]+I); __m128* _r0=(__m128*)(pAPN[0]+2*I);,
90 __m128* _f1=(__m128*)pAPN[1]; __m128* _F1=(__m128*)(pAPN[1]+I); __m128* _r1=(__m128*)(pAPN[1]+2*I);,
91 __m128* _f2=(__m128*)pAPN[2]; __m128* _F2=(__m128*)(pAPN[2]+I); __m128* _r2=(__m128*)(pAPN[2]+2*I);,
92 __m128* _f3=(__m128*)pAPN[3]; __m128* _F3=(__m128*)(pAPN[3]+I); __m128* _r3=(__m128*)(pAPN[3]+2*I);,
93 __m128* _f4=(__m128*)pAPN[4]; __m128* _F4=(__m128*)(pAPN[4]+I); __m128* _r4=(__m128*)(pAPN[4]+2*I);,
94 __m128* _f5=(__m128*)pAPN[5]; __m128* _F5=(__m128*)(pAPN[5]+I); __m128* _r5=(__m128*)(pAPN[5]+2*I);,
95 __m128* _f6=(__m128*)pAPN[6]; __m128* _F6=(__m128*)(pAPN[6]+I); __m128* _r6=(__m128*)(pAPN[6]+2*I);,
96 __m128* _f7=(__m128*)pAPN[7]; __m128* _F7=(__m128*)(pAPN[7]+I); __m128* _r7=(__m128*)(pAPN[7]+2*I);)
98 NETX(__m128 _Fp0 = _mm_set1_ps(*(Fp[0]+l)); __m128 _Fx0 = _mm_set1_ps(*(Fx[0]+l)); ,
99 __m128 _Fp1 = _mm_set1_ps(*(Fp[1]+l)); __m128 _Fx1 = _mm_set1_ps(*(Fx[1]+l)); ,
100 __m128 _Fp2 = _mm_set1_ps(*(Fp[2]+l)); __m128 _Fx2 = _mm_set1_ps(*(Fx[2]+l)); ,
101 __m128 _Fp3 = _mm_set1_ps(*(Fp[3]+l)); __m128 _Fx3 = _mm_set1_ps(*(Fx[3]+l)); ,
102 __m128 _Fp4 = _mm_set1_ps(*(Fp[4]+l)); __m128 _Fx4 = _mm_set1_ps(*(Fx[4]+l)); ,
103 __m128 _Fp5 = _mm_set1_ps(*(Fp[5]+l)); __m128 _Fx5 = _mm_set1_ps(*(Fx[5]+l)); ,
104 __m128 _Fp6 = _mm_set1_ps(*(Fp[6]+l)); __m128 _Fx6 = _mm_set1_ps(*(Fx[6]+l)); ,
105 __m128 _Fp7 = _mm_set1_ps(*(Fp[7]+l)); __m128 _Fx7 = _mm_set1_ps(*(Fx[7]+l)); )
108 NETX(
if(*(pAPN[0]+2*I)>0.) sign+=*(Fp[0]+l) * *(Fx[0]+l);,
109 if(*(pAPN[1]+2*I)>0.) sign+=*(Fp[1]+l) * *(Fx[1]+l);,
110 if(*(pAPN[2]+2*I)>0.) sign+=*(Fp[2]+l) * *(Fx[2]+l);,
111 if(*(pAPN[3]+2*I)>0.) sign+=*(Fp[3]+l) * *(Fx[3]+l);,
112 if(*(pAPN[4]+2*I)>0.) sign+=*(Fp[4]+l) * *(Fx[4]+l);,
113 if(*(pAPN[5]+2*I)>0.) sign+=*(Fp[5]+l) * *(Fx[5]+l);,
114 if(*(pAPN[6]+2*I)>0.) sign+=*(Fp[6]+l) * *(Fx[6]+l);,
115 if(*(pAPN[7]+2*I)>0.) sign+=*(Fp[7]+l) * *(Fx[7]+l);)
116 sign = sign>0 ? 1. : -1.;
117 __m128 _sign = _mm_set1_ps(sign);
119 for(
int i=0;
i<I;
i+=4) {
121 *_f0 = _mm_mul_ps(*_r0,_Fp0); *_F0 = _mm_mul_ps(*_r0++,_Fx0);
122 _ff = _mm_mul_ps(*_f0,*_f0);
123 _FF = _mm_mul_ps(*_F0,*_F0);
124 _fF = _mm_mul_ps(*_f0,*_F0); ,
126 *_f1 = _mm_mul_ps(*_r1,_Fp1); *_F1 = _mm_mul_ps(*_r1++,_Fx1);
127 _ff = _mm_add_ps(_ff,_mm_mul_ps(*_f1,*_f1));
128 _FF = _mm_add_ps(_FF,_mm_mul_ps(*_F1,*_F1));
129 _fF = _mm_add_ps(_fF,_mm_mul_ps(*_f1,*_F1)); ,
131 *_f2 = _mm_mul_ps(*_r2,_Fp2); *_F2 = _mm_mul_ps(*_r2++,_Fx2);
132 _ff = _mm_add_ps(_ff,_mm_mul_ps(*_f2,*_f2));
133 _FF = _mm_add_ps(_FF,_mm_mul_ps(*_F2,*_F2));
134 _fF = _mm_add_ps(_fF,_mm_mul_ps(*_f2,*_F2)); ,
136 *_f3 = _mm_mul_ps(*_r3,_Fp3); *_F3 = _mm_mul_ps(*_r3++,_Fx3);
137 _ff = _mm_add_ps(_ff,_mm_mul_ps(*_f3,*_f3));
138 _FF = _mm_add_ps(_FF,_mm_mul_ps(*_F3,*_F3));
139 _fF = _mm_add_ps(_fF,_mm_mul_ps(*_f3,*_F3)); ,
141 *_f4 = _mm_mul_ps(*_r4,_Fp4); *_F4 = _mm_mul_ps(*_r4++,_Fx4);
142 _ff = _mm_add_ps(_ff,_mm_mul_ps(*_f4,*_f4));
143 _FF = _mm_add_ps(_FF,_mm_mul_ps(*_F4,*_F4));
144 _fF = _mm_add_ps(_fF,_mm_mul_ps(*_f4,*_F4)); ,
146 *_f5 = _mm_mul_ps(*_r5,_Fp5); *_F5 = _mm_mul_ps(*_r5++,_Fx5);
147 _ff = _mm_add_ps(_ff,_mm_mul_ps(*_f5,*_f5));
148 _FF = _mm_add_ps(_FF,_mm_mul_ps(*_F5,*_F5));
149 _fF = _mm_add_ps(_fF,_mm_mul_ps(*_f5,*_F5)); ,
151 *_f6 = _mm_mul_ps(*_r6,_Fp6); *_F6 = _mm_mul_ps(*_r6++,_Fx6);
152 _ff = _mm_add_ps(_ff,_mm_mul_ps(*_f6,*_f6));
153 _FF = _mm_add_ps(_FF,_mm_mul_ps(*_F6,*_F6));
154 _fF = _mm_add_ps(_fF,_mm_mul_ps(*_f6,*_F6)); ,
156 *_f7 = _mm_mul_ps(*_r7,_Fp7); *_F7 = _mm_mul_ps(*_r7++,_Fx7);
157 _ff = _mm_add_ps(_ff,_mm_mul_ps(*_f7,*_f7));
158 _FF = _mm_add_ps(_FF,_mm_mul_ps(*_F7,*_F7));
159 _fF = _mm_add_ps(_fF,_mm_mul_ps(*_f7,*_F7)); )
161 *_si = _mm_mul_ps(_fF,_2);
162 *_co = _mm_sub_ps(_ff,_FF);
163 _AP = _mm_add_ps(_ff,_FF);
164 _cc = _mm_mul_ps(*_co,*_co);
165 _ss = _mm_mul_ps(*_si,*_si);
166 _nn = _mm_sqrt_ps(_mm_add_ps(_cc,_ss));
167 *_fp = _mm_div_ps(_mm_add_ps(_AP,_nn),_2);
168 _cc = _mm_div_ps(*_co,_mm_add_ps(_nn,_o));
169 _nn = _mm_and_ps(_mm_cmpgt_ps(*_si,_0),_1);
170 _ss = _mm_sub_ps(_mm_mul_ps(_2,_nn),_1);
171 *_si = _mm_sqrt_ps(_mm_div_ps(_mm_sub_ps(_1,_cc),_2));
172 *_co = _mm_sqrt_ps(_mm_div_ps(_mm_add_ps(_1,_cc),_2));
173 *_co = _mm_mul_ps(*_co,_ss);
177 _ff = _mm_add_ps(_mm_mul_ps(*_f0,*_co),_mm_mul_ps(*_F0,*_si)); _cc = _mm_mul_ps(_ff,_ff);
178 _FF = _mm_sub_ps(_mm_mul_ps(*_F0,*_co),_mm_mul_ps(*_f0,*_si)); *_f0 = _ff; *_F0 = _FF;
179 _nn = _mm_mul_ps(_cc,_cc); _fF = _mm_mul_ps(_ff,_FF);,
181 _ff = _mm_add_ps(_mm_mul_ps(*_f1,*_co),_mm_mul_ps(*_F1,*_si)); _cc = _mm_mul_ps(_ff,_ff);
182 _FF = _mm_sub_ps(_mm_mul_ps(*_F1,*_co),_mm_mul_ps(*_f1,*_si)); *_f1 = _ff; *_F1 = _FF;
183 _nn = _mm_add_ps(_nn,_mm_mul_ps(_cc,_cc)); _fF = _mm_add_ps(_fF,_mm_mul_ps(_ff,_FF));,
185 _ff = _mm_add_ps(_mm_mul_ps(*_f2,*_co),_mm_mul_ps(*_F2,*_si)); _cc = _mm_mul_ps(_ff,_ff);
186 _FF = _mm_sub_ps(_mm_mul_ps(*_F2,*_co),_mm_mul_ps(*_f2,*_si)); *_f2 = _ff; *_F2 = _FF;
187 _nn = _mm_add_ps(_nn,_mm_mul_ps(_cc,_cc)); _fF = _mm_add_ps(_fF,_mm_mul_ps(_ff,_FF));,
189 _ff = _mm_add_ps(_mm_mul_ps(*_f3,*_co),_mm_mul_ps(*_F3,*_si)); _cc = _mm_mul_ps(_ff,_ff);
190 _FF = _mm_sub_ps(_mm_mul_ps(*_F3,*_co),_mm_mul_ps(*_f3,*_si)); *_f3 = _ff; *_F3 = _FF;
191 _nn = _mm_add_ps(_nn,_mm_mul_ps(_cc,_cc)); _fF = _mm_add_ps(_fF,_mm_mul_ps(_ff,_FF));,
193 _ff = _mm_add_ps(_mm_mul_ps(*_f4,*_co),_mm_mul_ps(*_F4,*_si)); _cc = _mm_mul_ps(_ff,_ff);
194 _FF = _mm_sub_ps(_mm_mul_ps(*_F4,*_co),_mm_mul_ps(*_f4,*_si)); *_f4 = _ff; *_F4 = _FF;
195 _nn = _mm_add_ps(_nn,_mm_mul_ps(_cc,_cc)); _fF = _mm_add_ps(_fF,_mm_mul_ps(_ff,_FF));,
197 _ff = _mm_add_ps(_mm_mul_ps(*_f5,*_co),_mm_mul_ps(*_F5,*_si)); _cc = _mm_mul_ps(_ff,_ff);
198 _FF = _mm_sub_ps(_mm_mul_ps(*_F5,*_co),_mm_mul_ps(*_f5,*_si)); *_f5 = _ff; *_F5 = _FF;
199 _nn = _mm_add_ps(_nn,_mm_mul_ps(_cc,_cc)); _fF = _mm_add_ps(_fF,_mm_mul_ps(_ff,_FF));,
201 _ff = _mm_add_ps(_mm_mul_ps(*_f6,*_co),_mm_mul_ps(*_F6,*_si)); _cc = _mm_mul_ps(_ff,_ff);
202 _FF = _mm_sub_ps(_mm_mul_ps(*_F6,*_co),_mm_mul_ps(*_f6,*_si)); *_f6 = _ff; *_F6 = _FF;
203 _nn = _mm_add_ps(_nn,_mm_mul_ps(_cc,_cc)); _fF = _mm_add_ps(_fF,_mm_mul_ps(_ff,_FF));,
205 _ff = _mm_add_ps(_mm_mul_ps(*_f7,*_co),_mm_mul_ps(*_F7,*_si)); _cc = _mm_mul_ps(_ff,_ff);
206 _FF = _mm_sub_ps(_mm_mul_ps(*_F7,*_co),_mm_mul_ps(*_f7,*_si)); *_f7 = _ff; *_F7 = _FF;
207 _nn = _mm_add_ps(_nn,_mm_mul_ps(_cc,_cc)); _fF = _mm_add_ps(_fF,_mm_mul_ps(_ff,_FF));)
209 _fF = _mm_div_ps(_fF,_mm_add_ps(*_fp,_o));
212 *_F0 = _mm_sub_ps(*_F0,_mm_mul_ps(*_f0++,_fF)); *_fx = _mm_mul_ps(*_F0,*_F0); _F0++;,
213 *_F1 = _mm_sub_ps(*_F1,_mm_mul_ps(*_f1++,_fF)); *_fx = _mm_add_ps(*_fx,_mm_mul_ps(*_F1,*_F1)); _F1++;,
214 *_F2 = _mm_sub_ps(*_F2,_mm_mul_ps(*_f2++,_fF)); *_fx = _mm_add_ps(*_fx,_mm_mul_ps(*_F2,*_F2)); _F2++;,
215 *_F3 = _mm_sub_ps(*_F3,_mm_mul_ps(*_f3++,_fF)); *_fx = _mm_add_ps(*_fx,_mm_mul_ps(*_F3,*_F3)); _F3++;,
216 *_F4 = _mm_sub_ps(*_F4,_mm_mul_ps(*_f4++,_fF)); *_fx = _mm_add_ps(*_fx,_mm_mul_ps(*_F4,*_F4)); _F4++;,
217 *_F5 = _mm_sub_ps(*_F5,_mm_mul_ps(*_f5++,_fF)); *_fx = _mm_add_ps(*_fx,_mm_mul_ps(*_F5,*_F5)); _F5++;,
218 *_F6 = _mm_sub_ps(*_F6,_mm_mul_ps(*_f6++,_fF)); *_fx = _mm_add_ps(*_fx,_mm_mul_ps(*_F6,*_F6)); _F6++;,
219 *_F7 = _mm_sub_ps(*_F7,_mm_mul_ps(*_f7++,_fF)); *_fx = _mm_add_ps(*_fx,_mm_mul_ps(*_F7,*_F7)); _F7++;)
221 *_ni = _mm_div_ps(_nn,_mm_add_ps(_mm_mul_ps(*_fp,*_fp),_o));
222 _ff = _mm_add_ps(_mm_mul_ps(_1,*_ni),_o);
223 _NI = _mm_add_ps(_NI,(_mm_div_ps(*_fx,_ff)));
224 _NN = _mm_add_ps(_NN,_mm_and_ps(_mm_cmpgt_ps(*_fp,_0),_1));
226 _fp++; _fx++; _si++; _co++; _ni++;
233 std::vector<float*> &pAPN,
float* rr,
234 std::vector<float*> &pAVX,
int II) {
244 __m128* _et = (__m128*)pAVX[0];
245 __m128* _mk = (__m128*)pAVX[1];
246 __m128* _fp = (__m128*)pAVX[2];
247 __m128* _fx = (__m128*)pAVX[3];
248 __m128* _au = (__m128*)pAVX[10];
249 __m128* _AU = (__m128*)pAVX[11];
250 __m128* _av = (__m128*)pAVX[12];
251 __m128* _AV = (__m128*)pAVX[13];
252 __m128* _ni = (__m128*)pAVX[18];
254 __m128 _uu = _mm_setzero_ps();
255 __m128 _UU = _mm_setzero_ps();
256 __m128 _uU = _mm_setzero_ps();
257 __m128 _vv = _mm_setzero_ps();
258 __m128 _VV = _mm_setzero_ps();
259 __m128 _vV = _mm_setzero_ps();
260 __m128 _NN = _mm_setzero_ps();
262 __m128 _h,_H,_f,_F,_R,_a,_nn,_m,_ff,_xp,_XP,_xx,_XX;
263 float cu,su,cv,sv,cc,
ss,nn,uu,UU,vv,VV,et,ET;
264 static const __m128 _0 = _mm_set1_ps(0);
265 static const __m128 _1 = _mm_set1_ps(1);
266 static const __m128 o1 = _mm_set1_ps(0.1);
267 static const __m128 _o = _mm_set1_ps(1.
e-5);
268 __m128 _rr = _mm_set1_ps(rr[0]);
269 __m128 _RR = _mm_set1_ps(rr[1]);
272 NETX(__m128* _f0=(__m128*)pAPN[0]; __m128* _F0=(__m128*)(pAPN[0]+I);,
273 __m128* _f1=(__m128*)pAPN[1]; __m128* _F1=(__m128*)(pAPN[1]+I);,
274 __m128* _f2=(__m128*)pAPN[2]; __m128* _F2=(__m128*)(pAPN[2]+I);,
275 __m128* _f3=(__m128*)pAPN[3]; __m128* _F3=(__m128*)(pAPN[3]+I);,
276 __m128* _f4=(__m128*)pAPN[4]; __m128* _F4=(__m128*)(pAPN[4]+I);,
277 __m128* _f5=(__m128*)pAPN[5]; __m128* _F5=(__m128*)(pAPN[5]+I);,
278 __m128* _f6=(__m128*)pAPN[6]; __m128* _F6=(__m128*)(pAPN[6]+I);,
279 __m128* _f7=(__m128*)pAPN[7]; __m128* _F7=(__m128*)(pAPN[7]+I);)
282 NETX(__m128* _p0 = (__m128*)p[0]; __m128* _q0 = (__m128*)q[0];,
283 __m128* _p1 = (__m128*)p[1]; __m128* _q1 = (__m128*)q[1];,
284 __m128* _p2 = (__m128*)p[2]; __m128* _q2 = (__m128*)q[2];,
285 __m128* _p3 = (__m128*)p[3]; __m128* _q3 = (__m128*)q[3];,
286 __m128* _p4 = (__m128*)p[4]; __m128* _q4 = (__m128*)q[4];,
287 __m128* _p5 = (__m128*)p[5]; __m128* _q5 = (__m128*)q[5];,
288 __m128* _p6 = (__m128*)p[6]; __m128* _q6 = (__m128*)q[6];,
289 __m128* _p7 = (__m128*)p[7]; __m128* _q7 = (__m128*)q[7];)
291 for(
int i=0;
i<I;
i+=4) {
294 _xp = _mm_mul_ps(*_f0,*_p0);
295 _XP = _mm_mul_ps(*_f0,*_q0);
296 _xx = _mm_mul_ps(*_F0,*_p0);
297 _XX = _mm_mul_ps(*_F0,*_q0); ,
299 _xp = _mm_add_ps(_xp,_mm_mul_ps(*_f1,*_p1));
300 _XP = _mm_add_ps(_XP,_mm_mul_ps(*_f1,*_q1));
301 _xx = _mm_add_ps(_xx,_mm_mul_ps(*_F1,*_p1));
302 _XX = _mm_add_ps(_XX,_mm_mul_ps(*_F1,*_q1)); ,
304 _xp = _mm_add_ps(_xp,_mm_mul_ps(*_f2,*_p2));
305 _XP = _mm_add_ps(_XP,_mm_mul_ps(*_f2,*_q2));
306 _xx = _mm_add_ps(_xx,_mm_mul_ps(*_F2,*_p2));
307 _XX = _mm_add_ps(_XX,_mm_mul_ps(*_F2,*_q2)); ,
309 _xp = _mm_add_ps(_xp,_mm_mul_ps(*_f3,*_p3));
310 _XP = _mm_add_ps(_XP,_mm_mul_ps(*_f3,*_q3));
311 _xx = _mm_add_ps(_xx,_mm_mul_ps(*_F3,*_p3));
312 _XX = _mm_add_ps(_XX,_mm_mul_ps(*_F3,*_q3)); ,
314 _xp = _mm_add_ps(_xp,_mm_mul_ps(*_f4,*_p4));
315 _XP = _mm_add_ps(_XP,_mm_mul_ps(*_f4,*_q4));
316 _xx = _mm_add_ps(_xx,_mm_mul_ps(*_F4,*_p4));
317 _XX = _mm_add_ps(_XX,_mm_mul_ps(*_F4,*_q4)); ,
319 _xp = _mm_add_ps(_xp,_mm_mul_ps(*_f5,*_p5));
320 _XP = _mm_add_ps(_XP,_mm_mul_ps(*_f5,*_q5));
321 _xx = _mm_add_ps(_xx,_mm_mul_ps(*_F5,*_p5));
322 _XX = _mm_add_ps(_XX,_mm_mul_ps(*_F5,*_q5)); ,
324 _xp = _mm_add_ps(_xp,_mm_mul_ps(*_f6,*_p6));
325 _XP = _mm_add_ps(_XP,_mm_mul_ps(*_f6,*_q6));
326 _xx = _mm_add_ps(_xx,_mm_mul_ps(*_F6,*_p6));
327 _XX = _mm_add_ps(_XX,_mm_mul_ps(*_F6,*_q6)); ,
329 _xp = _mm_add_ps(_xp,_mm_mul_ps(*_f7,*_p7));
330 _XP = _mm_add_ps(_XP,_mm_mul_ps(*_f7,*_q7));
331 _xx = _mm_add_ps(_xx,_mm_mul_ps(*_F7,*_p7));
332 _XX = _mm_add_ps(_XX,_mm_mul_ps(*_F7,*_q7)); )
336 _f = _mm_add_ps(_mm_mul_ps(_xp,_xp),_mm_mul_ps(_XP,_XP));
337 _f = _mm_div_ps(_f,_mm_add_ps(*_et,_o));
338 _f = _mm_sqrt_ps(_mm_mul_ps(*_ni,_f));
339 _f = _mm_sub_ps(_mm_mul_ps(_f,_rr),*_fp);
340 _f = _mm_mul_ps(_f,_mm_and_ps(_mm_cmpgt_ps(_f,_0),_1));
341 _f = _mm_div_ps(*_mk,_mm_add_ps(*_fp,_mm_add_ps(_o,_f)));
343 _h = _mm_mul_ps(_xp,_f);
344 _H = _mm_mul_ps(_XP,_f);
345 _h = _mm_add_ps(_mm_mul_ps(_h,_h),_mm_mul_ps(_H,_H));
346 _H = _mm_add_ps(_mm_mul_ps(_xx,_xx),_mm_mul_ps(_XX,_XX));
347 _F = _mm_sqrt_ps(_mm_div_ps(_H,_mm_add_ps(_h,_o)));
348 _R = _mm_add_ps(o1,_mm_div_ps(_RR,_mm_add_ps(*_et,_o)));
349 _F = _mm_sub_ps(_mm_mul_ps(_F,_R),*_fx);
350 _F = _mm_mul_ps(_F,_mm_and_ps(_mm_cmpgt_ps(_F,_0),_1));
351 _F = _mm_div_ps(*_mk,_mm_add_ps(*_fx,_mm_add_ps(_o,_F)));
353 *_au = _mm_mul_ps(_xp,_f);
354 *_AU = _mm_mul_ps(_XP,_f);
355 *_av = _mm_mul_ps(_xx,_F);
356 *_AV = _mm_mul_ps(_XX,_F);
358 _a = _mm_add_ps(_mm_mul_ps(_f,*_fp),_mm_mul_ps(_F,*_fx));
359 _NN = _mm_add_ps(_NN,*_mk);
360 *_mk = _mm_add_ps(_a,_mm_sub_ps(*_mk,_1));
362 _mk++; _et++; _fp++; _fx++; _ni++;
365 NETX(*_p0++ = _mm_add_ps(_mm_mul_ps(*_f0, *_au),_mm_mul_ps(*_F0 ,*_av));
366 *_q0++ = _mm_add_ps(_mm_mul_ps(*_f0++,*_AU),_mm_mul_ps(*_F0++,*_AV)); ,
367 *_p1++ = _mm_add_ps(_mm_mul_ps(*_f1 ,*_au),_mm_mul_ps(*_F1 ,*_av));
368 *_q1++ = _mm_add_ps(_mm_mul_ps(*_f1++,*_AU),_mm_mul_ps(*_F1++,*_AV)); ,
369 *_p2++ = _mm_add_ps(_mm_mul_ps(*_f2 ,*_au),_mm_mul_ps(*_F2 ,*_av));
370 *_q2++ = _mm_add_ps(_mm_mul_ps(*_f2++,*_AU),_mm_mul_ps(*_F2++,*_AV)); ,
371 *_p3++ = _mm_add_ps(_mm_mul_ps(*_f3 ,*_au),_mm_mul_ps(*_F3 ,*_av));
372 *_q3++ = _mm_add_ps(_mm_mul_ps(*_f3++,*_AU),_mm_mul_ps(*_F3++,*_AV)); ,
373 *_p4++ = _mm_add_ps(_mm_mul_ps(*_f4 ,*_au),_mm_mul_ps(*_F4 ,*_av));
374 *_q4++ = _mm_add_ps(_mm_mul_ps(*_f4++,*_AU),_mm_mul_ps(*_F4++,*_AV)); ,
375 *_p5++ = _mm_add_ps(_mm_mul_ps(*_f5 ,*_au),_mm_mul_ps(*_F5 ,*_av));
376 *_q5++ = _mm_add_ps(_mm_mul_ps(*_f5++,*_AU),_mm_mul_ps(*_F5++,*_AV)); ,
377 *_p6++ = _mm_add_ps(_mm_mul_ps(*_f6 ,*_au),_mm_mul_ps(*_F6 ,*_av));
378 *_q6++ = _mm_add_ps(_mm_mul_ps(*_f6++,*_AU),_mm_mul_ps(*_F6++,*_AV)); ,
379 *_p7++ = _mm_add_ps(_mm_mul_ps(*_f7 ,*_au),_mm_mul_ps(*_F7 ,*_av));
380 *_q7++ = _mm_add_ps(_mm_mul_ps(*_f7++,*_AU),_mm_mul_ps(*_F7++,*_AV)); )
382 _uU = _mm_add_ps(_uU,_mm_mul_ps(*_au,*_AU));
383 _vV = _mm_add_ps(_vV,_mm_mul_ps(*_av,*_AV));
384 _uu = _mm_add_ps(_uu,_mm_mul_ps(*_au,*_au)); _au++;
385 _UU = _mm_add_ps(_UU,_mm_mul_ps(*_AU,*_AU)); _AU++;
386 _vv = _mm_add_ps(_vv,_mm_mul_ps(*_av,*_av)); _av++;
387 _VV = _mm_add_ps(_VV,_mm_mul_ps(*_AV,*_AV)); _AV++;
394 nn = sqrt((uu-UU)*(uu-UU)+4*su*su)+0.0001;
395 cu = (uu-UU)/nn; et=uu+UU;
396 uu = sqrt((et+nn)/2); UU = et>nn?sqrt((et-nn)/2):0;
398 su = sqrt((1-cu)/2); cu = nn*sqrt((1+cu)/2);
401 nn = sqrt((vv-VV)*(vv-VV)+4*sv*sv)+0.0001;
402 cv = (vv-VV)/nn; ET=vv+VV;
403 vv = sqrt((ET+nn)/2); VV = et>nn?sqrt((ET-nn)/2):0;
405 sv = sqrt((1-cv)/2); cv = nn*sqrt((1+cv)/2);
413 float** u,
float** v,
float En,
414 std::vector<float*> &pAVX,
int I) {
421 __m128 _aa, _AA, _aA;
423 static const __m128 _1 = _mm_set1_ps(1);
424 static const __m128 _o = _mm_set1_ps(1.
e-12);
425 static const __m128
sm = _mm_set1_ps(-0.
f);
426 static const __m128 _En= _mm_set1_ps(En);
428 __m128* _et = (__m128*)pAVX[0];
429 __m128* _mk = (__m128*)pAVX[1];
430 __m128 _ee = _mm_setzero_ps();
431 __m128 _EE = _mm_setzero_ps();
432 __m128 _NN = _mm_setzero_ps();
434 NETX(__m128* _p0 = (__m128*)p[0]; __m128* _q0 = (__m128*)q[0]; ,
435 __m128* _p1 = (__m128*)p[1]; __m128* _q1 = (__m128*)q[1]; ,
436 __m128* _p2 = (__m128*)p[2]; __m128* _q2 = (__m128*)q[2]; ,
437 __m128* _p3 = (__m128*)p[3]; __m128* _q3 = (__m128*)q[3]; ,
438 __m128* _p4 = (__m128*)p[4]; __m128* _q4 = (__m128*)q[4]; ,
439 __m128* _p5 = (__m128*)p[5]; __m128* _q5 = (__m128*)q[5]; ,
440 __m128* _p6 = (__m128*)p[6]; __m128* _q6 = (__m128*)q[6]; ,
441 __m128* _p7 = (__m128*)p[7]; __m128* _q7 = (__m128*)q[7]; )
443 NETX(__m128* _u0 = (__m128*)u[0]; __m128* _v0 = (__m128*)v[0]; ,
444 __m128* _u1 = (__m128*)u[1]; __m128* _v1 = (__m128*)v[1]; ,
445 __m128* _u2 = (__m128*)u[2]; __m128* _v2 = (__m128*)v[2]; ,
446 __m128* _u3 = (__m128*)u[3]; __m128* _v3 = (__m128*)v[3]; ,
447 __m128* _u4 = (__m128*)u[4]; __m128* _v4 = (__m128*)v[4]; ,
448 __m128* _u5 = (__m128*)u[5]; __m128* _v5 = (__m128*)v[5]; ,
449 __m128* _u6 = (__m128*)u[6]; __m128* _v6 = (__m128*)v[6]; ,
450 __m128* _u7 = (__m128*)u[7]; __m128* _v7 = (__m128*)v[7]; )
452 for(
int i=0;
i<I;
i+=4) {
454 _aa = _mm_mul_ps(*_p0,*_p0); *_u0++ = *_p0++;
455 _AA = _mm_mul_ps(*_q0,*_q0); *_v0++ = *_q0++; ,
456 _aa = _mm_add_ps(_aa,_mm_mul_ps(*_p1,*_p1)); *_u1++ = *_p1++;
457 _AA = _mm_add_ps(_AA,_mm_mul_ps(*_q1,*_q1)); *_v1++ = *_q1++; ,
458 _aa = _mm_add_ps(_aa,_mm_mul_ps(*_p2,*_p2)); *_u2++ = *_p2++;
459 _AA = _mm_add_ps(_AA,_mm_mul_ps(*_q2,*_q2)); *_v2++ = *_q2++; ,
460 _aa = _mm_add_ps(_aa,_mm_mul_ps(*_p3,*_p3)); *_u3++ = *_p3++;
461 _AA = _mm_add_ps(_AA,_mm_mul_ps(*_q3,*_q3)); *_v3++ = *_q3++; ,
462 _aa = _mm_add_ps(_aa,_mm_mul_ps(*_p4,*_p4)); *_u4++ = *_p4++;
463 _AA = _mm_add_ps(_AA,_mm_mul_ps(*_q4,*_q4)); *_v4++ = *_q4++; ,
464 _aa = _mm_add_ps(_aa,_mm_mul_ps(*_p5,*_p5)); *_u5++ = *_p5++;
465 _AA = _mm_add_ps(_AA,_mm_mul_ps(*_q5,*_q5)); *_v5++ = *_q5++; ,
466 _aa = _mm_add_ps(_aa,_mm_mul_ps(*_p6,*_p6)); *_u6++ = *_p6++;
467 _AA = _mm_add_ps(_AA,_mm_mul_ps(*_q6,*_q6)); *_v6++ = *_q6++; ,
468 _aa = _mm_add_ps(_aa,_mm_mul_ps(*_p7,*_p7)); *_u7++ = *_p7++;
469 _AA = _mm_add_ps(_AA,_mm_mul_ps(*_q7,*_q7)); *_v7++ = *_q7++; )
474 *_et = _mm_add_ps(_mm_add_ps(_aa,_AA),_o);
475 *_mk = _mm_and_ps(_mm_cmpgt_ps(*_et,_En),_1);
476 _NN = _mm_add_ps(_NN,*_mk);
477 _ee = _mm_add_ps(_ee,*_et);
478 *_et = _mm_mul_ps(*_et,*_mk++);
479 _EE = _mm_add_ps(_EE,*_et++);
486 float** d,
float**
D,
487 float**
h,
float** H,
int I) {
491 NETX(__m128* _n0 = (__m128*)n[0]; __m128* _N0 = (__m128*)N[0]; ,
492 __m128* _n1 = (__m128*)n[1]; __m128* _N1 = (__m128*)N[1]; ,
493 __m128* _n2 = (__m128*)n[2]; __m128* _N2 = (__m128*)N[2]; ,
494 __m128* _n3 = (__m128*)n[3]; __m128* _N3 = (__m128*)N[3]; ,
495 __m128* _n4 = (__m128*)n[4]; __m128* _N4 = (__m128*)N[4]; ,
496 __m128* _n5 = (__m128*)n[5]; __m128* _N5 = (__m128*)N[5]; ,
497 __m128* _n6 = (__m128*)n[6]; __m128* _N6 = (__m128*)N[6]; ,
498 __m128* _n7 = (__m128*)n[7]; __m128* _N7 = (__m128*)N[7]; )
500 NETX(__m128* _d0 = (__m128*)d[0]; __m128* _D0 = (__m128*)D[0]; ,
501 __m128* _d1 = (__m128*)d[1]; __m128* _D1 = (__m128*)D[1]; ,
502 __m128* _d2 = (__m128*)d[2]; __m128* _D2 = (__m128*)D[2]; ,
503 __m128* _d3 = (__m128*)d[3]; __m128* _D3 = (__m128*)D[3]; ,
504 __m128* _d4 = (__m128*)d[4]; __m128* _D4 = (__m128*)D[4]; ,
505 __m128* _d5 = (__m128*)d[5]; __m128* _D5 = (__m128*)D[5]; ,
506 __m128* _d6 = (__m128*)d[6]; __m128* _D6 = (__m128*)D[6]; ,
507 __m128* _d7 = (__m128*)d[7]; __m128* _D7 = (__m128*)D[7]; )
509 NETX(__m128* _h0 = (__m128*)h[0]; __m128* _H0 = (__m128*)H[0]; ,
510 __m128* _h1 = (__m128*)h[1]; __m128* _H1 = (__m128*)H[1]; ,
511 __m128* _h2 = (__m128*)h[2]; __m128* _H2 = (__m128*)H[2]; ,
512 __m128* _h3 = (__m128*)h[3]; __m128* _H3 = (__m128*)H[3]; ,
513 __m128* _h4 = (__m128*)h[4]; __m128* _H4 = (__m128*)H[4]; ,
514 __m128* _h5 = (__m128*)h[5]; __m128* _H5 = (__m128*)H[5]; ,
515 __m128* _h6 = (__m128*)h[6]; __m128* _H6 = (__m128*)H[6]; ,
516 __m128* _h7 = (__m128*)h[7]; __m128* _H7 = (__m128*)H[7]; )
518 for(
int i=0;
i<I;
i+=4) {
519 NETX(*_n0++ = _mm_sub_ps(*_d0++,*_h0++); *_N0++ = _mm_sub_ps(*_D0++,*_H0++); ,
520 *_n1++ = _mm_sub_ps(*_d1++,*_h1++); *_N1++ = _mm_sub_ps(*_D1++,*_H1++); ,
521 *_n2++ = _mm_sub_ps(*_d2++,*_h2++); *_N2++ = _mm_sub_ps(*_D2++,*_H2++); ,
522 *_n3++ = _mm_sub_ps(*_d3++,*_h3++); *_N3++ = _mm_sub_ps(*_D3++,*_H3++); ,
523 *_n4++ = _mm_sub_ps(*_d4++,*_h4++); *_N4++ = _mm_sub_ps(*_D4++,*_H4++); ,
524 *_n5++ = _mm_sub_ps(*_d5++,*_h5++); *_N5++ = _mm_sub_ps(*_D5++,*_H5++); ,
525 *_n6++ = _mm_sub_ps(*_d6++,*_h6++); *_N6++ = _mm_sub_ps(*_D6++,*_H6++); ,
526 *_n7++ = _mm_sub_ps(*_d7++,*_h7++); *_N7++ = _mm_sub_ps(*_D7++,*_H7++); )
533 std::vector<float*> &pAVX,
int I) {
540 __m128 _a, _A, _aa, _AA, _aA, _x, _cc, _ss, _nn, _cn, _sn, _mk;
542 static const __m128 _0 = _mm_set1_ps(0);
543 static const __m128 _1 = _mm_set1_ps(1);
544 static const __m128 _2 = _mm_set1_ps(2);
545 static const __m128 _o = _mm_set1_ps(0.0001);
546 static const __m128
sm = _mm_set1_ps(-0.
f);
548 float a[8] __attribute__((aligned(32))); __m128* _am = (__m128*)
a;
549 float A[8] __attribute__((aligned(32))); __m128* _AM = (__m128*)
A;
550 float s[8] __attribute__((aligned(32))); __m128* _si = (__m128*)
s;
551 float c[8] __attribute__((aligned(32))); __m128* _co = (__m128*)
c;
553 __m128* _MK = (__m128*)(pAVX[1]);
554 __m128 aa[8], AA[8], aA[8], si[8], co[8];
555 for(
int i=0;
i<8;
i++) {
556 aa[
i] = _mm_setzero_ps();
557 AA[
i] = _mm_setzero_ps();
558 aA[
i] = _mm_setzero_ps();
561 NETX(__m128* _p0 = (__m128*)p[0]; __m128* _q0 = (__m128*)q[0]; ,
562 __m128* _p1 = (__m128*)p[1]; __m128* _q1 = (__m128*)q[1]; ,
563 __m128* _p2 = (__m128*)p[2]; __m128* _q2 = (__m128*)q[2]; ,
564 __m128* _p3 = (__m128*)p[3]; __m128* _q3 = (__m128*)q[3]; ,
565 __m128* _p4 = (__m128*)p[4]; __m128* _q4 = (__m128*)q[4]; ,
566 __m128* _p5 = (__m128*)p[5]; __m128* _q5 = (__m128*)q[5]; ,
567 __m128* _p6 = (__m128*)p[6]; __m128* _q6 = (__m128*)q[6]; ,
568 __m128* _p7 = (__m128*)p[7]; __m128* _q7 = (__m128*)q[7]; )
570 for(
int i=0;
i<I;
i+=4) {
571 _mk = _mm_and_ps(_mm_cmpgt_ps(*_MK++,_0),_1);
573 aa[0] = _mm_add_ps(aa[0],_mm_mul_ps(_mm_mul_ps(*_p0,*_p0), _mk));
574 AA[0] = _mm_add_ps(AA[0],_mm_mul_ps(_mm_mul_ps(*_q0,*_q0), _mk));
575 aA[0] = _mm_add_ps(aA[0],_mm_mul_ps(_mm_mul_ps(*_p0++,*_q0++),_mk));,
577 aa[1] = _mm_add_ps(aa[1],_mm_mul_ps(_mm_mul_ps(*_p1,*_p1), _mk));
578 AA[1] = _mm_add_ps(AA[1],_mm_mul_ps(_mm_mul_ps(*_q1,*_q1), _mk));
579 aA[1] = _mm_add_ps(aA[1],_mm_mul_ps(_mm_mul_ps(*_p1++,*_q1++),_mk));,
581 aa[2] = _mm_add_ps(aa[2],_mm_mul_ps(_mm_mul_ps(*_p2,*_p2), _mk));
582 AA[2] = _mm_add_ps(AA[2],_mm_mul_ps(_mm_mul_ps(*_q2,*_q2), _mk));
583 aA[2] = _mm_add_ps(aA[2],_mm_mul_ps(_mm_mul_ps(*_p2++,*_q2++),_mk));,
585 aa[3] = _mm_add_ps(aa[3],_mm_mul_ps(_mm_mul_ps(*_p3,*_p3), _mk));
586 AA[3] = _mm_add_ps(AA[3],_mm_mul_ps(_mm_mul_ps(*_q3,*_q3), _mk));
587 aA[3] = _mm_add_ps(aA[3],_mm_mul_ps(_mm_mul_ps(*_p3++,*_q3++),_mk));,
589 aa[4] = _mm_add_ps(aa[4],_mm_mul_ps(_mm_mul_ps(*_p4,*_p4), _mk));
590 AA[4] = _mm_add_ps(AA[4],_mm_mul_ps(_mm_mul_ps(*_q4,*_q4), _mk));
591 aA[4] = _mm_add_ps(aA[4],_mm_mul_ps(_mm_mul_ps(*_p4++,*_q4++),_mk));,
593 aa[5] = _mm_add_ps(aa[5],_mm_mul_ps(_mm_mul_ps(*_p5,*_p5), _mk));
594 AA[5] = _mm_add_ps(AA[5],_mm_mul_ps(_mm_mul_ps(*_q5,*_q5), _mk));
595 aA[5] = _mm_add_ps(aA[5],_mm_mul_ps(_mm_mul_ps(*_p5++,*_q5++),_mk));,
597 aa[6] = _mm_add_ps(aa[6],_mm_mul_ps(_mm_mul_ps(*_p6,*_p6), _mk));
598 AA[6] = _mm_add_ps(AA[6],_mm_mul_ps(_mm_mul_ps(*_q6,*_q6), _mk));
599 aA[6] = _mm_add_ps(aA[6],_mm_mul_ps(_mm_mul_ps(*_p6++,*_q6++),_mk));,
601 aa[7] = _mm_add_ps(aa[7],_mm_mul_ps(_mm_mul_ps(*_p7,*_p7), _mk));
602 AA[7] = _mm_add_ps(AA[7],_mm_mul_ps(_mm_mul_ps(*_q7,*_q7), _mk));
603 aA[7] = _mm_add_ps(aA[7],_mm_mul_ps(_mm_mul_ps(*_p7++,*_q7++),_mk));)
607 _a = _mm_hadd_ps(aa[0],aa[1]); _A = _mm_hadd_ps(aa[2],aa[3]); _aa = _mm_hadd_ps(_a,_A);
608 _a = _mm_hadd_ps(AA[0],AA[1]); _A = _mm_hadd_ps(AA[2],AA[3]); _AA = _mm_hadd_ps(_a,_A);
609 _a = _mm_hadd_ps(aA[0],aA[1]); _A = _mm_hadd_ps(aA[2],aA[3]); _aA = _mm_hadd_ps(_a,_A);
611 *_si = _mm_mul_ps(_aA,_2);
612 *_co = _mm_sub_ps(_aa,_AA);
613 _x = _mm_add_ps(_mm_add_ps(_aa,_AA),_o);
614 _cc = _mm_mul_ps(*_co,*_co);
615 _ss = _mm_mul_ps(*_si,*_si);
616 _nn = _mm_sqrt_ps(_mm_add_ps(_cc,_ss));
617 *_am = _mm_sqrt_ps(_mm_div_ps(_mm_add_ps(_x,_nn),_2));
618 *_AM = _mm_div_ps(_mm_sub_ps(_x,_nn),_2);
619 *_AM = _mm_sqrt_ps(_mm_andnot_ps(sm,*_AM));
620 _cc = _mm_div_ps(*_co,_mm_add_ps(_nn,_o));
621 _nn = _mm_and_ps(_mm_cmpgt_ps(*_si,_0),_1);
622 _ss = _mm_sub_ps(_mm_mul_ps(_2,_nn),_1);
623 *_si = _mm_sqrt_ps(_mm_div_ps(_mm_sub_ps(_1,_cc),_2));
624 *_co = _mm_sqrt_ps(_mm_div_ps(_mm_add_ps(_1,_cc),_2));
625 *_co = _mm_mul_ps(*_co,_ss);
628 _am++; _AM++; _si++; _co++;
629 _a = _mm_hadd_ps(aa[4],aa[5]); _A = _mm_hadd_ps(aa[6],aa[7]); _aa = _mm_hadd_ps(_a,_A);
630 _a = _mm_hadd_ps(AA[4],AA[5]); _A = _mm_hadd_ps(AA[6],AA[7]); _AA = _mm_hadd_ps(_a,_A);
631 _a = _mm_hadd_ps(aA[4],aA[5]); _A = _mm_hadd_ps(aA[6],aA[7]); _aA = _mm_hadd_ps(_a,_A);
633 *_si = _mm_mul_ps(_aA,_2);
634 *_co = _mm_sub_ps(_aa,_AA);
635 _x = _mm_add_ps(_mm_add_ps(_aa,_AA),_o);
636 _cc = _mm_mul_ps(*_co,*_co);
637 _ss = _mm_mul_ps(*_si,*_si);
638 _nn = _mm_sqrt_ps(_mm_add_ps(_cc,_ss));
639 *_am = _mm_sqrt_ps(_mm_div_ps(_mm_add_ps(_x,_nn),_2));
640 *_AM = _mm_div_ps(_mm_sub_ps(_x,_nn),_2);
641 *_AM = _mm_sqrt_ps(_mm_andnot_ps(sm,*_AM));
642 _cc = _mm_div_ps(*_co,_mm_add_ps(_nn,_o));
643 _nn = _mm_and_ps(_mm_cmpgt_ps(*_si,_0),_1);
644 _ss = _mm_sub_ps(_mm_mul_ps(_2,_nn),_1);
645 *_si = _mm_sqrt_ps(_mm_div_ps(_mm_sub_ps(_1,_cc),_2));
646 *_co = _mm_sqrt_ps(_mm_div_ps(_mm_add_ps(_1,_cc),_2));
647 *_co = _mm_mul_ps(*_co,_ss);
652 NETX(q[0][II]=
a[0]; q[0][II+1]=
A[0]; q[0][II+2]=
s[0]; q[0][II+3]=
c[0]; q[0][II+5]=1;,
653 q[1][II]=
a[1]; q[1][II+1]=
A[1]; q[1][II+2]=
s[1]; q[1][II+3]=
c[1]; q[1][II+5]=1;,
654 q[2][II]=
a[2]; q[2][II+1]=
A[2]; q[2][II+2]=
s[2]; q[2][II+3]=
c[2]; q[2][II+5]=1;,
655 q[3][II]=
a[3]; q[3][II+1]=
A[3]; q[3][II+2]=
s[3]; q[3][II+3]=
c[3]; q[3][II+5]=1;,
656 q[4][II]=
a[4]; q[4][II+1]=
A[4]; q[4][II+2]=
s[4]; q[4][II+3]=
c[4]; q[4][II+5]=1;,
657 q[5][II]=
a[5]; q[5][II+1]=
A[5]; q[5][II+2]=
s[5]; q[5][II+3]=
c[5]; q[5][II+5]=1;,
658 q[6][II]=
a[6]; q[6][II+1]=
A[6]; q[6][II+2]=
s[6]; q[6][II+3]=
c[6]; q[6][II+5]=1;,
659 q[7][II]=
a[7]; q[7][II+1]=
A[7]; q[7][II+2]=
s[7]; q[7][II+3]=
c[7]; q[7][II+5]=1;)
662 NETX(q[0][II+4]=(
a[0]+
A[0]); Ep+=q[0][II+4]*q[0][II+4]/2; _p0 = (__m128*)p[0]; _q0 = (__m128*)q[0];,
663 q[1][II+4]=(
a[1]+
A[1]); Ep+=q[1][II+4]*q[1][II+4]/2; _p1 = (__m128*)p[1]; _q1 = (__m128*)q[1];,
664 q[2][II+4]=(
a[2]+
A[2]); Ep+=q[2][II+4]*q[2][II+4]/2; _p2 = (__m128*)p[2]; _q2 = (__m128*)q[2];,
665 q[3][II+4]=(
a[3]+
A[3]); Ep+=q[3][II+4]*q[3][II+4]/2; _p3 = (__m128*)p[3]; _q3 = (__m128*)q[3];,
666 q[4][II+4]=(
a[4]+
A[4]); Ep+=q[4][II+4]*q[4][II+4]/2; _p4 = (__m128*)p[4]; _q4 = (__m128*)q[4];,
667 q[5][II+4]=(
a[5]+
A[5]); Ep+=q[5][II+4]*q[5][II+4]/2; _p5 = (__m128*)p[5]; _q5 = (__m128*)q[5];,
668 q[6][II+4]=(
a[6]+
A[6]); Ep+=q[6][II+4]*q[6][II+4]/2; _p6 = (__m128*)p[6]; _q6 = (__m128*)q[6];,
669 q[7][II+4]=(
a[7]+
A[7]); Ep+=q[7][II+4]*q[7][II+4]/2; _p7 = (__m128*)p[7]; _q7 = (__m128*)q[7];)
671 *_am = _mm_div_ps(_1,_mm_add_ps(*_am,_o)); _am--;
672 *_am = _mm_div_ps(_1,_mm_add_ps(*_am,_o));
673 *_AM = _mm_div_ps(_1,_mm_add_ps(*_AM,_o)); _AM--;
674 *_AM = _mm_div_ps(_1,_mm_add_ps(*_AM,_o));
679 NETX(aa[0]=_mm_set1_ps(
a[0]); AA[0]=_mm_set1_ps(
A[0]); si[0]=_mm_set1_ps(
s[0]); co[0]=_mm_set1_ps(
c[0]); ,
680 aa[1]=_mm_set1_ps(
a[1]); AA[1]=_mm_set1_ps(
A[1]); si[1]=_mm_set1_ps(
s[1]); co[1]=_mm_set1_ps(
c[1]); ,
681 aa[2]=_mm_set1_ps(
a[2]); AA[2]=_mm_set1_ps(
A[2]); si[2]=_mm_set1_ps(
s[2]); co[2]=_mm_set1_ps(
c[2]); ,
682 aa[3]=_mm_set1_ps(
a[3]); AA[3]=_mm_set1_ps(
A[3]); si[3]=_mm_set1_ps(
s[3]); co[3]=_mm_set1_ps(
c[3]); ,
683 aa[4]=_mm_set1_ps(
a[4]); AA[4]=_mm_set1_ps(
A[4]); si[4]=_mm_set1_ps(
s[4]); co[4]=_mm_set1_ps(
c[4]); ,
684 aa[5]=_mm_set1_ps(
a[5]); AA[5]=_mm_set1_ps(
A[5]); si[5]=_mm_set1_ps(
s[5]); co[5]=_mm_set1_ps(
c[5]); ,
685 aa[6]=_mm_set1_ps(
a[6]); AA[6]=_mm_set1_ps(
A[6]); si[6]=_mm_set1_ps(
s[6]); co[6]=_mm_set1_ps(
c[6]); ,
686 aa[7]=_mm_set1_ps(
a[7]); AA[7]=_mm_set1_ps(
A[7]); si[7]=_mm_set1_ps(
s[7]); co[7]=_mm_set1_ps(
c[7]); )
688 _MK = (__m128*)(pAVX[1]);
689 for(
int i=0;
i<I;
i+=4) {
690 _mk = _mm_and_ps(_mm_cmpgt_ps(*_MK++,_0),_1);
691 NETX(_a=_mm_add_ps(_mm_mul_ps(*_p0,co[0]),_mm_mul_ps(*_q0,si[0]));
692 _A=_mm_sub_ps(_mm_mul_ps(*_q0,co[0]),_mm_mul_ps(*_p0,si[0]));
693 *_p0++ = _mm_mul_ps(_mk,_mm_mul_ps(_a,aa[0]));
694 *_q0++ = _mm_mul_ps(_mk,_mm_mul_ps(_A,AA[0])); ,
696 _a=_mm_add_ps(_mm_mul_ps(*_p1,co[1]),_mm_mul_ps(*_q1,si[1]));
697 _A=_mm_sub_ps(_mm_mul_ps(*_q1,co[1]),_mm_mul_ps(*_p1,si[1]));
698 *_p1++ = _mm_mul_ps(_mk,_mm_mul_ps(_a,aa[1]));
699 *_q1++ = _mm_mul_ps(_mk,_mm_mul_ps(_A,AA[1])); ,
701 _a=_mm_add_ps(_mm_mul_ps(*_p2,co[2]),_mm_mul_ps(*_q2,si[2]));
702 _A=_mm_sub_ps(_mm_mul_ps(*_q2,co[2]),_mm_mul_ps(*_p2,si[2]));
703 *_p2++ = _mm_mul_ps(_mk,_mm_mul_ps(_a,aa[2]));
704 *_q2++ = _mm_mul_ps(_mk,_mm_mul_ps(_A,AA[2])); ,
706 _a=_mm_add_ps(_mm_mul_ps(*_p3,co[3]),_mm_mul_ps(*_q3,si[3]));
707 _A=_mm_sub_ps(_mm_mul_ps(*_q3,co[3]),_mm_mul_ps(*_p3,si[3]));
708 *_p3++ = _mm_mul_ps(_mk,_mm_mul_ps(_a,aa[3]));
709 *_q3++ = _mm_mul_ps(_mk,_mm_mul_ps(_A,AA[3])); ,
711 _a=_mm_add_ps(_mm_mul_ps(*_p4,co[4]),_mm_mul_ps(*_q4,si[4]));
712 _A=_mm_sub_ps(_mm_mul_ps(*_q4,co[4]),_mm_mul_ps(*_p4,si[4]));
713 *_p4++ = _mm_mul_ps(_mk,_mm_mul_ps(_a,aa[4]));
714 *_q4++ = _mm_mul_ps(_mk,_mm_mul_ps(_A,AA[4])); ,
716 _a=_mm_add_ps(_mm_mul_ps(*_p5,co[5]),_mm_mul_ps(*_q5,si[5]));
717 _A=_mm_sub_ps(_mm_mul_ps(*_q5,co[5]),_mm_mul_ps(*_p5,si[5]));
718 *_p5++ = _mm_mul_ps(_mk,_mm_mul_ps(_a,aa[5]));
719 *_q5++ = _mm_mul_ps(_mk,_mm_mul_ps(_A,AA[5])); ,
721 _a=_mm_add_ps(_mm_mul_ps(*_p6,co[6]),_mm_mul_ps(*_q6,si[6]));
722 _A=_mm_sub_ps(_mm_mul_ps(*_q6,co[6]),_mm_mul_ps(*_p6,si[6]));
723 *_p6++ = _mm_mul_ps(_mk,_mm_mul_ps(_a,aa[6]));
724 *_q6++ = _mm_mul_ps(_mk,_mm_mul_ps(_A,AA[6])); ,
726 _a=_mm_add_ps(_mm_mul_ps(*_p7,co[7]),_mm_mul_ps(*_q7,si[7]));
727 _A=_mm_sub_ps(_mm_mul_ps(*_q7,co[7]),_mm_mul_ps(*_p7,si[7]));
728 *_p7++ = _mm_mul_ps(_mk,_mm_mul_ps(_a,aa[7]));
729 *_q7++ = _mm_mul_ps(_mk,_mm_mul_ps(_A,AA[7])); )
// NOTE(review): fragment of a function whose name/opening was elided by the
// extraction (original lines 730-734 are missing). The visible parameter tail
// "(..., std::vector<float*>& pAVX, int I)" matches the _avx_setAMP_ps
// declaration listed at the end of this chunk — TODO confirm against the
// pristine source. The closing brace / return is also not visible here.
// What the visible body does: for each detector d it loads per-detector
// scalars from the q[d] scratch area (a_d = q[d][I4], s_d = q[d][I2],
// c_d = q[d][I3] — presumably amplitude / sin / cos; verify against caller),
// then per SSE lane rotates (p_d,q_d) by (c_d,s_d), scales by a_d, by the
// half-mask _mk, and by the per-lane noise vector *_n_d stored at q[d]+I,
// while accumulating the masked noise sum _Np.
735 std::vector<float*> &pAVX,
int I) {
// k is read from one element past the mask array; its use is not visible in
// this fragment (the remainder of the function was elided).
743 float k = pAVX[1][I];
745 __m128 _a, _A, _n, _mk, _nn;
// NOTE(review): "1." and "e-9" are one literal (1.e-9) split across lines by
// the extraction; same pattern recurs below in this file.
746 static const __m128 _o = _mm_set1_ps(1.
e-9);
747 static const __m128 _0 = _mm_set1_ps(0);
748 static const __m128 _1 = _mm_set1_ps(1);
749 static const __m128 _4 = _mm_set1_ps(4);
750 static const __m128 o5 = _mm_set1_ps(0.5);
// Per-detector stream pointers: _p/_q walk the data, _n walks the noise
// block stored immediately after the first I floats of q[d].
752 NETX(__m128* _p0 = (__m128*)p[0]; __m128* _q0 = (__m128*)q[0]; __m128* _n0 = (__m128*)(q[0]+I); ,
753 __m128* _p1 = (__m128*)p[1]; __m128* _q1 = (__m128*)q[1]; __m128* _n1 = (__m128*)(q[1]+I); ,
754 __m128* _p2 = (__m128*)p[2]; __m128* _q2 = (__m128*)q[2]; __m128* _n2 = (__m128*)(q[2]+I); ,
755 __m128* _p3 = (__m128*)p[3]; __m128* _q3 = (__m128*)q[3]; __m128* _n3 = (__m128*)(q[3]+I); ,
756 __m128* _p4 = (__m128*)p[4]; __m128* _q4 = (__m128*)q[4]; __m128* _n4 = (__m128*)(q[4]+I); ,
757 __m128* _p5 = (__m128*)p[5]; __m128* _q5 = (__m128*)q[5]; __m128* _n5 = (__m128*)(q[5]+I); ,
758 __m128* _p6 = (__m128*)p[6]; __m128* _q6 = (__m128*)q[6]; __m128* _n6 = (__m128*)(q[6]+I); ,
759 __m128* _p7 = (__m128*)p[7]; __m128* _q7 = (__m128*)q[7]; __m128* _n7 = (__m128*)(q[7]+I); )
// Broadcast per-detector scalars (indices I2/I3/I4 are defined outside this
// fragment — TODO confirm their meaning).
761 NETX(__m128 a0=_mm_set1_ps(q[0][I4]); __m128 s0=_mm_set1_ps(q[0][I2]); __m128 c0=_mm_set1_ps(q[0][I3]); ,
762 __m128 a1=_mm_set1_ps(q[1][I4]); __m128 s1=_mm_set1_ps(q[1][I2]); __m128
c1=_mm_set1_ps(q[1][I3]); ,
763 __m128
a2=_mm_set1_ps(q[2][I4]); __m128 s2=_mm_set1_ps(q[2][I2]); __m128
c2=_mm_set1_ps(q[2][I3]); ,
764 __m128 a3=_mm_set1_ps(q[3][I4]); __m128 s3=_mm_set1_ps(q[3][I2]); __m128
c3=_mm_set1_ps(q[3][I3]); ,
765 __m128 a4=_mm_set1_ps(q[4][I4]); __m128 s4=_mm_set1_ps(q[4][I2]); __m128 c4=_mm_set1_ps(q[4][I3]); ,
766 __m128 a5=_mm_set1_ps(q[5][I4]); __m128 s5=_mm_set1_ps(q[5][I2]); __m128 c5=_mm_set1_ps(q[5][I3]); ,
767 __m128 a6=_mm_set1_ps(q[6][I4]); __m128 s6=_mm_set1_ps(q[6][I2]); __m128 c6=_mm_set1_ps(q[6][I3]); ,
768 __m128 a7=_mm_set1_ps(q[7][I4]); __m128 s7=_mm_set1_ps(q[7][I2]); __m128 c7=_mm_set1_ps(q[7][I3]); )
// Auxiliary arrays carried in pAVX (indices fixed by project convention;
// names _fp/_fx/_ee/_EE/_gn suggest antenna patterns and energy buffers —
// TODO confirm, they are not dereferenced in the visible lines).
770 __m128* _MK = (__m128*)pAVX[1];
771 __m128* _fp = (__m128*)pAVX[2];
772 __m128* _fx = (__m128*)pAVX[3];
773 __m128* _ee = (__m128*)pAVX[15];
774 __m128* _EE = (__m128*)pAVX[16];
775 __m128* _gn = (__m128*)pAVX[20];
777 __m128 _Np = _mm_setzero_ps();
779 for(
int i=0;
i<I;
i+=4) {
// _mk = 0.5 where *_MK > 0, else 0 (0/1 mask scaled by 0.5).
780 _mk = _mm_mul_ps(o5,_mm_and_ps(_mm_cmpgt_ps(*_MK++,_0),_1));
// Per detector: n = a_d*mk*noise; p' = n*(p*c_d - q*s_d);
// q' = n*(q*c_d + p*s_d); _nn accumulates the raw noise across detectors.
782 _n = _mm_mul_ps(_mm_mul_ps(a0,_mk),*_n0); _a=*_p0; _A=*_q0; _nn=*_n0++;
783 *_p0++ = _mm_mul_ps(_n,_mm_sub_ps(_mm_mul_ps(_a,c0),_mm_mul_ps(_A,s0)));
784 *_q0++ = _mm_mul_ps(_n,_mm_add_ps(_mm_mul_ps(_A,c0),_mm_mul_ps(_a,s0))); ,
785 _n = _mm_mul_ps(_mm_mul_ps(a1,_mk),*_n1); _a=*_p1; _A=*_q1; _nn=_mm_add_ps(_nn,*_n1++);
786 *_p1++ = _mm_mul_ps(_n,_mm_sub_ps(_mm_mul_ps(_a,c1),_mm_mul_ps(_A,s1)));
787 *_q1++ = _mm_mul_ps(_n,_mm_add_ps(_mm_mul_ps(_A,c1),_mm_mul_ps(_a,s1))); ,
788 _n = _mm_mul_ps(_mm_mul_ps(a2,_mk),*_n2); _a=*_p2; _A=*_q2; _nn=_mm_add_ps(_nn,*_n2++);
789 *_p2++ = _mm_mul_ps(_n,_mm_sub_ps(_mm_mul_ps(_a,c2),_mm_mul_ps(_A,s2)));
790 *_q2++ = _mm_mul_ps(_n,_mm_add_ps(_mm_mul_ps(_A,c2),_mm_mul_ps(_a,s2))); ,
791 _n = _mm_mul_ps(_mm_mul_ps(a3,_mk),*_n3); _a=*_p3; _A=*_q3; _nn=_mm_add_ps(_nn,*_n3++);
792 *_p3++ = _mm_mul_ps(_n,_mm_sub_ps(_mm_mul_ps(_a,c3),_mm_mul_ps(_A,s3)));
793 *_q3++ = _mm_mul_ps(_n,_mm_add_ps(_mm_mul_ps(_A,c3),_mm_mul_ps(_a,s3))); ,
794 _n = _mm_mul_ps(_mm_mul_ps(a4,_mk),*_n4); _a=*_p4; _A=*_q4; _nn=_mm_add_ps(_nn,*_n4++);
795 *_p4++ = _mm_mul_ps(_n,_mm_sub_ps(_mm_mul_ps(_a,c4),_mm_mul_ps(_A,s4)));
796 *_q4++ = _mm_mul_ps(_n,_mm_add_ps(_mm_mul_ps(_A,c4),_mm_mul_ps(_a,s4))); ,
797 _n = _mm_mul_ps(_mm_mul_ps(a5,_mk),*_n5); _a=*_p5; _A=*_q5; _nn=_mm_add_ps(_nn,*_n5++);
798 *_p5++ = _mm_mul_ps(_n,_mm_sub_ps(_mm_mul_ps(_a,c5),_mm_mul_ps(_A,s5)));
799 *_q5++ = _mm_mul_ps(_n,_mm_add_ps(_mm_mul_ps(_A,c5),_mm_mul_ps(_a,s5))); ,
800 _n = _mm_mul_ps(_mm_mul_ps(a6,_mk),*_n6); _a=*_p6; _A=*_q6; _nn=_mm_add_ps(_nn,*_n6++);
801 *_p6++ = _mm_mul_ps(_n,_mm_sub_ps(_mm_mul_ps(_a,c6),_mm_mul_ps(_A,s6)));
802 *_q6++ = _mm_mul_ps(_n,_mm_add_ps(_mm_mul_ps(_A,c6),_mm_mul_ps(_a,s6))); ,
803 _n = _mm_mul_ps(_mm_mul_ps(a7,_mk),*_n7); _a=*_p7; _A=*_q7; _nn=_mm_add_ps(_nn,*_n7++);
804 *_p7++ = _mm_mul_ps(_n,_mm_sub_ps(_mm_mul_ps(_a,c7),_mm_mul_ps(_A,s7)));
805 *_q7++ = _mm_mul_ps(_n,_mm_add_ps(_mm_mul_ps(_A,c7),_mm_mul_ps(_a,s7))); ,
// Accumulate the masked per-lane noise sum; the function's use of _Np after
// the loop (and its return statement) is not visible in this chunk.
807 _nn = _mm_mul_ps(_nn,_mk);
808 _Np = _mm_add_ps(_Np,_nn);
// NOTE(review): fragment of a function whose name/opening was elided
// (original lines 809-813 are missing). The "(pAVX, int I)" tail matches the
// _avx_ort_ps declaration listed at the end of this chunk — TODO confirm.
// What the visible body shows: per SSE lane it sums, over detectors,
//   aa = Σ p_d², AA = Σ q_d², aA = Σ p_d*q_d,
// then forms si = 2*aA, co = aa-AA, n = sqrt(co²+si²) and splits the total
// energy et = aa+AA into ee = (et+n)/2 and EE = (et-n)/2, finally replacing
// (si,co) with the half-angle sin/cos of the rotation (sign from sign(si)).
// Masked sums _e/_E are accumulated; their use after the loop is not visible.
814 std::vector<float*> &pAVX,
int I) {
821 __m128 _a,_A,_aa,_AA,_aA,_et,_cc,_ss,_nn,_cn,_sn,_mk;
823 static const __m128 _0 = _mm_set1_ps(0);
824 static const __m128 _1 = _mm_set1_ps(1);
825 static const __m128 _2 = _mm_set1_ps(2);
// _o = 1.e-21 (literal split by the extraction); regularizer against
// division by zero below.
826 static const __m128 _o = _mm_set1_ps(1.
e-21);
// sm = -0.f, i.e. only the sign bit set; declared here but not used in the
// visible lines (presumably for a sign trick in the elided remainder).
827 static const __m128 sm = _mm_set1_ps(-0.
f);
829 __m128* _MK = (__m128*)pAVX[1];
830 __m128* _si = (__m128*)pAVX[4];
831 __m128* _co = (__m128*)pAVX[5];
832 __m128* _ee = (__m128*)pAVX[15];
833 __m128* _EE = (__m128*)pAVX[16];
834 __m128 _e = _mm_setzero_ps();
835 __m128 _E = _mm_setzero_ps();
837 NETX(__m128* _p0 = (__m128*)p[0]; __m128* _q0 = (__m128*)q[0]; ,
838 __m128* _p1 = (__m128*)p[1]; __m128* _q1 = (__m128*)q[1]; ,
839 __m128* _p2 = (__m128*)p[2]; __m128* _q2 = (__m128*)q[2]; ,
840 __m128* _p3 = (__m128*)p[3]; __m128* _q3 = (__m128*)q[3]; ,
841 __m128* _p4 = (__m128*)p[4]; __m128* _q4 = (__m128*)q[4]; ,
842 __m128* _p5 = (__m128*)p[5]; __m128* _q5 = (__m128*)q[5]; ,
843 __m128* _p6 = (__m128*)p[6]; __m128* _q6 = (__m128*)q[6]; ,
844 __m128* _p7 = (__m128*)p[7]; __m128* _q7 = (__m128*)q[7]; )
846 for(
int i=0;
i<I;
i+=4) {
// Detector 0 initializes the sums; detectors 1..7 accumulate.
848 _aa=_mm_mul_ps(*_p0,*_p0);
849 _AA=_mm_mul_ps(*_q0,*_q0);
850 _aA=_mm_mul_ps(*_p0++,*_q0++); ,
852 _aa = _mm_add_ps(_aa,_mm_mul_ps(*_p1,*_p1));
853 _AA = _mm_add_ps(_AA,_mm_mul_ps(*_q1,*_q1));
854 _aA = _mm_add_ps(_aA,_mm_mul_ps(*_p1++,*_q1++)); ,
856 _aa = _mm_add_ps(_aa,_mm_mul_ps(*_p2,*_p2));
857 _AA = _mm_add_ps(_AA,_mm_mul_ps(*_q2,*_q2));
858 _aA = _mm_add_ps(_aA,_mm_mul_ps(*_p2++,*_q2++)); ,
860 _aa = _mm_add_ps(_aa,_mm_mul_ps(*_p3,*_p3));
861 _AA = _mm_add_ps(_AA,_mm_mul_ps(*_q3,*_q3));
862 _aA = _mm_add_ps(_aA,_mm_mul_ps(*_p3++,*_q3++)); ,
864 _aa = _mm_add_ps(_aa,_mm_mul_ps(*_p4,*_p4));
865 _AA = _mm_add_ps(_AA,_mm_mul_ps(*_q4,*_q4));
866 _aA = _mm_add_ps(_aA,_mm_mul_ps(*_p4++,*_q4++)); ,
868 _aa = _mm_add_ps(_aa,_mm_mul_ps(*_p5,*_p5));
869 _AA = _mm_add_ps(_AA,_mm_mul_ps(*_q5,*_q5));
870 _aA = _mm_add_ps(_aA,_mm_mul_ps(*_p5++,*_q5++)); ,
872 _aa = _mm_add_ps(_aa,_mm_mul_ps(*_p6,*_p6));
873 _AA = _mm_add_ps(_AA,_mm_mul_ps(*_q6,*_q6));
874 _aA = _mm_add_ps(_aA,_mm_mul_ps(*_p6++,*_q6++)); ,
876 _aa = _mm_add_ps(_aa,_mm_mul_ps(*_p7,*_p7));
877 _AA = _mm_add_ps(_AA,_mm_mul_ps(*_q7,*_q7));
878 _aA = _mm_add_ps(_aA,_mm_mul_ps(*_p7++,*_q7++)); )
// 0/1 mask from *_MK > 0 (mask applied only to the _e/_E accumulators; the
// per-lane outputs *_si/*_co/*_ee/*_EE are written unconditionally).
882 _mk = _mm_and_ps(_mm_cmpgt_ps(*_MK,_0),_1);
883 *_si = _mm_mul_ps(_aA,_2);
884 *_co = _mm_sub_ps(_aa,_AA);
885 _et = _mm_add_ps(_mm_add_ps(_aa,_AA),_o);
886 _cc = _mm_mul_ps(*_co,*_co);
887 _ss = _mm_mul_ps(*_si,*_si);
888 _nn = _mm_sqrt_ps(_mm_add_ps(_cc,_ss));
889 *_ee = _mm_div_ps(_mm_add_ps(_et,_nn),_2);
890 *_EE = _mm_div_ps(_mm_sub_ps(_et,_nn),_2);
// Half-angle formulas: cc = cos(2w); si = sin(w), co = ±cos(w) with the
// sign ss = ±1 taken from sign(sin(2w)).
891 _cc = _mm_div_ps(*_co,_mm_add_ps(_nn,_o));
892 _nn = _mm_and_ps(_mm_cmpgt_ps(*_si,_0),_1);
893 _ss = _mm_sub_ps(_mm_mul_ps(_2,_nn),_1);
894 *_si = _mm_sqrt_ps(_mm_div_ps(_mm_sub_ps(_1,_cc),_2));
895 *_co = _mm_sqrt_ps(_mm_div_ps(_mm_add_ps(_1,_cc),_2));
896 *_co = _mm_mul_ps(*_co,_ss);
898 _e = _mm_add_ps(_e,_mm_mul_ps(_mk,*_ee));
899 _E = _mm_add_ps(_E,_mm_mul_ps(_mk,*_EE));
900 _si++; _co++; _MK++; _ee++; _EE++;
// NOTE(review): fragment of a four-array statistics function; the first half
// of its signature was elided (original line 907 starts mid-parameter-list).
// The visible tail "(..., float** s, float** S, pAVX, int I)" matches the
// _avx_stat_ps(x, X, s, S, pAVX, I) declaration at the end of this chunk —
// TODO confirm. The scalar reductions that define rn/nn/ec/cc used by the
// return statement were also elided (original lines 1044-1048 are missing).
907 float**
s,
float**
S,
908 std::vector<float*> &pAVX,
int I) {
918 float k = pAVX[1][I];
919 __m128 _a,_A,_x,_X,_c,_C,_s,_S,_r,_R,_rr,_RR,_xs,_XS;
920 __m128 _ll,_ei,_cc,_mk,_mm,_ss,_SS;
921 static const __m128 _o = _mm_set1_ps(0.001);
922 static const __m128 _0 = _mm_set1_ps(0);
923 static const __m128 _1 = _mm_set1_ps(1);
924 static const __m128 _2 = _mm_set1_ps(2);
925 static const __m128 _k = _mm_set1_ps(2*(1-k));
// sm = -0.f: sign-bit-only constant; used below with _mm_andnot_ps to take
// the absolute value of *_ec.
926 static const __m128 sm = _mm_set1_ps(-0.
f);
// Working arrays in pAVX (project index convention): mask, sin/cos of the
// DPF angle, and per-lane output buffers ec/gn/ed/rn.
928 __m128* _et = (__m128*)pAVX[0];
929 __m128* _MK = (__m128*)pAVX[1];
930 __m128* _si = (__m128*)pAVX[4];
931 __m128* _co = (__m128*)pAVX[5];
932 __m128* _ec = (__m128*)pAVX[19];
933 __m128* _gn = (__m128*)pAVX[20];
934 __m128* _ed = (__m128*)pAVX[21];
935 __m128* _rn = (__m128*)pAVX[22];
937 __m128 _LL = _mm_setzero_ps();
938 __m128 _Lr = _mm_setzero_ps();
939 __m128 _EC = _mm_setzero_ps();
940 __m128 _GN = _mm_setzero_ps();
941 __m128 _RN = _mm_setzero_ps();
942 __m128 _NN = _mm_setzero_ps();
944 NETX(__m128* _x0 = (__m128*)x[0]; __m128* _X0 = (__m128*)X[0]; ,
945 __m128* _x1 = (__m128*)x[1]; __m128* _X1 = (__m128*)X[1]; ,
946 __m128* _x2 = (__m128*)x[2]; __m128* _X2 = (__m128*)X[2]; ,
947 __m128* _x3 = (__m128*)x[3]; __m128* _X3 = (__m128*)X[3]; ,
948 __m128* _x4 = (__m128*)x[4]; __m128* _X4 = (__m128*)X[4]; ,
949 __m128* _x5 = (__m128*)x[5]; __m128* _X5 = (__m128*)X[5]; ,
950 __m128* _x6 = (__m128*)x[6]; __m128* _X6 = (__m128*)X[6]; ,
951 __m128* _x7 = (__m128*)x[7]; __m128* _X7 = (__m128*)X[7]; )
953 NETX(__m128* _s0 = (__m128*)s[0]; __m128* _S0 = (__m128*)S[0]; ,
954 __m128* _s1 = (__m128*)s[1]; __m128* _S1 = (__m128*)S[1]; ,
955 __m128* _s2 = (__m128*)s[2]; __m128* _S2 = (__m128*)S[2]; ,
956 __m128* _s3 = (__m128*)s[3]; __m128* _S3 = (__m128*)S[3]; ,
957 __m128* _s4 = (__m128*)s[4]; __m128* _S4 = (__m128*)S[4]; ,
958 __m128* _s5 = (__m128*)s[5]; __m128* _S5 = (__m128*)S[5]; ,
959 __m128* _s6 = (__m128*)s[6]; __m128* _S6 = (__m128*)S[6]; ,
960 __m128* _s7 = (__m128*)s[7]; __m128* _S7 = (__m128*)S[7]; )
962 for(
int i=0;
i<I;
i+=4) {
// Per detector d (0 initializes, 1..7 accumulate):
//   s' = s*co + S*si, S' = S*co - s*si   (rotate the s-pair)
//   x' = x*co + X*si, X' = X*co - x*si   (rotate the x-pair)
//   r  = s - x, R = S - X                (residuals, computed pre-rotation)
// and the running sums: xs=Σs'x', XS=ΣS'X', c=Σ(s'x')², C=Σ(S'X')²,
// ss=Σs'², SS=ΣS'², rr=Σr², RR=ΣR².
964 _s=_mm_add_ps(_mm_mul_ps(*_s0,*_co),_mm_mul_ps(*_S0,*_si)); _r=_mm_sub_ps(*_s0,*_x0);
965 _x=_mm_add_ps(_mm_mul_ps(*_x0,*_co),_mm_mul_ps(*_X0,*_si)); _R=_mm_sub_ps(*_S0,*_X0);
966 _S=_mm_sub_ps(_mm_mul_ps(*_S0++,*_co),_mm_mul_ps(*_s0++,*_si)); _a=_mm_mul_ps(_s,_x); _xs=_a;
967 _X=_mm_sub_ps(_mm_mul_ps(*_X0++,*_co),_mm_mul_ps(*_x0++,*_si)); _A=_mm_mul_ps(_S,_X); _XS=_A;
968 _c=_mm_mul_ps(_a,_a); _ss=_mm_mul_ps(_s,_s); _rr=_mm_mul_ps(_r,_r);
969 _C=_mm_mul_ps(_A,_A); _SS=_mm_mul_ps(_S,_S); _RR=_mm_mul_ps(_R,_R); ,
971 _s=_mm_add_ps(_mm_mul_ps(*_s1,*_co), _mm_mul_ps(*_S1,*_si)); _r=_mm_sub_ps(*_s1,*_x1);
972 _x=_mm_add_ps(_mm_mul_ps(*_x1,*_co), _mm_mul_ps(*_X1,*_si)); _R=_mm_sub_ps(*_S1,*_X1);
973 _S=_mm_sub_ps(_mm_mul_ps(*_S1++,*_co),_mm_mul_ps(*_s1++,*_si)); _a=_mm_mul_ps(_s,_x); _xs=_mm_add_ps(_xs,_a);
974 _X=_mm_sub_ps(_mm_mul_ps(*_X1++,*_co),_mm_mul_ps(*_x1++,*_si)); _A=_mm_mul_ps(_S,_X); _XS=_mm_add_ps(_XS,_A);
975 _c=_mm_add_ps(_c,_mm_mul_ps(_a,_a)); _ss=_mm_add_ps(_ss,_mm_mul_ps(_s,_s)); _rr=_mm_add_ps(_rr,_mm_mul_ps(_r,_r));
976 _C=_mm_add_ps(_C,_mm_mul_ps(_A,_A)); _SS=_mm_add_ps(_SS,_mm_mul_ps(_S,_S)); _RR=_mm_add_ps(_RR,_mm_mul_ps(_R,_R)); ,
978 _s=_mm_add_ps(_mm_mul_ps(*_s2,*_co), _mm_mul_ps(*_S2,*_si)); _r=_mm_sub_ps(*_s2,*_x2);
979 _x=_mm_add_ps(_mm_mul_ps(*_x2,*_co), _mm_mul_ps(*_X2,*_si)); _R=_mm_sub_ps(*_S2,*_X2);
980 _S=_mm_sub_ps(_mm_mul_ps(*_S2++,*_co),_mm_mul_ps(*_s2++,*_si)); _a=_mm_mul_ps(_s,_x); _xs=_mm_add_ps(_xs,_a);
981 _X=_mm_sub_ps(_mm_mul_ps(*_X2++,*_co),_mm_mul_ps(*_x2++,*_si)); _A=_mm_mul_ps(_S,_X); _XS=_mm_add_ps(_XS,_A);
982 _c=_mm_add_ps(_c,_mm_mul_ps(_a,_a)); _ss=_mm_add_ps(_ss,_mm_mul_ps(_s,_s)); _rr=_mm_add_ps(_rr,_mm_mul_ps(_r,_r));
983 _C=_mm_add_ps(_C,_mm_mul_ps(_A,_A)); _SS=_mm_add_ps(_SS,_mm_mul_ps(_S,_S)); _RR=_mm_add_ps(_RR,_mm_mul_ps(_R,_R)); ,
985 _s=_mm_add_ps(_mm_mul_ps(*_s3,*_co), _mm_mul_ps(*_S3,*_si)); _r=_mm_sub_ps(*_s3,*_x3);
986 _x=_mm_add_ps(_mm_mul_ps(*_x3,*_co), _mm_mul_ps(*_X3,*_si)); _R=_mm_sub_ps(*_S3,*_X3);
987 _S=_mm_sub_ps(_mm_mul_ps(*_S3++,*_co),_mm_mul_ps(*_s3++,*_si)); _a=_mm_mul_ps(_s,_x); _xs=_mm_add_ps(_xs,_a);
988 _X=_mm_sub_ps(_mm_mul_ps(*_X3++,*_co),_mm_mul_ps(*_x3++,*_si)); _A=_mm_mul_ps(_S,_X); _XS=_mm_add_ps(_XS,_A);
989 _c=_mm_add_ps(_c,_mm_mul_ps(_a,_a)); _ss=_mm_add_ps(_ss,_mm_mul_ps(_s,_s)); _rr=_mm_add_ps(_rr,_mm_mul_ps(_r,_r));
990 _C=_mm_add_ps(_C,_mm_mul_ps(_A,_A)); _SS=_mm_add_ps(_SS,_mm_mul_ps(_S,_S)); _RR=_mm_add_ps(_RR,_mm_mul_ps(_R,_R)); ,
992 _s=_mm_add_ps(_mm_mul_ps(*_s4,*_co), _mm_mul_ps(*_S4,*_si)); _r=_mm_sub_ps(*_s4,*_x4);
993 _x=_mm_add_ps(_mm_mul_ps(*_x4,*_co), _mm_mul_ps(*_X4,*_si)); _R=_mm_sub_ps(*_S4,*_X4);
994 _S=_mm_sub_ps(_mm_mul_ps(*_S4++,*_co),_mm_mul_ps(*_s4++,*_si)); _a=_mm_mul_ps(_s,_x); _xs=_mm_add_ps(_xs,_a);
995 _X=_mm_sub_ps(_mm_mul_ps(*_X4++,*_co),_mm_mul_ps(*_x4++,*_si)); _A=_mm_mul_ps(_S,_X); _XS=_mm_add_ps(_XS,_A);
996 _c=_mm_add_ps(_c,_mm_mul_ps(_a,_a)); _ss=_mm_add_ps(_ss,_mm_mul_ps(_s,_s)); _rr=_mm_add_ps(_rr,_mm_mul_ps(_r,_r));
997 _C=_mm_add_ps(_C,_mm_mul_ps(_A,_A)); _SS=_mm_add_ps(_SS,_mm_mul_ps(_S,_S)); _RR=_mm_add_ps(_RR,_mm_mul_ps(_R,_R)); ,
999 _s=_mm_add_ps(_mm_mul_ps(*_s5,*_co), _mm_mul_ps(*_S5,*_si)); _r=_mm_sub_ps(*_s5,*_x5);
1000 _x=_mm_add_ps(_mm_mul_ps(*_x5,*_co), _mm_mul_ps(*_X5,*_si)); _R=_mm_sub_ps(*_S5,*_X5);
1001 _S=_mm_sub_ps(_mm_mul_ps(*_S5++,*_co),_mm_mul_ps(*_s5++,*_si)); _a=_mm_mul_ps(_s,_x); _xs=_mm_add_ps(_xs,_a);
1002 _X=_mm_sub_ps(_mm_mul_ps(*_X5++,*_co),_mm_mul_ps(*_x5++,*_si)); _A=_mm_mul_ps(_S,_X); _XS=_mm_add_ps(_XS,_A);
1003 _c=_mm_add_ps(_c,_mm_mul_ps(_a,_a)); _ss=_mm_add_ps(_ss,_mm_mul_ps(_s,_s)); _rr=_mm_add_ps(_rr,_mm_mul_ps(_r,_r));
1004 _C=_mm_add_ps(_C,_mm_mul_ps(_A,_A)); _SS=_mm_add_ps(_SS,_mm_mul_ps(_S,_S)); _RR=_mm_add_ps(_RR,_mm_mul_ps(_R,_R)); ,
1006 _s=_mm_add_ps(_mm_mul_ps(*_s6,*_co), _mm_mul_ps(*_S6,*_si)); _r=_mm_sub_ps(*_s6,*_x6);
1007 _x=_mm_add_ps(_mm_mul_ps(*_x6,*_co), _mm_mul_ps(*_X6,*_si)); _R=_mm_sub_ps(*_S6,*_X6);
1008 _S=_mm_sub_ps(_mm_mul_ps(*_S6++,*_co),_mm_mul_ps(*_s6++,*_si)); _a=_mm_mul_ps(_s,_x); _xs=_mm_add_ps(_xs,_a);
1009 _X=_mm_sub_ps(_mm_mul_ps(*_X6++,*_co),_mm_mul_ps(*_x6++,*_si)); _A=_mm_mul_ps(_S,_X); _XS=_mm_add_ps(_XS,_A);
1010 _c=_mm_add_ps(_c,_mm_mul_ps(_a,_a)); _ss=_mm_add_ps(_ss,_mm_mul_ps(_s,_s)); _rr=_mm_add_ps(_rr,_mm_mul_ps(_r,_r));
1011 _C=_mm_add_ps(_C,_mm_mul_ps(_A,_A)); _SS=_mm_add_ps(_SS,_mm_mul_ps(_S,_S)); _RR=_mm_add_ps(_RR,_mm_mul_ps(_R,_R)); ,
1013 _s=_mm_add_ps(_mm_mul_ps(*_s7,*_co), _mm_mul_ps(*_S7,*_si)); _r=_mm_sub_ps(*_s7,*_x7);
1014 _x=_mm_add_ps(_mm_mul_ps(*_x7,*_co), _mm_mul_ps(*_X7,*_si)); _R=_mm_sub_ps(*_S7,*_X7);
1015 _S=_mm_sub_ps(_mm_mul_ps(*_S7++,*_co),_mm_mul_ps(*_s7++,*_si)); _a=_mm_mul_ps(_s,_x); _xs=_mm_add_ps(_xs,_a);
1016 _X=_mm_sub_ps(_mm_mul_ps(*_X7++,*_co),_mm_mul_ps(*_x7++,*_si)); _A=_mm_mul_ps(_S,_X); _XS=_mm_add_ps(_XS,_A);
1017 _c=_mm_add_ps(_c,_mm_mul_ps(_a,_a)); _ss=_mm_add_ps(_ss,_mm_mul_ps(_s,_s)); _rr=_mm_add_ps(_rr,_mm_mul_ps(_r,_r));
1018 _C=_mm_add_ps(_C,_mm_mul_ps(_A,_A)); _SS=_mm_add_ps(_SS,_mm_mul_ps(_S,_S)); _RR=_mm_add_ps(_RR,_mm_mul_ps(_R,_R)); )
// Note: mask uses >= here (cmpge), unlike the > used elsewhere in this file
// — presumably intentional (includes zero-valued mask lanes); verify.
1021 _mk = _mm_and_ps(_mm_cmpge_ps(*_MK,_0),_1);
// Normalized alignment factors c, C in [0,1] (with _o = 0.001 regularizer);
// ec = masked incoherent remainder of ss+SS; gn/rn per-lane outputs.
1022 _c = _mm_div_ps(_c,_mm_add_ps(_mm_mul_ps(_xs,_xs),_o));
1023 _C = _mm_div_ps(_C,_mm_add_ps(_mm_mul_ps(_XS,_XS),_o));
1024 _ll = _mm_mul_ps(_mk,_mm_add_ps(_ss,_SS));
1025 _ss = _mm_mul_ps(_ss,_mm_sub_ps(_1,_c));
1026 _SS = _mm_mul_ps(_SS,_mm_sub_ps(_1,_C));
1027 *_ec = _mm_mul_ps(_mk,_mm_add_ps(_ss,_SS));
1028 *_gn = _mm_mul_ps(_mk,_mm_mul_ps(_2,*_MK));
1029 *_rn = _mm_mul_ps(_mk,_mm_add_ps(_rr,_RR));
// _a = 2*|ec| (andnot with the sign-bit mask clears the sign bit);
// _cc = ec / (2|ec| + rn + gn + o) — a per-lane correlation factor.
1031 _a = _mm_mul_ps(_2,_mm_andnot_ps(sm,*_ec));
1032 _A = _mm_add_ps(*_rn,_mm_add_ps(_o,*_gn));
1033 _cc = _mm_div_ps(*_ec,_mm_add_ps(_a,_A));
1034 _Lr = _mm_add_ps(_Lr,_mm_mul_ps(_ll,_cc));
1035 _mm = _mm_and_ps(_mm_cmpgt_ps(*_ec,_o),_1);
1037 _LL = _mm_add_ps(_LL, _ll);
1038 _GN = _mm_add_ps(_GN,*_gn);
1039 _EC = _mm_add_ps(_EC,*_ec);
1040 _RN = _mm_add_ps(_RN,*_rn);
1041 _NN = _mm_add_ps(_NN, _mm);
1043 _si++; _co++; _MK++; _ec++; _gn++; _ed++; _rn++;
// NOTE(review): the loop close and the scalar reductions defining rn, nn,
// ec, cc (original lines 1044-1048) were lost by the extraction; ch below
// is computed but not visibly used. TODO restore from the pristine source.
1049 float ch = rn/(k*nn+sqrt(nn))/2;
1050 return _mm256_set_ps(0,0,0,0,rn/2,nn,ec,2*cc);
// _avx_noise_ps: per-pixel noise/energy bookkeeping over the detector network.
// For each SSE lane it averages the per-detector arrays stored at p[d]+I
// (ns) and q[d]+I (nx) via _k = 1/k, classifies lanes by sign of nx/ns under
// the event mask, and accumulates the sums returned (packed) as an __m256.
// NOTE(review): the loop close and the horizontal-sum section that defines
// the scalars ns, nc, es, eh, rc, sc, ec, gn used by the return statement
// (original lines 1130-1138) were lost by the extraction; the leading
// integers in each line are fused original line numbers, not C++.
1054 static inline __m256
_avx_noise_ps(
float** p,
float** q, std::vector<float*> &pAVX,
int I) {
1058 float k = pAVX[1][I];
1059 __m128 _nx,_ns,_nm,_mk,_gg,_rc,_nn;
// Working arrays in pAVX (project index convention): total energy, mask,
// coherent energy, gain, residual noise.
1060 __m128* _et = (__m128*)pAVX[0];
1061 __m128* _MK = (__m128*)pAVX[1];
1062 __m128* _ec = (__m128*)pAVX[19];
1063 __m128* _gn = (__m128*)pAVX[20];
1064 __m128* _rn = (__m128*)pAVX[22];
1065 __m128 _GN = _mm_setzero_ps();
1066 __m128 _RC = _mm_setzero_ps();
1067 __m128 _ES = _mm_setzero_ps();
1068 __m128 _EH = _mm_setzero_ps();
1069 __m128 _EC = _mm_setzero_ps();
1070 __m128 _SC = _mm_setzero_ps();
1071 __m128 _NC = _mm_setzero_ps();
1072 __m128 _NS = _mm_setzero_ps();
1074 static const __m128 _0 = _mm_set1_ps(0);
1075 static const __m128 o5 = _mm_set1_ps(0.5);
// _o = 1.e-9 (literal split by the extraction): regularizer for divisions.
1076 static const __m128 _o = _mm_set1_ps(1.
e-9);
1077 static const __m128 _1 = _mm_set1_ps(1);
1078 static const __m128 _2 = _mm_set1_ps(2);
1079 static const __m128 _k = _mm_set1_ps(1./k);
// Per-detector auxiliary streams stored after the first I floats of p/q.
1081 NETX(__m128* _s0 = (__m128*)(p[0]+I);, __m128* _s1 = (__m128*)(p[1]+I);,
1082 __m128* _s2 = (__m128*)(p[2]+I);, __m128* _s3 = (__m128*)(p[3]+I);,
1083 __m128* _s4 = (__m128*)(p[4]+I);, __m128* _s5 = (__m128*)(p[5]+I);,
1084 __m128* _s6 = (__m128*)(p[6]+I);, __m128* _s7 = (__m128*)(p[7]+I);)
1086 NETX(__m128* _x0 = (__m128*)(q[0]+I);, __m128* _x1 = (__m128*)(q[1]+I);,
1087 __m128* _x2 = (__m128*)(q[2]+I);, __m128* _x3 = (__m128*)(q[3]+I);,
1088 __m128* _x4 = (__m128*)(q[4]+I);, __m128* _x5 = (__m128*)(q[5]+I);,
1089 __m128* _x6 = (__m128*)(q[6]+I);, __m128* _x7 = (__m128*)(q[7]+I);)
1091 for(
int i=0;
i<I;
i+=4) {
// Sum the per-detector streams across the network for this lane.
1092 NETX(_ns =*_s0++;, _ns = _mm_add_ps(*_s1++,_ns); ,
1093 _ns = _mm_add_ps(*_s2++,_ns);, _ns = _mm_add_ps(*_s3++,_ns); ,
1094 _ns = _mm_add_ps(*_s4++,_ns);, _ns = _mm_add_ps(*_s5++,_ns); ,
1095 _ns = _mm_add_ps(*_s6++,_ns);, _ns = _mm_add_ps(*_s7++,_ns); )
1097 NETX(_nx =*_x0++;, _nx = _mm_add_ps(*_x1++,_nx); ,
1098 _nx = _mm_add_ps(*_x2++,_nx);, _nx = _mm_add_ps(*_x3++,_nx); ,
1099 _nx = _mm_add_ps(*_x4++,_nx);, _nx = _mm_add_ps(*_x5++,_nx); ,
1100 _nx = _mm_add_ps(*_x6++,_nx);, _nx = _mm_add_ps(*_x7++,_nx); )
// Normalize by 1/k, then split masked lanes by sign of nx:
// nx>0 lanes feed EC/NC, the rest feed ES.
1102 _ns = _mm_mul_ps(_ns,_k);
1103 _nx = _mm_mul_ps(_nx,_k);
1104 _mk = _mm_and_ps(_mm_cmpgt_ps(*_MK,_0),_1);
1105 _nm = _mm_and_ps(_mm_cmpgt_ps(_nx,_0),_1);
1106 _nm = _mm_mul_ps(_mk,_nm);
1107 _EC = _mm_add_ps(_EC,_mm_mul_ps(_nm,*_ec));
1108 _NC = _mm_add_ps(_NC,_nm);
1110 _nm = _mm_sub_ps(_mk,_nm);
1111 _ES = _mm_add_ps(_ES,_mm_mul_ps(_nm,*_rn));
// rc = ec / (gn<2 ? 1 : gn/2), regularized by _o: piecewise divisor built
// branch-free from the gn<2 mask.
1113 _rc = _mm_and_ps(_mm_cmplt_ps(*_gn,_2),_1);
1114 _nn = _mm_mul_ps(*_gn,_mm_sub_ps(_1,_rc));
1115 _rc = _mm_add_ps(_rc,_mm_mul_ps(_nn,o5));
1116 _rc = _mm_div_ps(*_ec,_mm_add_ps(_rc,_o));
// Same masked split by sign of ns: those lanes feed SC/RC/GN/NS; *_gn is
// rewritten in place as mk*gn*nx; the complement feeds EH with *_et.
1118 _nm = _mm_and_ps(_mm_cmpgt_ps(_ns,_0),_1);
1119 _nm = _mm_mul_ps(_mk,_nm);
1120 *_gn = _mm_mul_ps(_mk,_mm_mul_ps(*_gn,_nx));
1121 _SC = _mm_add_ps(_SC,_mm_mul_ps(_nm,*_ec));
1122 _RC = _mm_add_ps(_RC,_mm_mul_ps(_nm, _rc));
1123 _GN = _mm_add_ps(_GN,_mm_mul_ps(_nm,*_gn));
1124 _NS = _mm_add_ps(_NS,_nm);
1126 _nm = _mm_sub_ps(_mk,_nm);
1127 _EH = _mm_add_ps(_EH,_mm_mul_ps(_nm,*_et));
1129 _MK++; _gn++; _et++; _ec++; _rn++;
// NOTE(review): scalar reductions defining ns,nc,es,eh,rc,sc,ec,gn elided
// here (original lines 1130-1138). Returned lanes (high→low):
// ns, nc, es, eh, rc/(sc+0.01), sc-ec, ec, gn.
1139 return _mm256_set_ps(ns,nc,es,eh,rc/(sc+0.01),sc-ec,ec,gn);
static float _avx_dpf_ps(double **Fp, double **Fx, int l, std::vector< float * > &pAPN, std::vector< float * > &pAVX, int I)
wavearray< double > a(hp.size())
cout<< endl;cout<< "ts size = "<< ts.size()<< " ts rate = "<< ts.rate()<< endl;tf.Forward(ts, wdm);int levels=tf.getLevel();cout<< "tf size = "<< tf.size()<< endl;double dF=tf.resolution();double dT=1./(2 *dF);cout<< "rate(hz) : "<< RATE<< "\t layers : "<< nLAYERS<< "\t dF(hz) : "<< dF<< "\t dT(ms) : "<< dT *1000.<< endl;int itime=TIME_PIXEL_INDEX;int ifreq=FREQ_PIXEL_INDEX;int index=(levels+1)*itime+ifreq;double time=itime *dT;double freq=(ifreq >0)?ifreq *dF:dF/4;cout<< endl;cout<< "PIXEL TIME = "<< time<< " sec "<< endl;cout<< "PIXEL FREQ = "<< freq<< " Hz "<< endl;cout<< endl;wavearray< double > x
cout<< "Selected Pixels : "<< nPix<< endl;wc.cluster(1, 1);SSeries< double > ss
static float _avx_loadata_ps(float **p, float **q, float **u, float **v, float En, std::vector< float * > &pAVX, int I)
static __m256 _avx_noise_ps(float **p, float **q, std::vector< float * > &pAVX, int I)
static __m256 _avx_stat_ps(float **x, float **X, float **s, float **S, std::vector< float * > &pAVX, int I)
static void _avx_free_ps(std::vector< float * > &v)
static float _avx_setAMP_ps(float **p, float **q, std::vector< float * > &pAVX, int I)
static void _avx_loadNULL_ps(float **n, float **N, float **d, float **D, float **h, float **H, int I)
static float _avx_ort_ps(float **p, float **q, std::vector< float * > &pAVX, int I)
static float _avx_GW_ps(float **p, float **q, std::vector< float * > &pAPN, float *rr, std::vector< float * > &pAVX, int II)
static void _avx_cpf_ps(float **p, float **q, float **u, float **v, int I)
static float _wat_hsum(__m128 y)
Meyer< double > S(1024, 2)
static float _avx_packet_ps(float **p, float **q, std::vector< float * > &pAVX, int I)