21 _NET(_mm_storeu_ps(x,_p[0]);
printf(
"%e %e %e %e ",x[0],x[1],x[2],x[3]);,
22 _mm_storeu_ps(x,_p[1]);
printf(
"%e %e %e %e ",x[0],x[1],x[2],x[3]);)
27 _NET(_p[0] = _mm_setzero_ps();,
28 _p[1] = _mm_setzero_ps();)
33 _NET(_p[0] = _mm_load1_ps(&a);,
34 _p[1] = _mm_load1_ps(&a);)
39 __m128 _b = _mm_load1_ps(&b);
40 _NET(_a[0] = _mm_mul_ps(_a[0],_b);,
41 _a[1] = _mm_mul_ps(_a[1],_b);)
45 _NET(_a[0] = _mm_mul_ps(_a[0],_b[0]);,
46 _a[1] = _mm_mul_ps(_a[1],_b[1]);)
49 static inline float _sse_mul_ps(__m128* _a, __m128* _b, __m128* _o) {
52 _NET(_o[0] = _mm_mul_ps(_a[0],_b[0]);_mm_storeu_ps(x,_o[0]); out = XSUM(x);,
53 _o[1] = _mm_mul_ps(_a[1],_b[1]);_mm_storeu_ps(x,_o[1]); out+= YSUM(x);)
58 float c[4]; _mm_storeu_ps(c,_c);
60 _NET(_c=_mm_load1_ps( c );*_a=_mm_mul_ps(*_a,_c);_a++;, *_a=_mm_mul_ps(*_a,_c);_a++;)
61 _NET(_c=_mm_load1_ps(c+1);*_a=_mm_mul_ps(*_a,_c);_a++;, *_a=_mm_mul_ps(*_a,_c);_a++;)
62 _NET(_c=_mm_load1_ps(c+2);*_a=_mm_mul_ps(*_a,_c);_a++;, *_a=_mm_mul_ps(*_a,_c);_a++;)
63 _NET(_c=_mm_load1_ps(c+3);*_a=_mm_mul_ps(*_a,_c);_a++;, *_a=_mm_mul_ps(*_a,_c);_a++;)
66 static inline void _sse_hard4_ps(__m128* _uu, __m128* _am, __m128* _AM, __m128 _c) {
71 float c[4]; _mm_storeu_ps(c,_c);
77 _r = _mm_set1_ps(c[0]); _R = _mm_set1_ps(1-c[0]);
78 *_a = _mm_add_ps(_mm_mul_ps(*_a,_r),_mm_mul_ps(_mm_mul_ps(*_u++,*_a),_R));_a++;*_A = _mm_mul_ps(*_A,_r);_A++;,
79 *_a = _mm_add_ps(_mm_mul_ps(*_a,_r),_mm_mul_ps(_mm_mul_ps(*_u++,*_a),_R));_a++;*_A = _mm_mul_ps(*_A,_r);_A++;
82 _r = _mm_set1_ps(c[1]); _R = _mm_set1_ps(1-c[1]);
83 *_a = _mm_add_ps(_mm_mul_ps(*_a,_r),_mm_mul_ps(_mm_mul_ps(*_u++,*_a),_R));_a++;*_A = _mm_mul_ps(*_A,_r);_A++;,
84 *_a = _mm_add_ps(_mm_mul_ps(*_a,_r),_mm_mul_ps(_mm_mul_ps(*_u++,*_a),_R));_a++;*_A = _mm_mul_ps(*_A,_r);_A++;
87 _r = _mm_set1_ps(c[2]); _R = _mm_set1_ps(1-c[2]);
88 *_a = _mm_add_ps(_mm_mul_ps(*_a,_r),_mm_mul_ps(_mm_mul_ps(*_u++,*_a),_R));_a++;*_A = _mm_mul_ps(*_A,_r);_A++;,
89 *_a = _mm_add_ps(_mm_mul_ps(*_a,_r),_mm_mul_ps(_mm_mul_ps(*_u++,*_a),_R));_a++;*_A = _mm_mul_ps(*_A,_r);_A++;
92 _r = _mm_set1_ps(c[3]); _R = _mm_set1_ps(1-c[3]);
93 *_a = _mm_add_ps(_mm_mul_ps(*_a,_r),_mm_mul_ps(_mm_mul_ps(*_u++,*_a),_R));_a++;*_A = _mm_mul_ps(*_A,_r);_A++;,
94 *_a = _mm_add_ps(_mm_mul_ps(*_a,_r),_mm_mul_ps(_mm_mul_ps(*_u++,*_a),_R));_a++;*_A = _mm_mul_ps(*_A,_r);_A++;
100 float c[4]; _mm_storeu_ps(c,_c);
104 _NET(_1 = _mm_set1_ps(c[0]); _0 = _mm_set1_ps(1-c[0]);
105 *_a = _mm_add_ps(_mm_mul_ps(*_a,_1),_mm_mul_ps(*_b++,_0));_a++;,
106 *_a = _mm_add_ps(_mm_mul_ps(*_a,_1),_mm_mul_ps(*_b++,_0));_a++;)
107 _NET(_1 = _mm_set1_ps(c[1]); _0 = _mm_set1_ps(1-c[1]);
108 *_a = _mm_add_ps(_mm_mul_ps(*_a,_1),_mm_mul_ps(*_b++,_0));_a++;,
109 *_a = _mm_add_ps(_mm_mul_ps(*_a,_1),_mm_mul_ps(*_b++,_0));_a++;)
110 _NET(_1 = _mm_set1_ps(c[2]); _0 = _mm_set1_ps(1-c[2]);
111 *_a = _mm_add_ps(_mm_mul_ps(*_a,_1),_mm_mul_ps(*_b++,_0));_a++;,
112 *_a = _mm_add_ps(_mm_mul_ps(*_a,_1),_mm_mul_ps(*_b++,_0));_a++;)
113 _NET(_1 = _mm_set1_ps(c[3]); _0 = _mm_set1_ps(1-c[3]);
114 *_a = _mm_add_ps(_mm_mul_ps(*_a,_1),_mm_mul_ps(*_b++,_0));_a++;,
115 *_a = _mm_add_ps(_mm_mul_ps(*_a,_1),_mm_mul_ps(*_b++,_0));_a++;)
123 _mm_storeu_ps(x,_mm_mul_ps(_a[0],_a[0])); out = XSUM(x);,
124 _mm_storeu_ps(x,_mm_mul_ps(_a[1],_a[1])); out+= YSUM(x);
133 _mm_storeu_ps(x,_mm_add_ps(_mm_mul_ps(_a[0],_a[0]),_mm_mul_ps(_A[0],_A[0]))); out = XSUM(x);,
134 _mm_storeu_ps(x,_mm_add_ps(_mm_mul_ps(_a[1],_a[1]),_mm_mul_ps(_A[1],_A[1]))); out+= YSUM(x);
145 _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[0] = XSUM(x);,
146 _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[0]+= YSUM(x);
149 _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[1] = XSUM(x);,
150 _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[1]+= YSUM(x);
153 _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[2] = XSUM(x);,
154 _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[2]+= YSUM(x);
157 _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[3] = XSUM(x);,
158 _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[3]+= YSUM(x);
160 return _mm_load_ps(o);
167 _mm_set1_ps(1.
e-24))));
177 _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[0] = XSUM(x)+1.e-24;,
178 _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[0]+= YSUM(x);
181 _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[1] = XSUM(x)+1.e-24;,
182 _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[1]+= YSUM(x);
185 _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[2] = XSUM(x)+1.e-24;,
186 _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[2]+= YSUM(x);
189 _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[3] = XSUM(x)+1.e-24;,
190 _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[3]+= YSUM(x);
192 return _mm_div_ps(_mm_set1_ps(1.),_mm_sqrt_ps(_mm_load_ps(o)));
199 _mm_storeu_ps(x,_mm_mul_ps(_a[0],_b[0])); out = XSUM(x);,
200 _mm_storeu_ps(x,_mm_mul_ps(_a[1],_b[1])); out+= YSUM(x);
208 __m128* _o = (__m128*) o;
212 _mm_storeu_ps(x,_mm_mul_ps(*_a++,*_b++));o[0] = XSUM(x);,
213 _mm_storeu_ps(x,_mm_mul_ps(*_a++,*_b++));o[0]+= YSUM(x);
216 _mm_storeu_ps(x,_mm_mul_ps(*_a++,*_b++));o[1] = XSUM(x);,
217 _mm_storeu_ps(x,_mm_mul_ps(*_a++,*_b++));o[1]+= YSUM(x);
220 _mm_storeu_ps(x,_mm_mul_ps(*_a++,*_b++));o[2] = XSUM(x);,
221 _mm_storeu_ps(x,_mm_mul_ps(*_a++,*_b++));o[2]+= YSUM(x);
224 _mm_storeu_ps(x,_mm_mul_ps(*_a++,*_b++));o[3] = XSUM(x);,
225 _mm_storeu_ps(x,_mm_mul_ps(*_a++,*_b++));o[3]+= YSUM(x);
232 _NET(_a[0] = _mm_add_ps(_a[0],_b[0]);,
233 _a[1] = _mm_add_ps(_a[1],_b[1]);)
237 static inline void _sse_add_ps(__m128* _a, __m128* _b, __m128 _c) {
239 _NET(_a[0] = _mm_add_ps(_a[0],_mm_mul_ps(_b[0],_c));,
240 _a[1] = _mm_add_ps(_a[1],_mm_mul_ps(_b[1],_c));)
249 float c[4]; _mm_storeu_ps(c,_c);
252 _NET(*_p = _mm_add_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps( c ))); _p++;,
253 *_p = _mm_add_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps( c ))); _p++;)
254 _NET(*_p = _mm_add_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+1))); _p++;,
255 *_p = _mm_add_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+1))); _p++;)
256 _NET(*_p = _mm_add_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+2))); _p++;,
257 *_p = _mm_add_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+2))); _p++;)
258 _NET(*_p = _mm_add_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+3))); _p++;,
259 *_p = _mm_add_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+3))); _p++;)
265 _NET(_a[0] = _mm_sub_ps(_a[0],_b[0]);,
266 _a[1] = _mm_sub_ps(_a[1],_b[1]);)
275 float c[4]; _mm_storeu_ps(c,_c);
278 _NET(*_p = _mm_sub_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps( c ))); _p++;,
279 *_p = _mm_sub_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps( c ))); _p++;)
280 _NET(*_p = _mm_sub_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+1))); _p++;,
281 *_p = _mm_sub_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+1))); _p++;)
282 _NET(*_p = _mm_sub_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+2))); _p++;,
283 *_p = _mm_sub_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+2))); _p++;)
284 _NET(*_p = _mm_sub_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+3))); _p++;,
285 *_p = _mm_sub_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+3))); _p++;)
290 _NET(_mm_storeu_ps(a,*_p);,
291 _mm_storeu_ps(a+4,*(_p+1));)
296 _NET(*_a = *_p;, *(_a+1) = *(_p+1);)
302 _NET(*_a++ = *_p++;, *_a++ = *_p++;)
303 _NET(*_a++ = *_p++;, *_a++ = *_p++;)
304 _NET(*_a++ = *_p++;, *_a++ = *_p++;)
305 _NET(*_a++ = *_p++;, *_a++ = *_p++;)
310 __m128 _b = _mm_load1_ps(&b);
311 _NET(_mm_storeu_ps(a,_mm_mul_ps(*_p,_b));,
312 _mm_storeu_ps(a+4,_mm_mul_ps(*(_p+1),_b));)
321 _NET(_mm_storeu_ps(a,*_p++); a+=4;,
322 _mm_storeu_ps(a,*_p++); a+=4;)
323 _NET(_mm_storeu_ps(a,*_p++); a+=4;,
324 _mm_storeu_ps(a,*_p++); a+=4;)
325 _NET(_mm_storeu_ps(a,*_p++); a+=4;,
326 _mm_storeu_ps(a,*_p++); a+=4;)
327 _NET(_mm_storeu_ps(a,*_p++); a+=4;,
328 _mm_storeu_ps(a,*_p++); a+=4;)
339 float c[4]; _mm_storeu_ps(c,_c);
343 _NET(*_a++ = _mm_mul_ps(*_p++,_mm_load1_ps( c ));,
344 *_a++ = _mm_mul_ps(*_p++,_mm_load1_ps( c ));)
345 _NET(*_a++ = _mm_mul_ps(*_p++,_mm_load1_ps(c+1));,
346 *_a++ = _mm_mul_ps(*_p++,_mm_load1_ps(c+1));)
347 _NET(*_a++ = _mm_mul_ps(*_p++,_mm_load1_ps(c+2));,
348 *_a++ = _mm_mul_ps(*_p++,_mm_load1_ps(c+2));)
349 _NET(*_a++ = _mm_mul_ps(*_p++,_mm_load1_ps(c+3));,
350 *_a++ = _mm_mul_ps(*_p++,_mm_load1_ps(c+3));)
354 static inline float _sse_nrg_ps(__m128* _u,
float c, __m128* _v,
float s, __m128* _a) {
359 __m128 _c = _mm_load1_ps(&c);
360 __m128 _s = _mm_load1_ps(&s);
362 _NET(_b = _mm_sub_ps(_a[0], _mm_add_ps(_mm_mul_ps(*_u,_c), _mm_mul_ps(*_v,_s)));
363 _mm_storeu_ps(x,_mm_mul_ps(_b,_b)); out = XSUM(x);,
364 _b = _mm_sub_ps(_a[1], _mm_add_ps(_mm_mul_ps(*(_u+1),_c), _mm_mul_ps(*(_v+1), _s)));
365 _mm_storeu_ps(x,_mm_mul_ps(_b,_b)); out+= YSUM(x);)
370 static inline void _sse_rotadd_ps(__m128* _u,
float c, __m128* _v,
float s, __m128* _a) {
372 __m128 _c = _mm_load1_ps(&c);
373 __m128 _s = _mm_load1_ps(&s);
375 _a[0] = _mm_add_ps(_a[0], _mm_add_ps(_mm_mul_ps(_u[0],_c), _mm_mul_ps(_v[0],_s)));,
376 _a[1] = _mm_add_ps(_a[1], _mm_add_ps(_mm_mul_ps(_u[1],_c), _mm_mul_ps(_v[1],_s)));
381 static inline float _sse_rotsub_ps(__m128* _u,
float c, __m128* _v,
float s, __m128* _a) {
385 __m128 _c = _mm_load1_ps(&c);
386 __m128 _s = _mm_load1_ps(&s);
389 _a[0] = _mm_sub_ps(_a[0], _mm_add_ps(_mm_mul_ps(_u[0],_c), _mm_mul_ps(_v[0],_s)));
390 _mm_storeu_ps(x,_mm_mul_ps(_a[0],_a[0])); out = XSUM(x);,
391 _a[1] = _mm_sub_ps(_a[1], _mm_add_ps(_mm_mul_ps(_u[1],_c), _mm_mul_ps(_v[1], _s)));
392 _mm_storeu_ps(x,_mm_mul_ps(_a[1],_a[1])); out+= YSUM(x);
398 static inline void _sse_rotp_ps(__m128* u,
float* c, __m128*
v,
float*
s, __m128* a) {
401 a[0] = _mm_add_ps(_mm_mul_ps(u[0],_mm_load1_ps(c)), _mm_mul_ps(v[0],_mm_load1_ps(s)));,
402 a[1] = _mm_add_ps(_mm_mul_ps(u[1],_mm_load1_ps(c)), _mm_mul_ps(v[1],_mm_load1_ps(s)));
406 static inline void _sse_rotm_ps(__m128* u,
float* c, __m128* v,
float* s, __m128* a) {
409 a[0] = _mm_sub_ps(_mm_mul_ps(u[0],_mm_load1_ps(c)), _mm_mul_ps(v[0],_mm_load1_ps(s)));,
410 a[1] = _mm_sub_ps(_mm_mul_ps(u[1],_mm_load1_ps(c)), _mm_mul_ps(v[1],_mm_load1_ps(s)));
// Packed "rotation plus": element-wise u*c + v*s on four floats.
// Companion of _sse_rotm_ps (the minus variant below).
static inline __m128
_sse_rotp_ps(__m128 _u, __m128 _c, __m128 _v, __m128 _s) {
   __m128 uc = _mm_mul_ps(_u, _c);   // u*c
   __m128 vs = _mm_mul_ps(_v, _s);   // v*s
   return _mm_add_ps(uc, vs);        // u*c + v*s
}
// Packed "rotation minus": element-wise u*c - v*s on four floats.
// Companion of _sse_rotp_ps (the plus variant above).
static inline __m128
_sse_rotm_ps(__m128 _u, __m128 _c, __m128 _v, __m128 _s) {
   __m128 uc = _mm_mul_ps(_u, _c);   // u*c
   __m128 vs = _mm_mul_ps(_v, _s);   // v*s
   return _mm_sub_ps(uc, vs);        // u*c - v*s
}
424 static inline void _sse_rot4p_ps(__m128* _u, __m128* _c, __m128* _v, __m128* _s, __m128* _a) {
431 _mm_storeu_ps(c,*_c);
432 _mm_storeu_ps(s,*_s);
437 *a++ = _mm_add_ps(_mm_mul_ps(*u++,_mm_load1_ps( c )), _mm_mul_ps(*v++,_mm_load1_ps( s )));,
438 *a++ = _mm_add_ps(_mm_mul_ps(*u++,_mm_load1_ps( c )), _mm_mul_ps(*v++,_mm_load1_ps( s )));
441 *a++ = _mm_add_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+1)), _mm_mul_ps(*v++,_mm_load1_ps(s+1)));,
442 *a++ = _mm_add_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+1)), _mm_mul_ps(*v++,_mm_load1_ps(s+1)));
445 *a++ = _mm_add_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+2)), _mm_mul_ps(*v++,_mm_load1_ps(s+2)));,
446 *a++ = _mm_add_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+2)), _mm_mul_ps(*v++,_mm_load1_ps(s+2)));
449 *a++ = _mm_add_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+3)), _mm_mul_ps(*v++,_mm_load1_ps(s+3)));,
450 *a++ = _mm_add_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+3)), _mm_mul_ps(*v++,_mm_load1_ps(s+3)));
454 static inline void _sse_rot4m_ps(__m128* _u, __m128* _c, __m128* _v, __m128* _s, __m128* _a) {
461 _mm_storeu_ps(c,*_c);
462 _mm_storeu_ps(s,*_s);
467 *a++ = _mm_sub_ps(_mm_mul_ps(*u++,_mm_load1_ps( c )), _mm_mul_ps(*v++,_mm_load1_ps( s )));,
468 *a++ = _mm_sub_ps(_mm_mul_ps(*u++,_mm_load1_ps( c )), _mm_mul_ps(*v++,_mm_load1_ps( s )));
471 *a++ = _mm_sub_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+1)), _mm_mul_ps(*v++,_mm_load1_ps(s+1)));,
472 *a++ = _mm_sub_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+1)), _mm_mul_ps(*v++,_mm_load1_ps(s+1)));
475 *a++ = _mm_sub_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+2)), _mm_mul_ps(*v++,_mm_load1_ps(s+2)));,
476 *a++ = _mm_sub_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+2)), _mm_mul_ps(*v++,_mm_load1_ps(s+2)));
479 *a++ = _mm_sub_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+3)), _mm_mul_ps(*v++,_mm_load1_ps(s+3)));,
480 *a++ = _mm_sub_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+3)), _mm_mul_ps(*v++,_mm_load1_ps(s+3)));
486 NETX(_p[0] = (__m128*) (p[0] + m[0][l]*n);,
487 _p[1] = (__m128*) (p[1] + m[1][l]*n);,
488 _p[2] = (__m128*) (p[2] + m[2][l]*n);,
489 _p[3] = (__m128*) (p[3] + m[3][l]*n);,
490 _p[4] = (__m128*) (p[4] + m[4][l]*n);,
491 _p[5] = (__m128*) (p[5] + m[5][l]*n);,
492 _p[6] = (__m128*) (p[6] + m[6][l]*n);,
493 _p[7] = (__m128*) (p[7] + m[7][l]*n);)
498 __m128 _q = _mm_setzero_ps();
499 NETX(_q = _mm_add_ps(_q, *_p[0]);,
500 _q = _mm_add_ps(_q, *_p[1]);,
501 _q = _mm_add_ps(_q, *_p[2]);,
502 _q = _mm_add_ps(_q, *_p[3]);,
503 _q = _mm_add_ps(_q, *_p[4]);,
504 _q = _mm_add_ps(_q, *_p[5]);,
505 _q = _mm_add_ps(_q, *_p[6]);,
506 _q = _mm_add_ps(_q, *_p[7]);)
510 static inline __m128
_sse_cut_ps(__m128* _pE, __m128** _pe, __m128 _Es, __m128 _cmp) {
511 NETX(_cmp = _mm_and_ps(_cmp,_mm_cmpge_ps(_mm_sub_ps(*_pE, *_pe[0]++),_Es));,
512 _cmp = _mm_and_ps(_cmp,_mm_cmpge_ps(_mm_sub_ps(*_pE, *_pe[1]++),_Es));,
513 _cmp = _mm_and_ps(_cmp,_mm_cmpge_ps(_mm_sub_ps(*_pE, *_pe[2]++),_Es));,
514 _cmp = _mm_and_ps(_cmp,_mm_cmpge_ps(_mm_sub_ps(*_pE, *_pe[3]++),_Es));,
515 _cmp = _mm_and_ps(_cmp,_mm_cmpge_ps(_mm_sub_ps(*_pE, *_pe[4]++),_Es));,
516 _cmp = _mm_and_ps(_cmp,_mm_cmpge_ps(_mm_sub_ps(*_pE, *_pe[5]++),_Es));,
517 _cmp = _mm_and_ps(_cmp,_mm_cmpge_ps(_mm_sub_ps(*_pE, *_pe[6]++),_Es));,
518 _cmp = _mm_and_ps(_cmp,_mm_cmpge_ps(_mm_sub_ps(*_pE, *_pe[7]++),_Es));)
525 NETX(*_es = _mm_min_ps(*_es,_mm_sub_ps(*_pE, *_pe[0]++));,
526 *_es = _mm_min_ps(*_es,_mm_sub_ps(*_pE, *_pe[1]++));,
527 *_es = _mm_min_ps(*_es,_mm_sub_ps(*_pE, *_pe[2]++));,
528 *_es = _mm_min_ps(*_es,_mm_sub_ps(*_pE, *_pe[3]++));,
529 *_es = _mm_min_ps(*_es,_mm_sub_ps(*_pE, *_pe[4]++));,
530 *_es = _mm_min_ps(*_es,_mm_sub_ps(*_pE, *_pe[5]++));,
531 *_es = _mm_min_ps(*_es,_mm_sub_ps(*_pE, *_pe[6]++));,
532 *_es = _mm_min_ps(*_es,_mm_sub_ps(*_pE, *_pe[7]++));
542 __m128 _o2 = _mm_setzero_ps();
544 _o1 = _mm_add_ps(_mm_mul_ps(_a[0],_a[0]),_mm_mul_ps(_A[0],_A[0]));,
545 _o2 = _mm_add_ps(_mm_mul_ps(_a[1],_a[1]),_mm_mul_ps(_A[1],_A[1]));
547 _o1 = _mm_max_ps(_o1,_o2); _mm_storeu_ps(x,_o1); out=x[0];
548 if(out<x[1]) out=x[1];
549 if(out<x[2]) out=x[2];
550 if(out<x[3]) out=x[3];
554 static inline void _sse_ort4_ps(__m128* _u, __m128* _v, __m128* _s, __m128* _c) {
558 static const __m128
sm = _mm_set1_ps(-0.
f);
559 static const __m128 _o = _mm_set1_ps(1.
e-24);
560 static const __m128 _0 = _mm_set1_ps(0.);
561 static const __m128 _1 = _mm_set1_ps(1.);
562 static const __m128 _2 = _mm_set1_ps(2.);
563 __m128 _n,_m,gI,gR,_p,_q;
566 _p = _mm_and_ps(_mm_cmpge_ps(gR,_0),_1);
567 _q = _mm_sub_ps(_1,_p);
568 _n = _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(gI,gI),_mm_mul_ps(gR,gR)));
569 gR = _mm_add_ps(_mm_andnot_ps(sm,gR),_mm_add_ps(_n,_o));
570 _n = _mm_add_ps(_mm_mul_ps(_2,_n),_o);
571 gI = _mm_div_ps(gI,_n);
572 _n = _mm_sqrt_ps(_mm_div_ps(gR,_n));
573 _m = _mm_and_ps(_mm_cmpge_ps(gI,_0),_1);
574 _m = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_m,_2),_1),_n);
575 *_s = _mm_add_ps(_mm_mul_ps(_q,_m),_mm_mul_ps(_p,_mm_div_ps(gI,_n)));
576 gI = _mm_andnot_ps(sm,gI);
577 *_c = _mm_add_ps(_mm_mul_ps(_p,_n),_mm_mul_ps(_q,_mm_div_ps(gI,_n)));
592 static const __m128
sm = _mm_set1_ps(-0.
f);
593 static const __m128 _0 = _mm_set1_ps(0.);
594 static const __m128 _5 = _mm_set1_ps(0.5);
595 static const __m128 _1 = _mm_set1_ps(1.);
596 static const __m128 _2 = _mm_set1_ps(2.);
597 __m128 _n,_m,_C,_S,_p,_q;
598 _r = _mm_mul_ps(_mm_add_ps(_1,_r),_mm_set1_ps(1.
e-6));
599 _m = _mm_and_ps(_mm_cmpge_ps(*_s,_0),_1);
600 _m = _mm_sub_ps(_mm_mul_ps(_m,_2),_1);
601 _p = _mm_and_ps(_mm_cmpge_ps(*_c,_0),_1);
602 _q = _mm_sub_ps(_1,_p);
603 _C = _mm_add_ps(_mm_andnot_ps(sm,*_c),_r);
604 _n = _mm_add_ps(_mm_mul_ps(*_s,*_s),_mm_mul_ps(_C,_C));
605 _n = _mm_div_ps(_1,_mm_mul_ps(_mm_sqrt_ps(_n),_2));
606 _C = _mm_sqrt_ps(_mm_add_ps(_5,_mm_mul_ps(_C,_n)));
607 _S = _mm_div_ps(_mm_mul_ps(*_s,_n),_C);
608 *_s = _mm_add_ps(_mm_mul_ps(_p,_S),_mm_mul_ps(_q,_mm_mul_ps(_C,_m)));
609 *_c = _mm_add_ps(_mm_mul_ps(_p,_C),_mm_mul_ps(_q,_mm_mul_ps(_S,_m)));
613 static inline void _sse_dpf4_ps(__m128* _Fp, __m128* _Fx, __m128* _fp, __m128* _fx) {
623 static inline void _sse_pnp4_ps(__m128* _fp, __m128* _fx, __m128* _am, __m128* _AM, __m128* _u, __m128* _v) {
630 static const __m128 _o = _mm_set1_ps(1.
e-24);
631 static const __m128 _1 = _mm_set1_ps(1.0);
632 __m128 gp = _mm_div_ps(_1,_mm_add_ps(
_sse_dot4_ps(_fp,_fp),_o));
644 static inline void _sse_dsp4_ps(__m128* u, __m128* v, __m128* _am, __m128* _AM, __m128* _u, __m128* _v) {
669 _c = _mm_mul_ps(*_a,*_a);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));_a++;o[0] = XSUM(x);,
670 _c = _mm_mul_ps(*_a,*_a);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));_a++;o[0]+= YSUM(x);
673 _c = _mm_mul_ps(*_a,*_a);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));_a++;o[1] = XSUM(x);,
674 _c = _mm_mul_ps(*_a,*_a);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));_a++;o[1]+= YSUM(x);
677 _c = _mm_mul_ps(*_a,*_a);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));_a++;o[2] = XSUM(x);,
678 _c = _mm_mul_ps(*_a,*_a);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));_a++;o[2]+= YSUM(x);
681 _c = _mm_mul_ps(*_a,*_a);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));_a++;o[3] = XSUM(x);,
682 _c = _mm_mul_ps(*_a,*_a);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));_a++;o[3]+= YSUM(x);
684 return _mm_div_ps(_mm_load_ps(o),_mm_add_ps(_L,_mm_set1_ps(1.
e-12)));
698 _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[0] = XSUM(x);,
699 _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[0]+= YSUM(x);
702 _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[1] = XSUM(x);,
703 _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[1]+= YSUM(x);
706 _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[2] = XSUM(x);,
707 _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[2]+= YSUM(x);
710 _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[3] = XSUM(x);,
711 _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[3]+= YSUM(x);
713 return _mm_div_ps(_mm_load_ps(o),_mm_add_ps(_L,_mm_set1_ps(1.
e-12)));
727 _c = _mm_mul_ps(*_b,*_b);_mm_storeu_ps(x,_mm_mul_ps(_c,_mm_mul_ps(*_a++,*_b++)));o[0] = XSUM(x);,
728 _c = _mm_mul_ps(*_b,*_b);_mm_storeu_ps(x,_mm_mul_ps(_c,_mm_mul_ps(*_a++,*_b++)));o[0]+= YSUM(x);
731 _c = _mm_mul_ps(*_b,*_b);_mm_storeu_ps(x,_mm_mul_ps(_c,_mm_mul_ps(*_a++,*_b++)));o[1] = XSUM(x);,
732 _c = _mm_mul_ps(*_b,*_b);_mm_storeu_ps(x,_mm_mul_ps(_c,_mm_mul_ps(*_a++,*_b++)));o[1]+= YSUM(x);
735 _c = _mm_mul_ps(*_b,*_b);_mm_storeu_ps(x,_mm_mul_ps(_c,_mm_mul_ps(*_a++,*_b++)));o[2] = XSUM(x);,
736 _c = _mm_mul_ps(*_b,*_b);_mm_storeu_ps(x,_mm_mul_ps(_c,_mm_mul_ps(*_a++,*_b++)));o[2]+= YSUM(x);
739 _c = _mm_mul_ps(*_b,*_b);_mm_storeu_ps(x,_mm_mul_ps(_c,_mm_mul_ps(*_a++,*_b++)));o[3] = XSUM(x);,
740 _c = _mm_mul_ps(*_b,*_b);_mm_storeu_ps(x,_mm_mul_ps(_c,_mm_mul_ps(*_a++,*_b++)));o[3]+= YSUM(x);
742 return _mm_div_ps(_mm_load_ps(o),_mm_add_ps(_L,_mm_set1_ps(1.
e-12)));
749 __m128 _sm = _mm_set1_ps(-0.
f);
754 _pi = _mm_sub_ps(_pi,_qi);
755 _pe = _mm_sub_ps(_mm_sub_ps(_pe,_qe),_pi);
756 return _mm_add_ps(_mm_andnot_ps(_sm,_pi),_mm_andnot_ps(_sm,_pe));
773 _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[0] = XSUM(x);,
774 _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[0]+= YSUM(x);
777 _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[1] = XSUM(x);,
778 _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[1]+= YSUM(x);
781 _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[2] = XSUM(x);,
782 _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[2]+= YSUM(x);
785 _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[3] = XSUM(x);,
786 _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[3]+= YSUM(x);
788 return _mm_sub_ps(_L,_mm_div_ps(_mm_load_ps(o),_mm_add_ps(_L,_mm_set1_ps(1.
e-12))));
801 _c = _mm_mul_ps(*_a,*_a); _a++; _mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[0] = XSUM(x);,
802 _c = _mm_mul_ps(*_a,*_a); _a++; _mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[0]+= YSUM(x);
805 _c = _mm_mul_ps(*_a,*_a); _a++; _mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[1] = XSUM(x);,
806 _c = _mm_mul_ps(*_a,*_a); _a++; _mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[1]+= YSUM(x);
809 _c = _mm_mul_ps(*_a,*_a); _a++; _mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[2] = XSUM(x);,
810 _c = _mm_mul_ps(*_a,*_a); _a++; _mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[2]+= YSUM(x);
813 _c = _mm_mul_ps(*_a,*_a); _a++; _mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[3] = XSUM(x);,
814 _c = _mm_mul_ps(*_a,*_a); _a++; _mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[3]+= YSUM(x);
816 _c = _mm_add_ps(_mm_mul_ps(_L,_L),_mm_set1_ps(1.
e-16));
817 return _mm_div_ps(_mm_load_ps(o),_c);
832 _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[0] = XSUM(x);,
833 _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[0]+= YSUM(x);
836 _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[1] = XSUM(x);,
837 _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[1]+= YSUM(x);
840 _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[2] = XSUM(x);,
841 _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[2]+= YSUM(x);
844 _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[3] = XSUM(x);,
845 _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[3]+= YSUM(x);
849 _c = _mm_add_ps(_mm_set1_ps(1.
e-12),_mm_mul_ps(_c,_c));
850 _c = _mm_div_ps(_mm_load_ps(o),_c);
852 return _mm_mul_ps(
_sse_abs4_ps(_q),_mm_sub_ps(_mm_set1_ps(1.),_c));
855 static inline __m128
_sse_ed4_ps(__m128* _p, __m128* _q, __m128 _L) {
866 _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
867 _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[0] = XSUM(x);,
868 _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
869 _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[0]+= YSUM(x);
872 _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
873 _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[1] = XSUM(x);,
874 _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
875 _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[1]+= YSUM(x);
878 _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
879 _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[2] = XSUM(x);,
880 _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
881 _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[2]+= YSUM(x);
884 _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
885 _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[3] = XSUM(x);,
886 _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
887 _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[3]+= YSUM(x);
889 _aa = _mm_mul_ps(_mm_load_ps(o),_mm_set1_ps(0.5));
890 return _mm_div_ps(_aa,_mm_add_ps(_L,_mm_set1_ps(1.
e-12)));
904 _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
905 _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[0] = XSUM(x);,
906 _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
907 _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[0]+= YSUM(x);
910 _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
911 _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[1] = XSUM(x);,
912 _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
913 _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[1]+= YSUM(x);
916 _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
917 _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[2] = XSUM(x);,
918 _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
919 _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[2]+= YSUM(x);
922 _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
923 _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[3] = XSUM(x);,
924 _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
925 _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[3]+= YSUM(x);
928 _aa = _mm_add_ps(_mm_set1_ps(1.
e-12),_mm_mul_ps(_aa,_aa));
931 return _mm_mul_ps(_aa,_mm_mul_ps(_mm_load_ps(o),_mm_set1_ps(0.5)));
945 _aa=_mm_mul_ps(*_a,*_b); _bb=_mm_mul_ps(*_b,*_b);
946 _mm_storeu_ps(x,_mm_mul_ps(_bb,_mm_sub_ps(_aa,_bb)));
947 _a++; _b++; o[0] = XSUM(x);,
948 _aa=_mm_mul_ps(*_a,*_b); _bb=_mm_mul_ps(*_b,*_b);
949 _mm_storeu_ps(x,_mm_mul_ps(_bb,_mm_sub_ps(_aa,_bb)));
950 _a++; _b++; o[0]+= YSUM(x);
953 _aa=_mm_mul_ps(*_a,*_b); _bb=_mm_mul_ps(*_b,*_b);
954 _mm_storeu_ps(x,_mm_mul_ps(_bb,_mm_sub_ps(_aa,_bb)));
955 _a++; _b++; o[1] = XSUM(x);,
956 _aa=_mm_mul_ps(*_a,*_b); _bb=_mm_mul_ps(*_b,*_b);
957 _mm_storeu_ps(x,_mm_mul_ps(_bb,_mm_sub_ps(_aa,_bb)));
958 _a++; _b++; o[1]+= YSUM(x);
961 _aa=_mm_mul_ps(*_a,*_b); _bb=_mm_mul_ps(*_b,*_b);
962 _mm_storeu_ps(x,_mm_mul_ps(_bb,_mm_sub_ps(_aa,_bb)));
963 _a++; _b++; o[2] = XSUM(x);,
964 _aa=_mm_mul_ps(*_a,*_b); _bb=_mm_mul_ps(*_b,*_b);
965 _mm_storeu_ps(x,_mm_mul_ps(_bb,_mm_sub_ps(_aa,_bb)));
966 _a++; _b++; o[2]+= YSUM(x);
969 _aa=_mm_mul_ps(*_a,*_b); _bb=_mm_mul_ps(*_b,*_b);
970 _mm_storeu_ps(x,_mm_mul_ps(_bb,_mm_sub_ps(_aa,_bb)));
971 _a++; _b++; o[3] = XSUM(x);,
972 _aa=_mm_mul_ps(*_a,*_b); _bb=_mm_mul_ps(*_b,*_b);
973 _mm_storeu_ps(x,_mm_mul_ps(_bb,_mm_sub_ps(_aa,_bb)));
974 _a++; _b++; o[3]+= YSUM(x);
976 _aa = _mm_mul_ps(_mm_load_ps(o),_mm_set1_ps(2.));
977 return _mm_div_ps(_aa,_mm_add_ps(_L,_mm_set1_ps(1.
e-12)));
986 xx = _mm_add_ps(_mm_mul_ps(xx,xx),_mm_mul_ps(XX,XX));
987 return _mm_div_ps(xx,_mm_add_ps(
_sse_dot4_ps(_f,_f),_mm_set1_ps(1.
e-12)));
990 static inline __m128
_sse_like4_ps(__m128* fp, __m128* fx, __m128* am, __m128* AM, __m128 _D) {
999 __m128 gp = _mm_add_ps(
_sse_dot4_ps(fp,fp),_mm_set1_ps(1.
e-12));
1001 xp = _mm_add_ps(_mm_mul_ps(xp,xp),_mm_mul_ps(XP,XP));
1002 xx = _mm_add_ps(_mm_mul_ps(xx,xx),_mm_mul_ps(XX,XX));
1003 return _mm_add_ps(_mm_div_ps(xp,gp),_mm_div_ps(xx,gx));
1006 static inline __m128
_sse_like4_ps(__m128* fp, __m128* fx, __m128* am, __m128* AM) {
1014 __m128 gp = _mm_add_ps(
_sse_dot4_ps(fp,fp),_mm_set1_ps(1.
e-12));
1016 xp = _mm_add_ps(_mm_mul_ps(xp,xp),_mm_mul_ps(XP,XP));
1017 xx = _mm_add_ps(_mm_mul_ps(xx,xx),_mm_mul_ps(XX,XX));
1018 return _mm_add_ps(_mm_div_ps(xp,gp),_mm_div_ps(xx,gx));
1021 static inline __m128
_sse_like4w_ps(__m128* fp, __m128* fx, __m128* am, __m128* AM) {
1029 __m128 gp = _mm_add_ps(
_sse_dot4_ps(fp,fp),_mm_set1_ps(1.
e-9));
1030 xp = _mm_add_ps(_mm_mul_ps(xp,xp),_mm_mul_ps(XP,XP));
1031 xx = _mm_add_ps(_mm_mul_ps(xx,xx),_mm_mul_ps(XX,XX));
1032 return _mm_div_ps(_mm_add_ps(xp,xx),gp);
1040 static inline __m128
_sse_reg4x_ps(__m128 _L, __m128* fx, __m128* am, __m128* AM, __m128 _D) {
1047 static const __m128 _o = _mm_set1_ps(1.
e-12);
1051 xx = _mm_add_ps(_mm_mul_ps(xx,xx),_mm_mul_ps(XX,XX));
1052 xx = _mm_div_ps(_mm_mul_ps(xx,_D),_mm_mul_ps(_L,FF));
1053 return _mm_div_ps(_mm_sub_ps(FF,xx),_mm_add_ps(FF,_D));
1064 _ll = _mm_add_ps(_mm_add_ps(_ll,_LL),_mm_set1_ps(1.
e-12));
1065 return _mm_div_ps(_mm_add_ps(_ei,_EI),_ll);
1068 static inline void _sse_pol4_ps(__m128* _fp, __m128* _fx, __m128* _v,
double*
r,
double* a) {
1076 __m128 _oo = _mm_set1_ps(1.
e-12);
1077 float rpol[4],cpol[4],spol[4];
1080 _cc = _mm_add_ps(_mm_sqrt_ps(_cc),_oo);
1082 _mm_storeu_ps(cpol,_cc);
1085 _ss = _mm_add_ps(_mm_sqrt_ps(_ss),_oo);
1087 _mm_storeu_ps(spol,_ss);
1091 for(
int n=0;n<4;n++) {
1092 r[
n] = sqrt(rpol[n]);
1093 a[
n] = atan2(spol[n],cpol[n]);
static float _sse_abs_ps(__m128 *_a)
static float _sse_dot_ps(__m128 *_a, __m128 *_b)
printf("total live time: non-zero lags = %10.1f \n", liveTot)
static void _sse_hard4_ps(__m128 *_uu, __m128 *_am, __m128 *_AM, __m128 _c)
static __m128 _sse_rnorm4_ps(__m128 *_p)
static __m128 _sse_reg4x_ps(__m128 _L, __m128 *fx, __m128 *am, __m128 *AM, __m128 _D)
wavearray< double > a(hp.size())
static void _sse_add4_ps(__m128 *_a, __m128 *_b, __m128 _c)
static __m128 _sse_abs4_ps(__m128 *_p)
static void _sse_zero_ps(__m128 *_p)
static __m128 _sse_ed4_ps(__m128 *_p, __m128 *_q, __m128 _L)
static void _sse_dsp4_ps(__m128 *u, __m128 *v, __m128 *_am, __m128 *_AM, __m128 *_u, __m128 *_v)
static __m128 _sse_ei4xu_ps(__m128 *_x, __m128 *_u, __m128 _L)
static __m128 _sse_dot4_ps(__m128 *_p, __m128 *_q)
cout<< endl;cout<< "ts size = "<< ts.size()<< " ts rate = "<< ts.rate()<< endl;tf.Forward(ts, wdm);int levels=tf.getLevel();cout<< "tf size = "<< tf.size()<< endl;double dF=tf.resolution();double dT=1./(2 *dF);cout<< "rate(hz) : "<< RATE<< "\t layers : "<< nLAYERS<< "\t dF(hz) : "<< dF<< "\t dT(ms) : "<< dT *1000.<< endl;int itime=TIME_PIXEL_INDEX;int ifreq=FREQ_PIXEL_INDEX;int index=(levels+1)*itime+ifreq;double time=itime *dT;double freq=(ifreq >0)?ifreq *dF:dF/4;cout<< endl;cout<< "PIXEL TIME = "<< time<< " sec "<< endl;cout<< "PIXEL FREQ = "<< freq<< " Hz "<< endl;cout<< endl;wavearray< double > x
static void _sse_cpf4_ps(__m128 *_aa, __m128 *_pp)
static float _sse_rotsub_ps(__m128 *_u, float c, __m128 *_v, float s, __m128 *_a)
static __m128 _sse_nind4_ps(__m128 *_am, __m128 *_AM)
static void _sse_cpf_ps(float *a, __m128 *_p)
static void _sse_rotm_ps(__m128 *u, float *c, __m128 *v, float *s, __m128 *a)
static void _sse_rotadd_ps(__m128 *_u, float c, __m128 *_v, float s, __m128 *_a)
cout<< "Selected Pixels : "<< nPix<< endl;wc.cluster(1, 1);SSeries< double > ss
static float _sse_nrg_ps(__m128 *_u, float c, __m128 *_v, float s, __m128 *_a)
static __m128 _sse_ecoh4_ps(__m128 *_p, __m128 *_q, __m128 _L)
gwavearray< double > * gx
static void _sse_ifcp4_ps(__m128 *_aa, __m128 *_bb, __m128 _c)
static __m128 _sse_ind4_ps(__m128 *_p, __m128 _L)
static void _sse_pol4_ps(__m128 *_fp, __m128 *_fx, __m128 *_v, double *r, double *a)
static __m128 _sse_like4w_ps(__m128 *fp, __m128 *fx, __m128 *am, __m128 *AM)
static void _sse_rot4m_ps(__m128 *_u, __m128 *_c, __m128 *_v, __m128 *_s, __m128 *_a)
static void _sse_load_ps(__m128 *_p, float a)
static void _sse_add_ps(__m128 *_a, __m128 *_b)
static void _sse_mul4_ps(__m128 *_am, __m128 _c)
static __m128 _sse_ei4_ps(__m128 *_u, __m128 _L)
static void _sse_print_ps(__m128 *_p)
static __m128 _sse_ed4i_ps(__m128 *_p, __m128 *_q, __m128 _L)
static __m128 _sse_sum_ps(__m128 **_p)
static void _sse_point_ps(__m128 **_p, float **p, short **m, int l, int n)
static void _sse_mul_ps(__m128 *_a, float b)
static __m128 _sse_div4_ps(__m128 *_v, __m128 *_u)
static __m128 _sse_null4_ps(__m128 *_p, __m128 *_q)
static float _sse_maxE_ps(__m128 *_a, __m128 *_A)
static void _sse_minSNE_ps(__m128 *_pE, __m128 **_pe, __m128 *_es)
static __m128 _sse_like4_ps(__m128 *_f, __m128 *_a, __m128 *_A)
static void _sse_rot4p_ps(__m128 *_u, __m128 *_c, __m128 *_v, __m128 *_s, __m128 *_a)
static void _sse_rotp_ps(__m128 *u, float *c, __m128 *v, float *s, __m128 *a)
static void _sse_sub4_ps(__m128 *_a, __m128 *_b, __m128 _c)
static void _sse_dpf4_ps(__m128 *_Fp, __m128 *_Fx, __m128 *_fp, __m128 *_fx)
static __m128 _sse_cut_ps(__m128 *_pE, __m128 **_pe, __m128 _Es, __m128 _cmp)
static void _sse_pnp4_ps(__m128 *_fp, __m128 *_fx, __m128 *_am, __m128 *_AM, __m128 *_u, __m128 *_v)
static __m128 _sse_ei4xx_ps(__m128 *_x, __m128 *_u, __m128 _L)
static void _sse_ort4_ps(__m128 *_u, __m128 *_v, __m128 *_s, __m128 *_c)
static void _sse_sub_ps(__m128 *_a, __m128 *_b)