Coherent WaveBurst Reference Guide
watsse.hh
1 // Wavelet Analysis Tool
2 // S.Klimenko, University of Florida
3 // library of general functions
4 
5 #ifndef WATSSE_HH
6 #define WATSSE_HH
7 
8 #include <xmmintrin.h>
9 #include <pmmintrin.h>
10 #include <iostream>
11 #include <stdlib.h>
12 #include <math.h>
13 #include <TMath.h>
14 #include "wat.hh"
15 
16 // WAT SSE functions
17 
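// Packing convention used throughout this file (inferred from usage; the _NET
// and NETX macros and the XSUM/YSUM helpers are defined in wat.hh): a network
// vector holds one float per detector, packed four detectors per __m128.  The
// first argument of _NET() operates on the first __m128 (detectors 1-4); the
// second argument, acting on the second __m128 (detectors 5-8), is included
// only for larger networks.  Functions with a "4" in the name process 4
// consecutive pixels, i.e. arrays of 4 (or 8) __m128 values.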
18 
19 static inline void _sse_print_ps(__m128* _p) {
20  float x[4];
21  _NET(_mm_storeu_ps(x,_p[0]); printf("%e %e %e %e ",x[0],x[1],x[2],x[3]);,
22  _mm_storeu_ps(x,_p[1]); printf("%e %e %e %e ",x[0],x[1],x[2],x[3]);)
23  cout<<endl;
24 }
25 
26 static inline void _sse_zero_ps(__m128* _p) {
27  _NET(_p[0] = _mm_setzero_ps();,
28  _p[1] = _mm_setzero_ps();)
29  return;
30 }
31 
32 static inline void _sse_load_ps(__m128* _p, float a) {
33  _NET(_p[0] = _mm_load1_ps(&a);,
34  _p[1] = _mm_load1_ps(&a);)
35  return;
36 }
37 
38 static inline void _sse_mul_ps(__m128* _a, float b) {
39  __m128 _b = _mm_load1_ps(&b);
40  _NET(_a[0] = _mm_mul_ps(_a[0],_b);,
41  _a[1] = _mm_mul_ps(_a[1],_b);)
42 }
43 
44 static inline void _sse_mul_ps(__m128* _a, __m128* _b) {
45  _NET(_a[0] = _mm_mul_ps(_a[0],_b[0]);,
46  _a[1] = _mm_mul_ps(_a[1],_b[1]);)
47 }
48 
49 static inline float _sse_mul_ps(__m128* _a, __m128* _b, __m128* _o) {
50  float x[4];
51  float out;
52  _NET(_o[0] = _mm_mul_ps(_a[0],_b[0]);_mm_storeu_ps(x,_o[0]); out = XSUM(x);,
53  _o[1] = _mm_mul_ps(_a[1],_b[1]);_mm_storeu_ps(x,_o[1]); out+= YSUM(x);)
54  return out;
55 }
56 
57 static inline void _sse_mul4_ps(__m128* _am, __m128 _c) {
58  float c[4]; _mm_storeu_ps(c,_c);
59  __m128* _a = _am;
60  _NET(_c=_mm_load1_ps( c );*_a=_mm_mul_ps(*_a,_c);_a++;, *_a=_mm_mul_ps(*_a,_c);_a++;)
61  _NET(_c=_mm_load1_ps(c+1);*_a=_mm_mul_ps(*_a,_c);_a++;, *_a=_mm_mul_ps(*_a,_c);_a++;)
62  _NET(_c=_mm_load1_ps(c+2);*_a=_mm_mul_ps(*_a,_c);_a++;, *_a=_mm_mul_ps(*_a,_c);_a++;)
63  _NET(_c=_mm_load1_ps(c+3);*_a=_mm_mul_ps(*_a,_c);_a++;, *_a=_mm_mul_ps(*_a,_c);_a++;)
64 }
65 
66 static inline void _sse_hard4_ps(__m128* _uu, __m128* _am, __m128* _AM, __m128 _c) {
67 // calculate hard responses
68 // replace standard responses with hard responses if _c=0;
69 // uu - unity vector along f+
70 // am, AM - 00 and 90 phase responses.
71  float c[4]; _mm_storeu_ps(c,_c);
72  __m128* _u = _uu;
73  __m128* _a = _am;
74  __m128* _A = _AM;
75  __m128 _r, _R;
76  _NET(
77  _r = _mm_set1_ps(c[0]); _R = _mm_set1_ps(1-c[0]);
78  *_a = _mm_add_ps(_mm_mul_ps(*_a,_r),_mm_mul_ps(_mm_mul_ps(*_u++,*_a),_R));_a++;*_A = _mm_mul_ps(*_A,_r);_A++;,
79  *_a = _mm_add_ps(_mm_mul_ps(*_a,_r),_mm_mul_ps(_mm_mul_ps(*_u++,*_a),_R));_a++;*_A = _mm_mul_ps(*_A,_r);_A++;
80  )
81  _NET(
82  _r = _mm_set1_ps(c[1]); _R = _mm_set1_ps(1-c[1]);
83  *_a = _mm_add_ps(_mm_mul_ps(*_a,_r),_mm_mul_ps(_mm_mul_ps(*_u++,*_a),_R));_a++;*_A = _mm_mul_ps(*_A,_r);_A++;,
84  *_a = _mm_add_ps(_mm_mul_ps(*_a,_r),_mm_mul_ps(_mm_mul_ps(*_u++,*_a),_R));_a++;*_A = _mm_mul_ps(*_A,_r);_A++;
85  )
86  _NET(
87  _r = _mm_set1_ps(c[2]); _R = _mm_set1_ps(1-c[2]);
88  *_a = _mm_add_ps(_mm_mul_ps(*_a,_r),_mm_mul_ps(_mm_mul_ps(*_u++,*_a),_R));_a++;*_A = _mm_mul_ps(*_A,_r);_A++;,
89  *_a = _mm_add_ps(_mm_mul_ps(*_a,_r),_mm_mul_ps(_mm_mul_ps(*_u++,*_a),_R));_a++;*_A = _mm_mul_ps(*_A,_r);_A++;
90  )
91  _NET(
92  _r = _mm_set1_ps(c[3]); _R = _mm_set1_ps(1-c[3]);
93  *_a = _mm_add_ps(_mm_mul_ps(*_a,_r),_mm_mul_ps(_mm_mul_ps(*_u++,*_a),_R));_a++;*_A = _mm_mul_ps(*_A,_r);_A++;,
94  *_a = _mm_add_ps(_mm_mul_ps(*_a,_r),_mm_mul_ps(_mm_mul_ps(*_u++,*_a),_R));_a++;*_A = _mm_mul_ps(*_A,_r);_A++;
95  )
96 }
97 
98 static inline void _sse_ifcp4_ps(__m128* _aa, __m128* _bb, __m128 _c) {
99 // substitute vector _aa with _bb if _c=0;
100  float c[4]; _mm_storeu_ps(c,_c);
101  __m128* _a = _aa;
102  __m128* _b = _bb;
103  __m128 _1, _0;
104  _NET(_1 = _mm_set1_ps(c[0]); _0 = _mm_set1_ps(1-c[0]);
105  *_a = _mm_add_ps(_mm_mul_ps(*_a,_1),_mm_mul_ps(*_b++,_0));_a++;,
106  *_a = _mm_add_ps(_mm_mul_ps(*_a,_1),_mm_mul_ps(*_b++,_0));_a++;)
107  _NET(_1 = _mm_set1_ps(c[1]); _0 = _mm_set1_ps(1-c[1]);
108  *_a = _mm_add_ps(_mm_mul_ps(*_a,_1),_mm_mul_ps(*_b++,_0));_a++;,
109  *_a = _mm_add_ps(_mm_mul_ps(*_a,_1),_mm_mul_ps(*_b++,_0));_a++;)
110  _NET(_1 = _mm_set1_ps(c[2]); _0 = _mm_set1_ps(1-c[2]);
111  *_a = _mm_add_ps(_mm_mul_ps(*_a,_1),_mm_mul_ps(*_b++,_0));_a++;,
112  *_a = _mm_add_ps(_mm_mul_ps(*_a,_1),_mm_mul_ps(*_b++,_0));_a++;)
113  _NET(_1 = _mm_set1_ps(c[3]); _0 = _mm_set1_ps(1-c[3]);
114  *_a = _mm_add_ps(_mm_mul_ps(*_a,_1),_mm_mul_ps(*_b++,_0));_a++;,
115  *_a = _mm_add_ps(_mm_mul_ps(*_a,_1),_mm_mul_ps(*_b++,_0));_a++;)
116 }
117 
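// Note (inferred from the code above, not part of the original comments):
// _sse_hard4_ps and _sse_ifcp4_ps implement a branchless per-pixel select,
// a = a*c + b*(1-c), so the components of _c are expected to be 0/1 flags.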
118 
119 static inline float _sse_abs_ps(__m128* _a) {
120  float x[4];
121  float out;
122  _NET(
123  _mm_storeu_ps(x,_mm_mul_ps(_a[0],_a[0])); out = XSUM(x);,
124  _mm_storeu_ps(x,_mm_mul_ps(_a[1],_a[1])); out+= YSUM(x);
125  )
126  return out;
127 }
128 
129 static inline float _sse_abs_ps(__m128* _a, __m128* _A) {
130  float x[4];
131  float out;
132  _NET(
133  _mm_storeu_ps(x,_mm_add_ps(_mm_mul_ps(_a[0],_a[0]),_mm_mul_ps(_A[0],_A[0]))); out = XSUM(x);,
134  _mm_storeu_ps(x,_mm_add_ps(_mm_mul_ps(_a[1],_a[1]),_mm_mul_ps(_A[1],_A[1]))); out+= YSUM(x);
135  )
136  return out;
137 }
138 
139 static inline __m128 _sse_abs4_ps(__m128* _p) {
140 // return |p|^2
141  float x[4];
142  float o[4];
143  __m128* _a = _p;
144  _NET(
145  _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[0] = XSUM(x);,
146  _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[0]+= YSUM(x);
147  )
148  _NET(
149  _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[1] = XSUM(x);,
150  _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[1]+= YSUM(x);
151  )
152  _NET(
153  _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[2] = XSUM(x);,
154  _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[2]+= YSUM(x);
155  )
156  _NET(
157  _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[3] = XSUM(x);,
158  _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[3]+= YSUM(x);
159  )
160  return _mm_load_ps(o);
161 }
162 
163 static inline __m128 _sse_div4_ps(__m128* _v, __m128* _u) {
164 // returns |v|/|u|
165  return _mm_sqrt_ps(_mm_div_ps(_sse_abs4_ps(_v),
166  _mm_add_ps(_sse_abs4_ps(_u),
167  _mm_set1_ps(1.e-24))));
168 }
169 
170 
171 static inline __m128 _sse_rnorm4_ps(__m128* _p) {
172 // return reciprocal norm: 1/|p|
173  float x[4];
174  float o[4];
175  __m128* _a = _p;
176  _NET(
177  _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[0] = XSUM(x)+1.e-24;,
178  _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[0]+= YSUM(x);
179  )
180  _NET(
181  _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[1] = XSUM(x)+1.e-24;,
182  _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[1]+= YSUM(x);
183  )
184  _NET(
185  _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[2] = XSUM(x)+1.e-24;,
186  _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[2]+= YSUM(x);
187  )
188  _NET(
189  _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[3] = XSUM(x)+1.e-24;,
190  _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[3]+= YSUM(x);
191  )
192  return _mm_div_ps(_mm_set1_ps(1.),_mm_sqrt_ps(_mm_load_ps(o)));
193 }
194 
195 static inline float _sse_dot_ps(__m128* _a, __m128* _b) {
196  float x[4];
197  float out;
198  _NET(
199  _mm_storeu_ps(x,_mm_mul_ps(_a[0],_b[0])); out = XSUM(x);,
200  _mm_storeu_ps(x,_mm_mul_ps(_a[1],_b[1])); out+= YSUM(x);
201  )
202  return out;
203 }
204 
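// Usage sketch (illustrative addition, not part of the original header): pack one
// pixel's detector amplitudes from two hypothetical 8-float buffers into the
// two-__m128 layout used above and form their network scalar product.
static inline float _sse_dot_sketch_ps(float* a, float* b) {
   __m128 _a[2], _b[2];
   _a[0] = _mm_loadu_ps(a);   _a[1] = _mm_loadu_ps(a+4);    // detectors 1-4, 5-8
   _b[0] = _mm_loadu_ps(b);   _b[1] = _mm_loadu_ps(b+4);
   return _sse_dot_ps(_a,_b);                               // sum over detectors of a[k]*b[k]
}
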
205 static inline __m128 _sse_dot4_ps(__m128* _p, __m128* _q) {
206  float x[4];
207  float o[4];
208  __m128* _o = (__m128*) o;
209  __m128* _a = _p;
210  __m128* _b = _q;
211  _NET(
212  _mm_storeu_ps(x,_mm_mul_ps(*_a++,*_b++));o[0] = XSUM(x);,
213  _mm_storeu_ps(x,_mm_mul_ps(*_a++,*_b++));o[0]+= YSUM(x);
214  )
215  _NET(
216  _mm_storeu_ps(x,_mm_mul_ps(*_a++,*_b++));o[1] = XSUM(x);,
217  _mm_storeu_ps(x,_mm_mul_ps(*_a++,*_b++));o[1]+= YSUM(x);
218  )
219  _NET(
220  _mm_storeu_ps(x,_mm_mul_ps(*_a++,*_b++));o[2] = XSUM(x);,
221  _mm_storeu_ps(x,_mm_mul_ps(*_a++,*_b++));o[2]+= YSUM(x);
222  )
223  _NET(
224  _mm_storeu_ps(x,_mm_mul_ps(*_a++,*_b++));o[3] = XSUM(x);,
225  _mm_storeu_ps(x,_mm_mul_ps(*_a++,*_b++));o[3]+= YSUM(x);
226  )
227  return *_o;
228 }
229 
230 static inline void _sse_add_ps(__m128* _a, __m128* _b) {
231 // _a += _b
232  _NET(_a[0] = _mm_add_ps(_a[0],_b[0]);,
233  _a[1] = _mm_add_ps(_a[1],_b[1]);)
234  return;
235 }
236 
237 static inline void _sse_add_ps(__m128* _a, __m128* _b, __m128 _c) {
238 // _a += _b *_c
239  _NET(_a[0] = _mm_add_ps(_a[0],_mm_mul_ps(_b[0],_c));,
240  _a[1] = _mm_add_ps(_a[1],_mm_mul_ps(_b[1],_c));)
241  return;
242 }
243 
244 static inline void _sse_add4_ps(__m128* _a, __m128* _b, __m128 _c) {
245 // _a++ += _b++ *c[0]
246 // _a++ += _b++ *c[1]
247 // _a++ += _b++ *c[2]
248 // _a++ += _b++ *c[3]
249  float c[4]; _mm_storeu_ps(c,_c);
250  __m128* _p = _a;
251  __m128* _q = _b;
252  _NET(*_p = _mm_add_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps( c ))); _p++;,
253  *_p = _mm_add_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps( c ))); _p++;)
254  _NET(*_p = _mm_add_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+1))); _p++;,
255  *_p = _mm_add_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+1))); _p++;)
256  _NET(*_p = _mm_add_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+2))); _p++;,
257  *_p = _mm_add_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+2))); _p++;)
258  _NET(*_p = _mm_add_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+3))); _p++;,
259  *_p = _mm_add_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+3))); _p++;)
260  return;
261 }
262 
263 static inline void _sse_sub_ps(__m128* _a, __m128* _b) {
264 // _a -= _b
265  _NET(_a[0] = _mm_sub_ps(_a[0],_b[0]);,
266  _a[1] = _mm_sub_ps(_a[1],_b[1]);)
267  return;
268 }
269 
270 static inline void _sse_sub4_ps(__m128* _a, __m128* _b, __m128 _c) {
271 // _a++ -= _b++ *c[0]
272 // _a++ -= _b++ *c[1]
273 // _a++ -= _b++ *c[2]
274 // _a++ -= _b++ *c[3]
275  float c[4]; _mm_storeu_ps(c,_c);
276  __m128* _p = _a;
277  __m128* _q = _b;
278  _NET(*_p = _mm_sub_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps( c ))); _p++;,
279  *_p = _mm_sub_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps( c ))); _p++;)
280  _NET(*_p = _mm_sub_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+1))); _p++;,
281  *_p = _mm_sub_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+1))); _p++;)
282  _NET(*_p = _mm_sub_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+2))); _p++;,
283  *_p = _mm_sub_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+2))); _p++;)
284  _NET(*_p = _mm_sub_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+3))); _p++;,
285  *_p = _mm_sub_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+3))); _p++;)
286  return;
287 }
288 
289 static inline void _sse_cpf_ps(float* a, __m128* _p) {
290  _NET(_mm_storeu_ps(a,*_p);,
291  _mm_storeu_ps(a+4,*(_p+1));)
292  return;
293 }
294 
295 static inline void _sse_cpf_ps(__m128* _a, __m128* _p) {
296  _NET(*_a = *_p;, *(_a+1) = *(_p+1);)
297 }
298 
299 static inline void _sse_cpf4_ps(__m128* _aa, __m128* _pp) {
300  __m128* _a = _aa;
301  __m128* _p = _pp;
302  _NET(*_a++ = *_p++;, *_a++ = *_p++;)
303  _NET(*_a++ = *_p++;, *_a++ = *_p++;)
304  _NET(*_a++ = *_p++;, *_a++ = *_p++;)
305  _NET(*_a++ = *_p++;, *_a++ = *_p++;)
306 }
307 
308 
309 static inline void _sse_cpf_ps(float* a, __m128* _p, float b) {
310  __m128 _b = _mm_load1_ps(&b);
311  _NET(_mm_storeu_ps(a,_mm_mul_ps(*_p,_b));,
312  _mm_storeu_ps(a+4,_mm_mul_ps(*(_p+1),_b));)
313  return;
314 }
315 
316 static inline void _sse_cpf4_ps(float* aa, __m128* _pp) {
317 // copy data from pp to aa for 4 consecutive pixels
318  float* a = aa;
319  __m128* _p = _pp;
320 
321  _NET(_mm_storeu_ps(a,*_p++); a+=4;,
322  _mm_storeu_ps(a,*_p++); a+=4;)
323  _NET(_mm_storeu_ps(a,*_p++); a+=4;,
324  _mm_storeu_ps(a,*_p++); a+=4;)
325  _NET(_mm_storeu_ps(a,*_p++); a+=4;,
326  _mm_storeu_ps(a,*_p++); a+=4;)
327  _NET(_mm_storeu_ps(a,*_p++); a+=4;,
328  _mm_storeu_ps(a,*_p++); a+=4;)
329 
330  return;
331 }
332 
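// Usage sketch (illustrative addition, not part of the original header): scale 4
// packed pixels by hypothetical per-pixel factors and unpack the result into a
// float buffer assumed large enough for the 8-detector layout (32 floats).
static inline void _sse_scale4_sketch_ps(__m128* _pix, float* out) {
   __m128 _c = _mm_setr_ps(1.f, 2.f, 0.5f, 0.f);   // placeholder per-pixel factors
   _sse_mul4_ps(_pix,_c);                          // pixel k is multiplied by c[k]
   _sse_cpf4_ps(out,_pix);                         // copy packed pixels to float memory
}
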
333 static inline void _sse_cpf4_ps(__m128* _aa, __m128* _pp, __m128 _c) {
334 // multiply p by c and copy data from p to a for 4 consecutive pixels
335 // calculate _a[0]=_p[0]*c[0]
336 // calculate _a[1]=_p[1]*c[1]
337 // calculate _a[2]=_p[2]*c[2]
338 // calculate _a[3]=_p[3]*c[3]
339  float c[4]; _mm_storeu_ps(c,_c);
340  __m128* _a = _aa;
341  __m128* _p = _pp;
342 
343  _NET(*_a++ = _mm_mul_ps(*_p++,_mm_load1_ps( c ));,
344  *_a++ = _mm_mul_ps(*_p++,_mm_load1_ps( c ));)
345  _NET(*_a++ = _mm_mul_ps(*_p++,_mm_load1_ps(c+1));,
346  *_a++ = _mm_mul_ps(*_p++,_mm_load1_ps(c+1));)
347  _NET(*_a++ = _mm_mul_ps(*_p++,_mm_load1_ps(c+2));,
348  *_a++ = _mm_mul_ps(*_p++,_mm_load1_ps(c+2));)
349  _NET(*_a++ = _mm_mul_ps(*_p++,_mm_load1_ps(c+3));,
350  *_a++ = _mm_mul_ps(*_p++,_mm_load1_ps(c+3));)
351  return;
352 }
353 
354 static inline float _sse_nrg_ps(__m128* _u, float c, __m128* _v, float s, __m128* _a) {
355 // calculate b = (a - u*c - v*s) and return b*b/2
356  float x[4];
357  float out;
358  __m128 _b;
359  __m128 _c = _mm_load1_ps(&c);
360  __m128 _s = _mm_load1_ps(&s);
361 
362  _NET(_b = _mm_sub_ps(_a[0], _mm_add_ps(_mm_mul_ps(*_u,_c), _mm_mul_ps(*_v,_s)));
363  _mm_storeu_ps(x,_mm_mul_ps(_b,_b)); out = XSUM(x);,
364  _b = _mm_sub_ps(_a[1], _mm_add_ps(_mm_mul_ps(*(_u+1),_c), _mm_mul_ps(*(_v+1), _s)));
365  _mm_storeu_ps(x,_mm_mul_ps(_b,_b)); out+= YSUM(x);)
366 
367  return out/2.;
368 }
369 
370 static inline void _sse_rotadd_ps(__m128* _u, float c, __m128* _v, float s, __m128* _a) {
371 // calculate a += u*c + v*s
372  __m128 _c = _mm_load1_ps(&c);
373  __m128 _s = _mm_load1_ps(&s);
374  _NET(
375  _a[0] = _mm_add_ps(_a[0], _mm_add_ps(_mm_mul_ps(_u[0],_c), _mm_mul_ps(_v[0],_s)));,
376  _a[1] = _mm_add_ps(_a[1], _mm_add_ps(_mm_mul_ps(_u[1],_c), _mm_mul_ps(_v[1],_s)));
377  )
378  return;
379 }
380 
381 static inline float _sse_rotsub_ps(__m128* _u, float c, __m128* _v, float s, __m128* _a) {
382 // calculate a -= u*c + v*s and return a*a
383  float x[4];
384  float out;
385  __m128 _c = _mm_load1_ps(&c);
386  __m128 _s = _mm_load1_ps(&s);
387 
388  _NET(
389  _a[0] = _mm_sub_ps(_a[0], _mm_add_ps(_mm_mul_ps(_u[0],_c), _mm_mul_ps(_v[0],_s)));
390  _mm_storeu_ps(x,_mm_mul_ps(_a[0],_a[0])); out = XSUM(x);,
391  _a[1] = _mm_sub_ps(_a[1], _mm_add_ps(_mm_mul_ps(_u[1],_c), _mm_mul_ps(_v[1], _s)));
392  _mm_storeu_ps(x,_mm_mul_ps(_a[1],_a[1])); out+= YSUM(x);
393  )
394 
395  return out;
396 }
397 
398 static inline void _sse_rotp_ps(__m128* u, float* c, __m128* v, float* s, __m128* a) {
399 // calculate a = u*c + v*s
400  _NET(
401  a[0] = _mm_add_ps(_mm_mul_ps(u[0],_mm_load1_ps(c)), _mm_mul_ps(v[0],_mm_load1_ps(s)));,
402  a[1] = _mm_add_ps(_mm_mul_ps(u[1],_mm_load1_ps(c)), _mm_mul_ps(v[1],_mm_load1_ps(s)));
403  )
404 }
405 
406 static inline void _sse_rotm_ps(__m128* u, float* c, __m128* v, float* s, __m128* a) {
407 // calculate a = u*c - v*s
408  _NET(
409  a[0] = _mm_sub_ps(_mm_mul_ps(u[0],_mm_load1_ps(c)), _mm_mul_ps(v[0],_mm_load1_ps(s)));,
410  a[1] = _mm_sub_ps(_mm_mul_ps(u[1],_mm_load1_ps(c)), _mm_mul_ps(v[1],_mm_load1_ps(s)));
411  )
412 }
413 
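// Illustrative addition (not part of the original header): _sse_rotp_ps and
// _sse_rotm_ps together apply a plane rotation with cosine c and sine s to the
// vector pair (u,v), the same pattern used below by _sse_dpf4_ps and _sse_dsp4_ps.
static inline void _sse_rot_pair_sketch_ps(__m128* _u, __m128* _v, float c, float s,
                                           __m128* _a, __m128* _b) {
   _sse_rotp_ps(_u,&c,_v,&s,_a);   // a = u*c + v*s
   _sse_rotm_ps(_v,&c,_u,&s,_b);   // b = v*c - u*s
}
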
414 static inline __m128 _sse_rotp_ps(__m128 _u, __m128 _c, __m128 _v, __m128 _s) {
415 // return u*c + v*s
416  return _mm_add_ps(_mm_mul_ps(_u,_c), _mm_mul_ps(_v,_s));
417 }
418 
419 static inline __m128 _sse_rotm_ps(__m128 _u, __m128 _c, __m128 _v, __m128 _s) {
420 // return u*c - v*s
421  return _mm_sub_ps(_mm_mul_ps(_u,_c), _mm_mul_ps(_v,_s));
422 }
423 
424 static inline void _sse_rot4p_ps(__m128* _u, __m128* _c, __m128* _v, __m128* _s, __m128* _a) {
425 // calculate a[0] = u[0]*c[0] + v[0]*s[0]
426 // calculate a[1] = u[1]*c[1] + v[1]*s[1]
427 // calculate a[2] = u[2]*c[2] + v[2]*s[2]
428 // calculate a[3] = u[3]*c[3] + v[3]*s[3]
429  float c[4];
430  float s[4];
431  _mm_storeu_ps(c,*_c);
432  _mm_storeu_ps(s,*_s);
433  __m128* u = _u;
434  __m128* v = _v;
435  __m128* a = _a;
436  _NET(
437  *a++ = _mm_add_ps(_mm_mul_ps(*u++,_mm_load1_ps( c )), _mm_mul_ps(*v++,_mm_load1_ps( s )));,
438  *a++ = _mm_add_ps(_mm_mul_ps(*u++,_mm_load1_ps( c )), _mm_mul_ps(*v++,_mm_load1_ps( s )));
439  )
440  _NET(
441  *a++ = _mm_add_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+1)), _mm_mul_ps(*v++,_mm_load1_ps(s+1)));,
442  *a++ = _mm_add_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+1)), _mm_mul_ps(*v++,_mm_load1_ps(s+1)));
443  )
444  _NET(
445  *a++ = _mm_add_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+2)), _mm_mul_ps(*v++,_mm_load1_ps(s+2)));,
446  *a++ = _mm_add_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+2)), _mm_mul_ps(*v++,_mm_load1_ps(s+2)));
447  )
448  _NET(
449  *a++ = _mm_add_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+3)), _mm_mul_ps(*v++,_mm_load1_ps(s+3)));,
450  *a++ = _mm_add_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+3)), _mm_mul_ps(*v++,_mm_load1_ps(s+3)));
451  )
452 }
453 
454 static inline void _sse_rot4m_ps(__m128* _u, __m128* _c, __m128* _v, __m128* _s, __m128* _a) {
455 // calculate a[0] = u[0]*c[0] - v[0]*s[0]
456 // calculate a[1] = u[1]*c[1] - v[1]*s[1]
457 // calculate a[2] = u[2]*c[2] - v[2]*s[2]
458 // calculate a[3] = u[3]*c[3] - v[3]*s[3]
459  float c[4];
460  float s[4];
461  _mm_storeu_ps(c,*_c);
462  _mm_storeu_ps(s,*_s);
463  __m128* u = _u;
464  __m128* v = _v;
465  __m128* a = _a;
466  _NET(
467  *a++ = _mm_sub_ps(_mm_mul_ps(*u++,_mm_load1_ps( c )), _mm_mul_ps(*v++,_mm_load1_ps( s )));,
468  *a++ = _mm_sub_ps(_mm_mul_ps(*u++,_mm_load1_ps( c )), _mm_mul_ps(*v++,_mm_load1_ps( s )));
469  )
470  _NET(
471  *a++ = _mm_sub_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+1)), _mm_mul_ps(*v++,_mm_load1_ps(s+1)));,
472  *a++ = _mm_sub_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+1)), _mm_mul_ps(*v++,_mm_load1_ps(s+1)));
473  )
474  _NET(
475  *a++ = _mm_sub_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+2)), _mm_mul_ps(*v++,_mm_load1_ps(s+2)));,
476  *a++ = _mm_sub_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+2)), _mm_mul_ps(*v++,_mm_load1_ps(s+2)));
477  )
478  _NET(
479  *a++ = _mm_sub_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+3)), _mm_mul_ps(*v++,_mm_load1_ps(s+3)));,
480  *a++ = _mm_sub_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+3)), _mm_mul_ps(*v++,_mm_load1_ps(s+3)));
481  )
482 }
483 
484 static inline void _sse_point_ps(__m128** _p, float** p, short** m, int l, int n) {
485 // point 0-7 __m128 pointers to first network pixel
486  NETX(_p[0] = (__m128*) (p[0] + m[0][l]*n);,
487  _p[1] = (__m128*) (p[1] + m[1][l]*n);,
488  _p[2] = (__m128*) (p[2] + m[2][l]*n);,
489  _p[3] = (__m128*) (p[3] + m[3][l]*n);,
490  _p[4] = (__m128*) (p[4] + m[4][l]*n);,
491  _p[5] = (__m128*) (p[5] + m[5][l]*n);,
492  _p[6] = (__m128*) (p[6] + m[6][l]*n);,
493  _p[7] = (__m128*) (p[7] + m[7][l]*n);)
494  return;
495 }
496 
497 static inline __m128 _sse_sum_ps(__m128** _p) {
498  __m128 _q = _mm_setzero_ps();
499  NETX(_q = _mm_add_ps(_q, *_p[0]);,
500  _q = _mm_add_ps(_q, *_p[1]);,
501  _q = _mm_add_ps(_q, *_p[2]);,
502  _q = _mm_add_ps(_q, *_p[3]);,
503  _q = _mm_add_ps(_q, *_p[4]);,
504  _q = _mm_add_ps(_q, *_p[5]);,
505  _q = _mm_add_ps(_q, *_p[6]);,
506  _q = _mm_add_ps(_q, *_p[7]);)
507  return _q;
508 }
509 
510 static inline __m128 _sse_cut_ps(__m128* _pE, __m128** _pe, __m128 _Es, __m128 _cmp) {
511  NETX(_cmp = _mm_and_ps(_cmp,_mm_cmpge_ps(_mm_sub_ps(*_pE, *_pe[0]++),_Es));,
512  _cmp = _mm_and_ps(_cmp,_mm_cmpge_ps(_mm_sub_ps(*_pE, *_pe[1]++),_Es));,
513  _cmp = _mm_and_ps(_cmp,_mm_cmpge_ps(_mm_sub_ps(*_pE, *_pe[2]++),_Es));,
514  _cmp = _mm_and_ps(_cmp,_mm_cmpge_ps(_mm_sub_ps(*_pE, *_pe[3]++),_Es));,
515  _cmp = _mm_and_ps(_cmp,_mm_cmpge_ps(_mm_sub_ps(*_pE, *_pe[4]++),_Es));,
516  _cmp = _mm_and_ps(_cmp,_mm_cmpge_ps(_mm_sub_ps(*_pE, *_pe[5]++),_Es));,
517  _cmp = _mm_and_ps(_cmp,_mm_cmpge_ps(_mm_sub_ps(*_pE, *_pe[6]++),_Es));,
518  _cmp = _mm_and_ps(_cmp,_mm_cmpge_ps(_mm_sub_ps(*_pE, *_pe[7]++),_Es));)
519  return _cmp;
520 }
521 
522 static inline void _sse_minSNE_ps(__m128* _pE, __m128** _pe, __m128* _es) {
523 // put pixel minimum subnetwork energy in _es
524 // input _es should be initialized to _pE before call
525  NETX(*_es = _mm_min_ps(*_es,_mm_sub_ps(*_pE, *_pe[0]++));,
526  *_es = _mm_min_ps(*_es,_mm_sub_ps(*_pE, *_pe[1]++));,
527  *_es = _mm_min_ps(*_es,_mm_sub_ps(*_pE, *_pe[2]++));,
528  *_es = _mm_min_ps(*_es,_mm_sub_ps(*_pE, *_pe[3]++));,
529  *_es = _mm_min_ps(*_es,_mm_sub_ps(*_pE, *_pe[4]++));,
530  *_es = _mm_min_ps(*_es,_mm_sub_ps(*_pE, *_pe[5]++));,
531  *_es = _mm_min_ps(*_es,_mm_sub_ps(*_pE, *_pe[6]++));,
532  *_es = _mm_min_ps(*_es,_mm_sub_ps(*_pE, *_pe[7]++));
533  )
534 }
535 
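// Note (inferred from usage, not part of the original comments): _sse_point_ps
// sets up an array of up to 8 per-detector __m128 pointers into the pixel energy
// maps; _sse_sum_ps, _sse_cut_ps and _sse_minSNE_ps then consume them, with
// _sse_cut_ps and _sse_minSNE_ps post-incrementing each detector pointer so that
// repeated calls walk through consecutive groups of 4 pixels.
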
536 static inline float _sse_maxE_ps(__m128* _a, __m128* _A) {
537 // given input 00 and 90 data vectors
538 // returns energy of dominant detector (max energy)
539  float out;
540  float x[4];
541  __m128 _o1;
542  __m128 _o2 = _mm_setzero_ps();
543  _NET(
544  _o1 = _mm_add_ps(_mm_mul_ps(_a[0],_a[0]),_mm_mul_ps(_A[0],_A[0]));,
545  _o2 = _mm_add_ps(_mm_mul_ps(_a[1],_a[1]),_mm_mul_ps(_A[1],_A[1]));
546  )
547  _o1 = _mm_max_ps(_o1,_o2); _mm_storeu_ps(x,_o1); out=x[0];
548  if(out<x[1]) out=x[1];
549  if(out<x[2]) out=x[2];
550  if(out<x[3]) out=x[3];
551  return out;
552 }
553 
554 static inline void _sse_ort4_ps(__m128* _u, __m128* _v, __m128* _s, __m128* _c) {
555 // compute the rotation phase which makes vectors _u and _v orthogonal
556 // (the rotation itself is applied by the caller, e.g. via _sse_rot4p_ps/_sse_rot4m_ps)
557 // fill in sin and cos in _s and _c respectively
558  static const __m128 sm = _mm_set1_ps(-0.f); // -0.f = 1 << 31
559  static const __m128 _o = _mm_set1_ps(1.e-24);
560  static const __m128 _0 = _mm_set1_ps(0.);
561  static const __m128 _1 = _mm_set1_ps(1.);
562  static const __m128 _2 = _mm_set1_ps(2.);
563  __m128 _n,_m,gI,gR,_p,_q;
564  gI = _mm_mul_ps(_sse_dot4_ps(_u,_v),_2); // ~sin(2*psi) or 2*u*v
565  gR = _mm_sub_ps(_sse_dot4_ps(_u,_u),_sse_dot4_ps(_v,_v)); // u^2-v^2
566  _p = _mm_and_ps(_mm_cmpge_ps(gR,_0),_1); // 1 if gR>0. or 0 if gR<0.
567  _q = _mm_sub_ps(_1,_p); // 0 if gR>0. or 1 if gR<0.
568  _n = _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(gI,gI),_mm_mul_ps(gR,gR))); // gc
569  gR = _mm_add_ps(_mm_andnot_ps(sm,gR),_mm_add_ps(_n,_o)); // gc+|gR|+eps
570  _n = _mm_add_ps(_mm_mul_ps(_2,_n),_o); // 2*gc + eps
571  gI = _mm_div_ps(gI,_n); // sin(2*psi)
572  _n = _mm_sqrt_ps(_mm_div_ps(gR,_n)); // sqrt((gc+|gR|)/(2gc+eps))
573  _m = _mm_and_ps(_mm_cmpge_ps(gI,_0),_1); // 1 if gI>0. or 0 if gI<0.
574  _m = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_m,_2),_1),_n); // _n if gI>0 or -_n if gI<0
575  *_s = _mm_add_ps(_mm_mul_ps(_q,_m),_mm_mul_ps(_p,_mm_div_ps(gI,_n))); // sin(psi)
576  gI = _mm_andnot_ps(sm,gI); // |gI|
577  *_c = _mm_add_ps(_mm_mul_ps(_p,_n),_mm_mul_ps(_q,_mm_div_ps(gI,_n))); // cos(psi)
578  return;
579 }
580 
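// Scalar reference (illustrative addition, not part of the original header): per
// pixel the rotation computed above is, up to the 1e-24 regulators, the angle
// psi = 0.5*atan2(2*(u,v), |u|^2-|v|^2), returned as sin(psi), cos(psi) with
// cos(psi) >= 0.  A plain-float version for a single pixel:
static inline void _sse_ort_scalar_sketch(float uv, float uu, float vv, float* s, float* c) {
   float psi = 0.5f*atan2f(2.f*uv, uu-vv);   // uv=(u,v), uu=|u|^2, vv=|v|^2
   *s = sinf(psi);                           // |psi| <= pi/2, hence cos(psi) >= 0
   *c = cosf(psi);
}
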
581 static inline void _sse_ort4_ps(__m128* _s, __m128* _c, __m128 _r) {
582 // solves equation sin(2a) * _c - cos(2a) * _s = 0
583 // input: _s ~ sin(2a), _c ~ cos(2a) (not normalized), _r - regulator
584 // regularized _C = |_c| + (1+_r)*eps
585 // norm: _n = sqrt(_s*_s+_C*_C)
586 // cos(2a) = _C/_n
587 // sin(2a) = _s/_n
588 // output _s = sin(a), _c = cos(a)
589 // algorithm:
590 // _c>0: cos(a) = sqrt([1+cos(2a)]/2); sin(a) = sin(2a)/2/cos(a)
591 // _c<0: sin(a) = sign[_s]*sqrt([1+cos(2a)]/2); cos(a) = |sin(2a)|/2/|sin(a)|
592  static const __m128 sm = _mm_set1_ps(-0.f); // -0.f = 1 << 31
593  static const __m128 _0 = _mm_set1_ps(0.);
594  static const __m128 _5 = _mm_set1_ps(0.5);
595  static const __m128 _1 = _mm_set1_ps(1.);
596  static const __m128 _2 = _mm_set1_ps(2.);
597  __m128 _n,_m,_C,_S,_p,_q;
598  _r = _mm_mul_ps(_mm_add_ps(_1,_r),_mm_set1_ps(1.e-6)); // regulator
599  _m = _mm_and_ps(_mm_cmpge_ps(*_s,_0),_1); // 1 if _S>0. or 0 if _S<0.
600  _m = _mm_sub_ps(_mm_mul_ps(_m,_2),_1); // sign(_s)
601  _p = _mm_and_ps(_mm_cmpge_ps(*_c,_0),_1); // 1 if _c>0. or 0 if _c<0.
602  _q = _mm_sub_ps(_1,_p); // 0 if _c>0. or 1 if _c<0.
603  _C = _mm_add_ps(_mm_andnot_ps(sm,*_c),_r); // |_c|+regulator
604  _n = _mm_add_ps(_mm_mul_ps(*_s,*_s),_mm_mul_ps(_C,_C)); // norm^2 for sin(2a) and cos(2a)
605  _n = _mm_div_ps(_1,_mm_mul_ps(_mm_sqrt_ps(_n),_2)); // 0.5/norm for sin(2a) and cos(2a)
606  _C = _mm_sqrt_ps(_mm_add_ps(_5,_mm_mul_ps(_C,_n))); // |cos(a)|
607  _S = _mm_div_ps(_mm_mul_ps(*_s,_n),_C); // sin(a)*cos(a)/|cos(a)|
608  *_s = _mm_add_ps(_mm_mul_ps(_p,_S),_mm_mul_ps(_q,_mm_mul_ps(_C,_m))); // sin(psi)
609  *_c = _mm_add_ps(_mm_mul_ps(_p,_C),_mm_mul_ps(_q,_mm_mul_ps(_S,_m))); // cos(psi)
610  return;
611 }
612 
613 static inline void _sse_dpf4_ps(__m128* _Fp, __m128* _Fx, __m128* _fp, __m128* _fx) {
614 // transformation to DPF for 4 consecutive pixels.
615 // rotate vectors Fp and Fx into DPF: fp and fx
616  __m128 _c, _s;
617  _sse_ort4_ps(_Fp,_Fx,&_s,&_c); // get sin and cos
618  _sse_rot4p_ps(_Fp,&_c,_Fx,&_s,_fp); // get fp=Fp*c+Fx*s
619  _sse_rot4m_ps(_Fx,&_c,_Fp,&_s,_fx); // get fx=Fx*c-Fp*s
620  return;
621 }
622 
623 static inline void _sse_pnp4_ps(__m128* _fp, __m128* _fx, __m128* _am, __m128* _AM, __m128* _u, __m128* _v) {
624 // projection to network plane (pnp)
625 // _fp and _fx must be in DPF
626 // project vectors _am and _AM on the network plane _fp,_fx
627 // the projections are returned in the output vectors _u and _v
628 // c = xp/gp - cos of rotation to PCF
629 // s = xx/gx - sin of rotation to PCF
630  static const __m128 _o = _mm_set1_ps(1.e-24);
631  static const __m128 _1 = _mm_set1_ps(1.0);
632  __m128 gp = _mm_div_ps(_1,_mm_add_ps(_sse_dot4_ps(_fp,_fp),_o)); // 1/fp*fp
633  _sse_sub4_ps(_fx,_fp,_mm_mul_ps(gp,_sse_dot4_ps(_fp,_fx))); // check fx _|_ to fp
634  __m128 gx = _mm_div_ps(_1,_mm_add_ps(_sse_dot4_ps(_fx,_fx),_o)); // 1/fx*fx
635  __m128 cc = _mm_mul_ps(_sse_dot4_ps(_fp,_am),gp); // cos
636  __m128 ss = _mm_mul_ps(_sse_dot4_ps(_fx,_am),gx); // sin
637  _sse_rot4p_ps(_fp,&cc,_fx,&ss,_u); // get vector u
638  cc = _mm_mul_ps(_sse_dot4_ps(_fp,_AM),gp); // cos
639  ss = _mm_mul_ps(_sse_dot4_ps(_fx,_AM),gx); // sin
640  _sse_rot4p_ps(_fp,&cc,_fx,&ss,_v); // get vector v
641  return;
642 }
643 
644 static inline void _sse_dsp4_ps(__m128* u, __m128* v, __m128* _am, __m128* _AM, __m128* _u, __m128* _v) {
645 // dual stream phase (dsp) transformation
646 // take projection vectors u and v,
647 // make them orthogonal, calculate dual stream phase
648 // apply phase transformation both to data and projections
649  __m128 _c, _s;
650  _sse_ort4_ps(u,v,&_s,&_c); // get sin and cos
651  _sse_rot4p_ps(u,&_c,v,&_s,_u); // get 00 response
652  _sse_rot4m_ps(v,&_c,u,&_s,_v); // get 90 response
653  _sse_rot4p_ps(_am,&_c,_AM,&_s,u); // get 00 data vector
654  _sse_rot4m_ps(_AM,&_c,_am,&_s,v); // get 90 data vector
655  _sse_cpf4_ps(_am,u);
656  _sse_cpf4_ps(_AM,v);
657  return;
658 }
659 
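// Pipeline sketch (illustrative addition, not part of the original header): the
// ordering of the 4-pixel helpers suggested by their comments: rotate the
// antenna patterns into DPF, project the 00/90 data onto the network plane,
// then apply the dual-stream phase transformation.  Arrays are sized for the
// 8-detector layout.
static inline void _sse_dpf_pipeline_sketch_ps(__m128* _Fp, __m128* _Fx,
                                               __m128* _am, __m128* _AM,
                                               __m128* _u00, __m128* _v90) {
   __m128 _fp[8], _fx[8];                    // antenna patterns in DPF
   __m128 _u[8], _v[8];                      // projections on the network plane
   _sse_dpf4_ps(_Fp,_Fx,_fp,_fx);            // Fp,Fx -> fp,fx (DPF)
   _sse_pnp4_ps(_fp,_fx,_am,_AM,_u,_v);      // project am,AM on the (fp,fx) plane
   _sse_dsp4_ps(_u,_v,_am,_AM,_u00,_v90);    // 00/90 responses; am,AM rotated in place
}
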
660 static inline __m128 _sse_ei4_ps(__m128* _u, __m128 _L) {
661 // returns incoherent energy for vector u
662 // calculates sum: u[k]*u[k]*u[k]*u[k]/L
663 // where L should be |u|^2
664  float x[4];
665  float o[4];
666  __m128* _a = _u;
667  __m128 _c;
668  _NET(
669  _c = _mm_mul_ps(*_a,*_a);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));_a++;o[0] = XSUM(x);,
670  _c = _mm_mul_ps(*_a,*_a);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));_a++;o[0]+= YSUM(x);
671  )
672  _NET(
673  _c = _mm_mul_ps(*_a,*_a);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));_a++;o[1] = XSUM(x);,
674  _c = _mm_mul_ps(*_a,*_a);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));_a++;o[1]+= YSUM(x);
675  )
676  _NET(
677  _c = _mm_mul_ps(*_a,*_a);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));_a++;o[2] = XSUM(x);,
678  _c = _mm_mul_ps(*_a,*_a);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));_a++;o[2]+= YSUM(x);
679  )
680  _NET(
681  _c = _mm_mul_ps(*_a,*_a);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));_a++;o[3] = XSUM(x);,
682  _c = _mm_mul_ps(*_a,*_a);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));_a++;o[3]+= YSUM(x);
683  )
684  return _mm_div_ps(_mm_load_ps(o),_mm_add_ps(_L,_mm_set1_ps(1.e-12)));
685 }
686 
687 static inline __m128 _sse_ei4xx_ps(__m128* _x, __m128* _u, __m128 _L) {
688 // returns incoherent energy for vectors x,u
689 // _x - data vector
690 // _u - projection vector on the network plane
691 // calculates sum: x[k]*x[k]*u[k]*u[k]/L
692  float x[4];
693  float o[4];
694  __m128* _a = _x;
695  __m128* _b = _u;
696  __m128 _c;
697  _NET(
698  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[0] = XSUM(x);,
699  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[0]+= YSUM(x);
700  )
701  _NET(
702  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[1] = XSUM(x);,
703  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[1]+= YSUM(x);
704  )
705  _NET(
706  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[2] = XSUM(x);,
707  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[2]+= YSUM(x);
708  )
709  _NET(
710  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[3] = XSUM(x);,
711  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[3]+= YSUM(x);
712  )
713  return _mm_div_ps(_mm_load_ps(o),_mm_add_ps(_L,_mm_set1_ps(1.e-12)));
714 }
715 
716 static inline __m128 _sse_ei4xu_ps(__m128* _x, __m128* _u, __m128 _L) {
717 // returns incoherent energy for vectors x,u
718 // _x - data vector
719 // _u - projection vector on the network plane
720 // calculates sum: x[k]*u[k]*u[k]*u[k]/L
721  float x[4];
722  float o[4];
723  __m128* _a = _x;
724  __m128* _b = _u;
725  __m128 _c;
726  _NET(
727  _c = _mm_mul_ps(*_b,*_b);_mm_storeu_ps(x,_mm_mul_ps(_c,_mm_mul_ps(*_a++,*_b++)));o[0] = XSUM(x);,
728  _c = _mm_mul_ps(*_b,*_b);_mm_storeu_ps(x,_mm_mul_ps(_c,_mm_mul_ps(*_a++,*_b++)));o[0]+= YSUM(x);
729  )
730  _NET(
731  _c = _mm_mul_ps(*_b,*_b);_mm_storeu_ps(x,_mm_mul_ps(_c,_mm_mul_ps(*_a++,*_b++)));o[1] = XSUM(x);,
732  _c = _mm_mul_ps(*_b,*_b);_mm_storeu_ps(x,_mm_mul_ps(_c,_mm_mul_ps(*_a++,*_b++)));o[1]+= YSUM(x);
733  )
734  _NET(
735  _c = _mm_mul_ps(*_b,*_b);_mm_storeu_ps(x,_mm_mul_ps(_c,_mm_mul_ps(*_a++,*_b++)));o[2] = XSUM(x);,
736  _c = _mm_mul_ps(*_b,*_b);_mm_storeu_ps(x,_mm_mul_ps(_c,_mm_mul_ps(*_a++,*_b++)));o[2]+= YSUM(x);
737  )
738  _NET(
739  _c = _mm_mul_ps(*_b,*_b);_mm_storeu_ps(x,_mm_mul_ps(_c,_mm_mul_ps(*_a++,*_b++)));o[3] = XSUM(x);,
740  _c = _mm_mul_ps(*_b,*_b);_mm_storeu_ps(x,_mm_mul_ps(_c,_mm_mul_ps(*_a++,*_b++)));o[3]+= YSUM(x);
741  )
742  return _mm_div_ps(_mm_load_ps(o),_mm_add_ps(_L,_mm_set1_ps(1.e-12)));
743 }
744 
745 static inline __m128 _sse_null4_ps(__m128* _p, __m128* _q) {
746 // returns global NULL energy: |E-L-Ei+Li| + |Ei-Li|
747 // E = |p|^2; Ei = sum{p[k]^4}/|p|^2
748 // L = |q|^2; Li = sum{q[k]^4}/|q|^2
749  __m128 _sm = _mm_set1_ps(-0.f); // sign mask: -0.f = 1 << 31
750  __m128 _pe = _sse_abs4_ps(_p); // get total energy for _p
751  __m128 _pi = _sse_ei4_ps(_p,_pe); // get incoherent energy for _p
752  __m128 _qe = _sse_abs4_ps(_q); // get total energy for _q
753  __m128 _qi = _sse_ei4_ps(_q,_qe); // get incoherent energy for _q
754  _pi = _mm_sub_ps(_pi,_qi); // Ei-Li
755  _pe = _mm_sub_ps(_mm_sub_ps(_pe,_qe),_pi); // (E-L) - (Ei-Li)
756  return _mm_add_ps(_mm_andnot_ps(_sm,_pi),_mm_andnot_ps(_sm,_pe));
757 }
758 
759 
760 static inline __m128 _sse_ecoh4_ps(__m128* _p, __m128* _q, __m128 _L) {
761 // returns coherent energy given
762 // _p - data vector
763 // _q - network response
764 // calculates vector: u[k] = p[k]*q[k]
765 // returns: L - sum_k{u[k]*u[k]}/L,
766 // where L should be (q,q)^2
767  float x[4];
768  float o[4];
769  __m128* _a = _p;
770  __m128* _b = _q;
771  __m128 _c;
772  _NET(
773  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[0] = XSUM(x);,
774  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[0]+= YSUM(x);
775  )
776  _NET(
777  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[1] = XSUM(x);,
778  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[1]+= YSUM(x);
779  )
780  _NET(
781  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[2] = XSUM(x);,
782  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[2]+= YSUM(x);
783  )
784  _NET(
785  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[3] = XSUM(x);,
786  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[3]+= YSUM(x);
787  )
788  return _mm_sub_ps(_L,_mm_div_ps(_mm_load_ps(o),_mm_add_ps(_L,_mm_set1_ps(1.e-12))));
789 }
790 
791 static inline __m128 _sse_ind4_ps(__m128* _p, __m128 _L) {
792 // returns vector index
793 // _p - data vector and L is its norm^2
794 // calculates vector: u[k] = p[k]*p[k]
795 // returns: sum_k{u[k]*u[k]}/L^2,
796  float x[4];
797  float o[4];
798  __m128* _a = _p;
799  __m128 _c;
800  _NET(
801  _c = _mm_mul_ps(*_a,*_a); _a++; _mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[0] = XSUM(x);,
802  _c = _mm_mul_ps(*_a,*_a); _a++; _mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[0]+= YSUM(x);
803  )
804  _NET(
805  _c = _mm_mul_ps(*_a,*_a); _a++; _mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[1] = XSUM(x);,
806  _c = _mm_mul_ps(*_a,*_a); _a++; _mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[1]+= YSUM(x);
807  )
808  _NET(
809  _c = _mm_mul_ps(*_a,*_a); _a++; _mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[2] = XSUM(x);,
810  _c = _mm_mul_ps(*_a,*_a); _a++; _mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[2]+= YSUM(x);
811  )
812  _NET(
813  _c = _mm_mul_ps(*_a,*_a); _a++; _mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[3] = XSUM(x);,
814  _c = _mm_mul_ps(*_a,*_a); _a++; _mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[3]+= YSUM(x);
815  )
816  _c = _mm_add_ps(_mm_mul_ps(_L,_L),_mm_set1_ps(1.e-16));
817  return _mm_div_ps(_mm_load_ps(o),_c);
818 }
819 
820 static inline __m128 _sse_ecoh4_ps(__m128* _p, __m128* _q) {
821 // returns coherent part of (p,q)^2
822 // calculates reduced unity vector: u[k] = q * |q| /(p,q)
823 // returns: (p,u)^2 - sum_k{p[k]*p[k]*u[k]*u[k]}
824 // = |q|^2 - sum_k{p[k]^2*q[k]^2} * |q|^2/(p,q)^2
825 // = |q|^2 * (1 - sum_k{p[k]^2*q[k]^2}/(p,q)^2)
826  float x[4];
827  float o[4];
828  __m128* _a = _p;
829  __m128* _b = _q;
830  __m128 _c;
831  _NET(
832  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[0] = XSUM(x);,
833  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[0]+= YSUM(x);
834  )
835  _NET(
836  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[1] = XSUM(x);,
837  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[1]+= YSUM(x);
838  )
839  _NET(
840  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[2] = XSUM(x);,
841  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[2]+= YSUM(x);
842  )
843  _NET(
844  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[3] = XSUM(x);,
845  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[3]+= YSUM(x);
846  )
847 
848  _c = _sse_dot4_ps(_p,_q); // scalar product
849  _c = _mm_add_ps(_mm_set1_ps(1.e-12),_mm_mul_ps(_c,_c)); // (p,q)^2+1.e-12
850  _c = _mm_div_ps(_mm_load_ps(o),_c); // get incoherent part
851 
852  return _mm_mul_ps(_sse_abs4_ps(_q),_mm_sub_ps(_mm_set1_ps(1.),_c));
853 }
854 
855 static inline __m128 _sse_ed4_ps(__m128* _p, __m128* _q, __m128 _L) {
856 // returns energy disbalance
857 // _p - data vector
858 // _q - network response
859 // calculates sum: 0.5*(p[k]*q[k]-q[k]*q[k])^2 / L
860  float x[4];
861  float o[4];
862  __m128* _a = _p;
863  __m128* _b = _q;
864  __m128 _aa;
865  _NET(
866  _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
867  _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[0] = XSUM(x);,
868  _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
869  _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[0]+= YSUM(x);
870  )
871  _NET(
872  _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
873  _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[1] = XSUM(x);,
874  _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
875  _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[1]+= YSUM(x);
876  )
877  _NET(
878  _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
879  _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[2] = XSUM(x);,
880  _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
881  _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[2]+= YSUM(x);
882  )
883  _NET(
884  _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
885  _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[3] = XSUM(x);,
886  _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
887  _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[3]+= YSUM(x);
888  )
889  _aa = _mm_mul_ps(_mm_load_ps(o),_mm_set1_ps(0.5));
890  return _mm_div_ps(_aa,_mm_add_ps(_L,_mm_set1_ps(1.e-12)));
891 }
892 
893 static inline __m128 _sse_ed4_ps(__m128* _p, __m128* _q) {
894 // returns energy disbalance
895 // _p - data vector
896 // _q - network response (may not be a projection)
897 // calculates sum: 0.5*sum_k{(p[k]*q[k]-q[k]*q[k])^2} * (q,q)/(p,q)^2
898  float x[4];
899  float o[4];
900  __m128* _a = _p;
901  __m128* _b = _q;
902  __m128 _aa;
903  _NET(
904  _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
905  _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[0] = XSUM(x);,
906  _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
907  _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[0]+= YSUM(x);
908  )
909  _NET(
910  _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
911  _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[1] = XSUM(x);,
912  _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
913  _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[1]+= YSUM(x);
914  )
915  _NET(
916  _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
917  _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[2] = XSUM(x);,
918  _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
919  _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[2]+= YSUM(x);
920  )
921  _NET(
922  _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
923  _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[3] = XSUM(x);,
924  _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
925  _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[3]+= YSUM(x);
926  )
927  _aa = _sse_dot4_ps(_p,_q); // scalar product
928  _aa = _mm_add_ps(_mm_set1_ps(1.e-12),_mm_mul_ps(_aa,_aa)); // (p,q)^2+1.e-12
929  _aa = _mm_div_ps(_sse_abs4_ps(_q),_aa); // (q,q)/(p,q)^2
930 
931  return _mm_mul_ps(_aa,_mm_mul_ps(_mm_load_ps(o),_mm_set1_ps(0.5)));
932 }
933 
934 static inline __m128 _sse_ed4i_ps(__m128* _p, __m128* _q, __m128 _L) {
935 // returns incoherent part of energy disbalance
936 // _p - projection amplitude on the network
937 // calculates vector: v[k] = q[k]*q[k] * (p[k]*q[k]-q[k]*q[k])
938 // returns: 2*Sum_k{v[k]}/L
939  float x[4];
940  float o[4];
941  __m128* _a = _p;
942  __m128* _b = _q;
943  __m128 _aa,_bb;
944  _NET(
945  _aa=_mm_mul_ps(*_a,*_b); _bb=_mm_mul_ps(*_b,*_b);
946  _mm_storeu_ps(x,_mm_mul_ps(_bb,_mm_sub_ps(_aa,_bb)));
947  _a++; _b++; o[0] = XSUM(x);,
948  _aa=_mm_mul_ps(*_a,*_b); _bb=_mm_mul_ps(*_b,*_b);
949  _mm_storeu_ps(x,_mm_mul_ps(_bb,_mm_sub_ps(_aa,_bb)));
950  _a++; _b++; o[0]+= YSUM(x);
951  )
952  _NET(
953  _aa=_mm_mul_ps(*_a,*_b); _bb=_mm_mul_ps(*_b,*_b);
954  _mm_storeu_ps(x,_mm_mul_ps(_bb,_mm_sub_ps(_aa,_bb)));
955  _a++; _b++; o[1] = XSUM(x);,
956  _aa=_mm_mul_ps(*_a,*_b); _bb=_mm_mul_ps(*_b,*_b);
957  _mm_storeu_ps(x,_mm_mul_ps(_bb,_mm_sub_ps(_aa,_bb)));
958  _a++; _b++; o[1]+= YSUM(x);
959  )
960  _NET(
961  _aa=_mm_mul_ps(*_a,*_b); _bb=_mm_mul_ps(*_b,*_b);
962  _mm_storeu_ps(x,_mm_mul_ps(_bb,_mm_sub_ps(_aa,_bb)));
963  _a++; _b++; o[2] = XSUM(x);,
964  _aa=_mm_mul_ps(*_a,*_b); _bb=_mm_mul_ps(*_b,*_b);
965  _mm_storeu_ps(x,_mm_mul_ps(_bb,_mm_sub_ps(_aa,_bb)));
966  _a++; _b++; o[2]+= YSUM(x);
967  )
968  _NET(
969  _aa=_mm_mul_ps(*_a,*_b); _bb=_mm_mul_ps(*_b,*_b);
970  _mm_storeu_ps(x,_mm_mul_ps(_bb,_mm_sub_ps(_aa,_bb)));
971  _a++; _b++; o[3] = XSUM(x);,
972  _aa=_mm_mul_ps(*_a,*_b); _bb=_mm_mul_ps(*_b,*_b);
973  _mm_storeu_ps(x,_mm_mul_ps(_bb,_mm_sub_ps(_aa,_bb)));
974  _a++; _b++; o[3]+= YSUM(x);
975  )
976  _aa = _mm_mul_ps(_mm_load_ps(o),_mm_set1_ps(2.));
977  return _mm_div_ps(_aa,_mm_add_ps(_L,_mm_set1_ps(1.e-12)));
978 }
979 
980 static inline __m128 _sse_like4_ps(__m128* _f, __m128* _a, __m128* _A) {
981 // input _f - antenna pattern (f+ or fx) in DPF
982 // input am,AM - network amplitude vectors
983 // returns: (xp*xp+XP*XP)/|f+|^2 or (xx*xx+XX*XX)/|fx|^2
984  __m128 xx = _sse_dot4_ps(_f,_a); // fp*am
985  __m128 XX = _sse_dot4_ps(_f,_A); // fp*AM
986  xx = _mm_add_ps(_mm_mul_ps(xx,xx),_mm_mul_ps(XX,XX)); // xx=xx*xx+XX*XX
987  return _mm_div_ps(xx,_mm_add_ps(_sse_dot4_ps(_f,_f),_mm_set1_ps(1.e-12)));
988 }
989 
990 static inline __m128 _sse_like4_ps(__m128* fp, __m128* fx, __m128* am, __m128* AM, __m128 _D) {
991 // input fp,fx - antenna patterns in DPF
992 // input am,AM - network amplitude vectors
993 // input _D - (delta) - hard regulator
994 // returns: (xp*xp+XP*XP)/|f+|^2+(xx*xx+XX*XX)/(|fx|^2+delta)
995  __m128 xp = _sse_dot4_ps(fp,am); // fp*am
996  __m128 XP = _sse_dot4_ps(fp,AM); // fp*AM
997  __m128 xx = _sse_dot4_ps(fx,am); // fx*am
998  __m128 XX = _sse_dot4_ps(fx,AM); // fx*AM
999  __m128 gp = _mm_add_ps(_sse_dot4_ps(fp,fp),_mm_set1_ps(1.e-12)); // fp*fp + epsilon
1000  __m128 gx = _mm_add_ps(_sse_dot4_ps(fx,fx),_D); // fx*fx + delta
1001  xp = _mm_add_ps(_mm_mul_ps(xp,xp),_mm_mul_ps(XP,XP)); // xp=xp*xp+XP*XP
1002  xx = _mm_add_ps(_mm_mul_ps(xx,xx),_mm_mul_ps(XX,XX)); // xx=xx*xx+XX*XX
1003  return _mm_add_ps(_mm_div_ps(xp,gp),_mm_div_ps(xx,gx)); // regularized projected energy
1004 }
1005 
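// Likelihood sketch (illustrative addition, not part of the original header):
// regulated projected energy of 4 pixels starting from detector-frame antenna
// patterns, combining _sse_dpf4_ps with the delta-regulated _sse_like4_ps;
// the regulator value below is a placeholder.
static inline __m128 _sse_like4_sketch_ps(__m128* _Fp, __m128* _Fx,
                                          __m128* _am, __m128* _AM) {
   __m128 _fp[8], _fx[8];                      // antenna patterns in DPF
   __m128 _D = _mm_set1_ps(0.1f);              // hypothetical delta (hard regulator)
   _sse_dpf4_ps(_Fp,_Fx,_fp,_fx);              // rotate Fp,Fx into DPF
   return _sse_like4_ps(_fp,_fx,_am,_AM,_D);   // (xp^2+XP^2)/|fp|^2 + (xx^2+XX^2)/(|fx|^2+delta)
}
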
1006 static inline __m128 _sse_like4_ps(__m128* fp, __m128* fx, __m128* am, __m128* AM) {
1007 // input fp,fx - antenna patterns in DPF
1008 // input am,AM - network amplitude vectors
1009 // returns: (xp*xp+XP*XP)/|f+|^2+(xx*xx+XX*XX)/(|fx|^2)
1010  __m128 xp = _sse_dot4_ps(fp,am); // fp*am
1011  __m128 XP = _sse_dot4_ps(fp,AM); // fp*AM
1012  __m128 xx = _sse_dot4_ps(fx,am); // fx*am
1013  __m128 XX = _sse_dot4_ps(fx,AM); // fx*AM
1014  __m128 gp = _mm_add_ps(_sse_dot4_ps(fp,fp),_mm_set1_ps(1.e-12)); // fp*fp + epsilon
1015  __m128 gx = _mm_add_ps(_sse_dot4_ps(fx,fx),_mm_set1_ps(1.e-12)); // fx*fx + epsilon
1016  xp = _mm_add_ps(_mm_mul_ps(xp,xp),_mm_mul_ps(XP,XP)); // xp=xp*xp+XP*XP
1017  xx = _mm_add_ps(_mm_mul_ps(xx,xx),_mm_mul_ps(XX,XX)); // xx=xx*xx+XX*XX
1018  return _mm_add_ps(_mm_div_ps(xp,gp),_mm_div_ps(xx,gx)); // regularized projected energy
1019 }
1020 
1021 static inline __m128 _sse_like4w_ps(__m128* fp, __m128* fx, __m128* am, __m128* AM) {
1022 // input fp,fx - antenna patterns in DPF
1023 // input am,AM - network amplitude vectors
1024 // returns: [(xp*xp+XP*XP)+(xx*xx+XX*XX)]/|f+|^2
1025  __m128 xp = _sse_dot4_ps(fp,am); // fp*am
1026  __m128 XP = _sse_dot4_ps(fp,AM); // fp*AM
1027  __m128 xx = _sse_dot4_ps(fx,am); // fx*am
1028  __m128 XX = _sse_dot4_ps(fx,AM); // fx*AM
1029  __m128 gp = _mm_add_ps(_sse_dot4_ps(fp,fp),_mm_set1_ps(1.e-9)); // fp*fp + epsilon
1030  xp = _mm_add_ps(_mm_mul_ps(xp,xp),_mm_mul_ps(XP,XP)); // xp=xp*xp+XP*XP
1031  xx = _mm_add_ps(_mm_mul_ps(xx,xx),_mm_mul_ps(XX,XX)); // xx=xx*xx+XX*XX
1032  return _mm_div_ps(_mm_add_ps(xp,xx),gp); // regularized projected energy
1033 }
1034 
1035 static inline __m128 _sse_like4_ps(__m128* am, __m128* AM) {
1036 // input am,AM - network projection vectors
1037  return _mm_add_ps(_mm_add_ps(_sse_abs4_ps(am),_sse_abs4_ps(AM)),_mm_set1_ps(1.e-12));
1038 }
1039 
1040 static inline __m128 _sse_reg4x_ps(__m128 _L, __m128* fx, __m128* am, __m128* AM, __m128 _D) {
1041 // x regulator (incoherent)
1042 // input _L - non-regulated likelihood
1043 // input fx - antenna pattern in DPF
1044 // input am,AM - network amplitude vectors
1045 // input _D - (delta) - regulator
1046 // returns: (|fx|^2 - delta*Lx/L)/(|fx|^2 + delta)
1047  static const __m128 _o = _mm_set1_ps(1.e-12);
1048  __m128 FF = _mm_add_ps(_sse_dot4_ps(fx,fx),_o); // fx*fx
1049  __m128 xx = _sse_dot4_ps(fx,am); // fx*am
1050  __m128 XX = _sse_dot4_ps(fx,AM); // fx*AM
1051  xx = _mm_add_ps(_mm_mul_ps(xx,xx),_mm_mul_ps(XX,XX)); // xx=xx*xx+XX*XX
1052  xx = _mm_div_ps(_mm_mul_ps(xx,_D),_mm_mul_ps(_L,FF)); // (Lx/L)*delta
1053  return _mm_div_ps(_mm_sub_ps(FF,xx),_mm_add_ps(FF,_D)); // [|fx|^2-(Lx/L)*delta]/|fx^2+delta|
1054 }
1055 
1056 static inline __m128 _sse_nind4_ps(__m128* _am, __m128* _AM) {
1057 // calculates network index
1058 // returns (00 incoherent + 90 incoherent energy)/(L00+L90)
1059 // input am,AM - network projection vectors
1060  __m128 _ll = _sse_abs4_ps(_am); // L00
1061  __m128 _LL = _sse_abs4_ps(_AM); // L90
1062  __m128 _ei = _sse_ei4_ps(_am,_ll); // 00 incoherent
1063  __m128 _EI = _sse_ei4_ps(_AM,_LL); // 90 incoherent
1064  _ll = _mm_add_ps(_mm_add_ps(_ll,_LL),_mm_set1_ps(1.e-12)); // standard likelihood
1065  return _mm_div_ps(_mm_add_ps(_ei,_EI),_ll); // network index
1066 }
1067 
1068 static inline void _sse_pol4_ps(__m128* _fp, __m128* _fx, __m128* _v, double* r, double* a) {
1069 // calculates the polar coordinates of the input vector v in the DPF frame
1070 // input _fp,_fx - antenna patterns in DPF
1071 // input _v - network projection vector pointer
1072 // output r,a - polar coordinates (r : radius, a : angle in radians)
1073 // r,a are pointers to 4-element double arrays
1074 
1075  __m128 _ss,_cc;
1076  __m128 _oo = _mm_set1_ps(1.e-12);
1077  float rpol[4],cpol[4],spol[4];
1078 
1079  _cc = _sse_abs4_ps(_fp); // |fp|^2
1080  _cc = _mm_add_ps(_mm_sqrt_ps(_cc),_oo); // |fp|
1081  _cc = _mm_div_ps(_sse_dot4_ps(_fp,_v),_cc); // (fp,v)/|fp|
1082  _mm_storeu_ps(cpol,_cc);
1083 
1084  _ss = _sse_abs4_ps(_fx); // |fx|^2
1085  _ss = _mm_add_ps(_mm_sqrt_ps(_ss),_oo); // |fx|
1086  _ss = _mm_div_ps(_sse_dot4_ps(_fx,_v),_ss); // (fx,v)/|fx|
1087  _mm_storeu_ps(spol,_ss);
1088 
1089  _mm_storeu_ps(rpol,_sse_abs4_ps(_v)); // |v|^2
1090 
1091  for(int n=0;n<4;n++) {
1092  r[n] = sqrt(rpol[n]); // |v|
1093  a[n] = atan2(spol[n],cpol[n]); // atan2(spol,cpol)
1094  }
1095 
1096  return;
1097 }
1098 
1099 /*
1100 static inline __m128 _sse_reg4_ps(__m128* fp, __m128* fx, __m128* am, __m128* AM, __m128 _D, __m128 _G) {
1101 // input fp,fx - antenna patterns in DPF
1102 // input am,AM - network amplitude vectors
1103 // input _D - (delta) - Tikhonov constraint
1104 // input _G - (gamma) - polarization constraint
1105 // calculate S^2 = sin(psi)^2, where psi is the angle between f+ and (xi+XI).
1106 // xi is the projection of x00 (_am) on the f+,fx plane
1107 // XI is the projection of x90 (_AM) on the f+,fx plane
1108 // if asymmetry s/(s+S)<T, return 0, otherwise return |xi|^2+|XI|^2
1109 // where s^2 = 4*|f+|*|fx|/(|f+|^2+|fx|^2)^2 and
1110 // and S^2 = 1-(|f+,xi|-|f+,XI|)^2/|f+|^2/(x00^2+x90^2)
1111 // the actual condition checked: s^2 < G*S^2
1112 
1113  static const __m128 sm = _mm_set1_ps(-0.f); // sign mask: -0.f = 1 << 31
1114  static const __m128 _1 = _mm_set1_ps(1);
1115  static const __m128 _2 = _mm_set1_ps(2);
1116  static const __m128 _o = _mm_set1_ps(1.e-12); // epsilon
1117  __m128 xp = _sse_dot4_ps(fp,am); // fp*am
1118  __m128 XP = _sse_dot4_ps(fp,AM); // fp*AM
1119  __m128 xx = _sse_dot4_ps(fx,am); // fx*am
1120  __m128 XX = _sse_dot4_ps(fx,AM); // fx*AM
1121  __m128 cc = _mm_sub_ps(_mm_mul_ps(xx,XP),_mm_mul_ps(xp,XX)); // cc=xx*XP-xp*XX
1122  __m128 ss = _mm_add_ps(_mm_mul_ps(xx,xp),_mm_mul_ps(XX,XP)); // ss=xx*xp+XX*XP
1123  cc = _mm_andnot_ps(sm,_mm_mul_ps(_mm_mul_ps(cc,ss),_2)); // cc=|2*cc*ss| (chk-ed)
1124  xp = _mm_add_ps(_mm_mul_ps(xp,xp),_mm_mul_ps(XP,XP)); // xp=xp*xp+XP*XP
1125  xx = _mm_add_ps(_mm_mul_ps(xx,xx),_mm_mul_ps(XX,XX)); // xx=xx*xx+XX*XX
1126  __m128 gp = _sse_dot4_ps(fp,fp); // fp*fp
1127  __m128 gx = _mm_add_ps(_sse_dot4_ps(fx,fx),_o); // fx*fx + epsilon
1128  XX = _mm_add_ps(_mm_mul_ps(xx,gp),_mm_mul_ps(xp,gx)); // XX=xx*gp+xp*gx
1129  ss = _mm_add_ps(_sse_dot4_ps(am,am),_sse_dot4_ps(AM,AM)); // total energy
1130  ss = _mm_add_ps(ss,_D); // total energy + delta
1131  cc = _mm_div_ps(cc,_mm_add_ps(_mm_mul_ps(xx,ss),_o)); // second psi term - added
1132  ss = _mm_div_ps(xp,_mm_add_ps(_mm_mul_ps(gp,ss),_o)); // first psi term - subtracted
1133  ss = _mm_mul_ps(_G,_mm_add_ps(_mm_sub_ps(_1,ss),cc)); // G*sin(psi)^2
1134  cc = _mm_div_ps(_2,_mm_add_ps(gp,gx)); // cc=2*(1-T)/(gp+gx)
1135  cc = _mm_mul_ps(_mm_mul_ps(cc,cc),_mm_mul_ps(gp,gx)); // right is ready for comparison
1136  //cc = _mm_sqrt_ps(cc);
1137  //ss = _mm_sqrt_ps(ss);
1138  //cc = _mm_div_ps(cc,_mm_add_ps(ss,cc));
1139  cc = _mm_and_ps(_mm_cmpge_ps(cc,ss),_1); // 1 if cc>ss or 0 if cc<ss
1140  XX = _mm_div_ps(XX,_mm_mul_ps(_mm_add_ps(gp,_D),gx)); // std L with Tikhonov regulator
1141  return XX; // final projected energy
1142  return _mm_mul_ps(XX,cc); // final projected energy
1143 }
1144 
1145 
1146 static inline __m128 _sse_asy4_ps(__m128* _fp, __m128* _fx, __m128* _aa, __m128* _AA) {
1147 // calculate sample network asymmetry
1148 // _a - alignment factors
1149 // _e - ellipticity factors
1150 // _c - cos between f+ and 00 projection
1151 // return network asymmetry
1152  static const __m128 sm = _mm_set1_ps(-0.f); // sign mask: -0.f = 1 << 31
1153  static const __m128 _1 = _mm_set1_ps(1.);
1154  static const __m128 _2 = _mm_set1_ps(2.);
1155  static const __m128 _o = _mm_set1_ps(1.e-4);
1156  __m128 _a = _sse_dot4_ps(_fp,_fp); // |fp|^2
1157  __m128 _e = _sse_dot4_ps(_aa,_aa); // |x00|^2
1158  __m128 _c = _sse_dot4_ps(_fp,_aa); // (fp,aa)
1159  _c = _mm_div_ps(_mm_mul_ps(_c,_c),_mm_mul_ps(_a,_e)); // cos^2(psi)
1160  _a = _mm_add_ps(_mm_div_ps(_sse_dot4_ps(_fx,_fx),_a),_o); // a = (|fx|/|fp|)^2
1161  _e = _mm_div_ps(_sse_dot4_ps(_AA,_AA),_mm_mul_ps(_a,_e)); // (|x90|/|x00|)^2/a
1162  _e = _mm_div_ps(_1,_mm_add_ps(_1,_mm_sqrt_ps(_e))); // ee = 1/(1+sqrt(ee))
1163  _c = _mm_mul_ps(_mm_sub_ps(_1,_c),_mm_add_ps(_1,_a)); // _c = s*s*(1+a)
1164  _a = _mm_div_ps(_c,_mm_mul_ps(_2,_a)); // _a = s*s*(1+a)/(2*a)
1165  _a = _mm_div_ps(_1,_mm_add_ps(_1,_mm_sqrt_ps(_a))); // 1/(1+sqrt(aa))
1166  _a = _mm_div_ps(_mm_add_ps(_a,_e),_2); // (aa+ee)/2
1167  _e = _mm_andnot_ps(sm,_mm_sub_ps(_a,_e)); // |aa-ee|/2
1168  return _mm_sub_ps(_a,_e); // (aa-ee)/2-|aa-ee|/2
1169 }
1170 
1171 
1172 static inline float _sse_snc4_ps(__m128* _pe, float* _ne, float* out, int V4) {
1173 // sub net cut (snc)
1174 // _pe - pointer to energy vectors
1175 // _rE - pointer to network energy array
1176 // out - output array
1177 // V4 - number of iterations (pixels)
1178  __m128 _Etot = _mm_setzero_ps(); // total energy
1179  __m128 _Esub = _mm_setzero_ps(); // energy after subnet cut
1180  __m128* _rE = (__m128*) ne; // m128 pointer to energy array
1181 
1182  for(j=0; j<V4; j+=4) { // loop over selected pixels
1183  *_rE = _net_sum_ps(_pe);
1184 
1185  __m128 _cmp = _mm_cmpge_ps(*_rE,_En); // E>En
1186  __m128 _msk = _mm_and_ps(_cmp,_one); // 0/1 mask
1187 
1188  *_rE = _mm_mul_ps(*_rE,_msk); // zero sub-threshold pixels
1189  _Etot = _mm_add_ps(_Etot,*_rE);
1190 
1191  _cmp = _net_cut_ps(_rE, _pe, _Es, _cmp); // apply subnetwork cut
1192 
1193  _msk = _mm_and_ps(_cmp,_one); // 0/1 mask
1194  _Esub = _mm_add_ps(_Esub,_mm_mul_ps(*_rE++,_msk));
1195  *_pE++ = _mm_setzero_ps();
1196  }
1197 
1198  _mm_storeu_ps(etot,_Etot);
1199 }
1200 */
1201 /*
1202  gx = NET.dotx(Fx,Fx)+1.e-24;
1203  gI = NET.dotx(Fp,Fx);
1204  xp = NET.dotx(Fp,am);
1205  xx = NET.dotx(Fx,am);
1206  XP = NET.dotx(Fp,AM);
1207  XX = NET.dotx(Fx,AM);
1208 
1209 // find weak vector 00
1210 
1211  uc = (xp*gx - xx*gI); // u cos of rotation to PCF
1212  us = (xx*gp - xp*gI); // u sin of rotation to PCF
1213  um = NET.rotx(Fp,uc,Fx,us,u); // calculate u and return its norm
1214  et = NET.dotx(am,am);
1215  hh = NET.dotx(am,u,e);
1216  ec = (hh*hh - NET.dotx(e,e))/um;
1217 
1218  if(nn==0) SM.set(n,0.1*hh*hh/NET.dotx(e,e));
1219  else SM.add(n,0.1*hh*hh/NET.dotx(e,e));
1220 
1221  NET.rotx(u,hh/um,e,0.,aa); // normalize aa
1222 // ec = NET.dotx(aa,aa);
1223 
1224 // find weak vector 90
1225 
1226  uc = (XP*gx - XX*gI); // u cos of rotation to PCF
1227  us = (XX*gp - XP*gI); // u sin of rotation to PCF
1228  UM = NET.rotx(Fp,uc,Fx,us,U); // calculate u and return its norm
1229  ET = NET.dotx(AM,AM);
1230  HH = NET.dotx(AM,U,e);
1231  EC = (HH*HH - NET.dotx(e,e))/UM;
1232  NET.rotx(U,HH/UM,e,0.,AA); // normalize AA
1233 // EC = NET.dotx(AA,AA);
1234 // g2->Fill((ec+EC)/(et+ET));
1235 
1236 // transformation to DDF
1237 
1238  gp = NET.dotx(aa,aa)+1.e-24; // fp^2
1239  gx = NET.dotx(AA,AA)+1.e-24; // fx^2
1240  gI = NET.dotx(aa,AA); // fp*fx
1241  gR = (gp-gx)/2.;
1242  gr = (gp+gx)/2.;
1243  gc = sqrt(gR*gR+gI*gI); // norm of complex antenna pattern
1244  b = (gr-gc)/(gr+gc);
1245 
1246 // g0->Fill(sqrt(b));
1247 
1248  cc = sqrt((gc+gR)*(gc+gR)+gI*gI);
1249  ss = NET.rotx(aa,(gc+gR)/cc,AA,gI/cc,s); // s[k]
1250  xx = NET.rotx(am,(gc+gR)/cc,AM,gI/cc,x); // x[k]
1251  cc = sqrt((gc-gR)*(gc-gR)+gI*gI);
1252  SS = NET.rotx(aa,-(gc-gR)/cc,AA,gI/cc,S)+1.e-24; // S[k]
1253  XX = NET.rotx(am,-(gc-gR)/cc,AM,gI/cc,X)+1.e-24; // X[k]
1254 
1255 // cout<<ss<<" "<<NET.dotx(x,s)*NET.dotx(x,s)/ss<<endl;
1256 
1257 // Principle components
1258 
1259  hh = NET.dotx(x,s,e);
1260  ec = (hh*hh - NET.dotx(e,e))/ss;
1261  HH = NET.dotx(X,S,e);
1262  EC = (HH*HH - NET.dotx(e,e))/SS;
1263 
1264 */
1265 
1266 
1267 
1268 #endif // WATSSE_HH