Logo Coherent WaveBurst  
Reference Guide
Logo
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
watavx.hh
Go to the documentation of this file.
1 // Wavelet Analysis Tool
2 // S.Klimenko, University of Florida
3 // library of general functions
4 
5 #ifndef WATAVX_HH
6 #define WATAVX_HH
7 
8 #include <xmmintrin.h>
9 #include <pmmintrin.h> // need for hadd
10 #include <immintrin.h> // AVX
11 //#include <x86intrin.h>
12 #include <iostream>
13 #include <stdlib.h>
14 #include <math.h>
15 #include <TMath.h>
16 #include "wat.hh"
17 
18 // WAT AVX functions
19 
20 
static inline void _avx_free_ps(std::vector<float*> &v) {
// release every _mm_malloc'ed buffer held by v and free the vector's own storage
// v - vector of pointers previously allocated with _mm_malloc (aligned allocator)
   for(size_t n=0; n<v.size(); n++) _mm_free(v[n]);  // size_t index: avoids signed/unsigned comparison
   std::vector<float*>().swap(v);                    // swap with empty temporary: clears AND releases capacity
}
static inline float _wat_hsum(__m128 y) {
// horizontal sum: add the four packed floats of y and return the scalar total
   float lane[4];
   _mm_storeu_ps(lane, y);       // spill the SSE register to memory (unaligned store)
   float total = 0.f;
   for(int k=0; k<4; k++) total += lane[k];
   return total;
}
29 
static inline void _avx_cpf_ps(float** p, float ** q,
                               float** u, float ** v, int I) {
// copy arrays from p-->u and from q-->v
// p,q - per-detector source arrays (one entry per detector)
// u,v - per-detector destination arrays
// I   - number of floats per array; the i+=4 stride implies I is a multiple of 4
//       and the pointers are 16-byte aligned (cast to __m128) -- TODO confirm with callers
// NETX is a project macro (wat.hh) that appears to expand one argument per
// active detector, up to 8 -- assumed, verify against wat.hh

 // reinterpret the source arrays as packed 4-float streams
 NETX(__m128* _p0 = (__m128*)p[0]; __m128* _q0 = (__m128*)q[0]; ,
 __m128* _p1 = (__m128*)p[1]; __m128* _q1 = (__m128*)q[1]; ,
 __m128* _p2 = (__m128*)p[2]; __m128* _q2 = (__m128*)q[2]; ,
 __m128* _p3 = (__m128*)p[3]; __m128* _q3 = (__m128*)q[3]; ,
 __m128* _p4 = (__m128*)p[4]; __m128* _q4 = (__m128*)q[4]; ,
 __m128* _p5 = (__m128*)p[5]; __m128* _q5 = (__m128*)q[5]; ,
 __m128* _p6 = (__m128*)p[6]; __m128* _q6 = (__m128*)q[6]; ,
 __m128* _p7 = (__m128*)p[7]; __m128* _q7 = (__m128*)q[7]; )

 // reinterpret the destination arrays the same way
 NETX(__m128* _u0 = (__m128*)u[0]; __m128* _v0 = (__m128*)v[0]; ,
 __m128* _u1 = (__m128*)u[1]; __m128* _v1 = (__m128*)v[1]; ,
 __m128* _u2 = (__m128*)u[2]; __m128* _v2 = (__m128*)v[2]; ,
 __m128* _u3 = (__m128*)u[3]; __m128* _v3 = (__m128*)v[3]; ,
 __m128* _u4 = (__m128*)u[4]; __m128* _v4 = (__m128*)v[4]; ,
 __m128* _u5 = (__m128*)u[5]; __m128* _v5 = (__m128*)v[5]; ,
 __m128* _u6 = (__m128*)u[6]; __m128* _v6 = (__m128*)v[6]; ,
 __m128* _u7 = (__m128*)u[7]; __m128* _v7 = (__m128*)v[7]; )

 // copy 4 floats per detector per iteration
 for(int i=0; i<I; i+=4) {
  NETX(*_u0++ = *_p0++; *_v0++ = *_q0++; ,
  *_u1++ = *_p1++; *_v1++ = *_q1++; ,
  *_u2++ = *_p2++; *_v2++ = *_q2++; ,
  *_u3++ = *_p3++; *_v3++ = *_q3++; ,
  *_u4++ = *_p4++; *_v4++ = *_q4++; ,
  *_u5++ = *_p5++; *_v5++ = *_q5++; ,
  *_u6++ = *_p6++; *_v6++ = *_q6++; ,
  *_u7++ = *_p7++; *_v7++ = *_q7++; )
 }
 return;
}
63 
64 
static inline float _avx_dpf_ps(double** Fp, double** Fx, int l,
                                std::vector<float*> &pAPN,
                                std::vector<float*> &pAVX, int I) {
// calculate dpf sin and cos and antenna f+ and fx amplitudes
// Fp, Fx - array of pointers for antenna patterns (one per detector)
// l - sky location index (selects Fp[k][l], Fx[k][l])
// pAPN - per-detector arrays: [0,I) holds f+, [I,2I) holds fx, [2I,...) noise rms
//        (layout inferred from the pointer arithmetic below -- TODO confirm)
// pAVX - vector with pixel statistics; slots used here:
//        [2]=|f+|^2, [3]=|fx|^2, [4]=sin, [5]=cos, [18]=network index
// I - number of pixels; i+=4 stride implies I is a multiple of 4 and 16-byte alignment
// in likelihoodWP these arrays should be stored exactly in the same order.
// this function increments pointers stored in the input pointer arrays.
// returns sqrt( sum(|fx|^2/ni) / (pixel count + 0.01) )
 __m128* _fp = (__m128*)pAVX[2];   // |f+|^2 per pixel (output)
 __m128* _fx = (__m128*)pAVX[3];   // |fx|^2 per pixel (output)
 __m128* _si = (__m128*)pAVX[4];   // DPF rotation sin (output)
 __m128* _co = (__m128*)pAVX[5];   // DPF rotation cos (output)
 __m128* _ni = (__m128*)pAVX[18];  // network index (output)
 __m128 _ff,_FF,_fF,_AP,_cc,_ss,_nn,_cn,_sn;   // _cn,_sn declared but not used here
 static const __m128 _0 = _mm_set1_ps(0);
 static const __m128 _1 = _mm_set1_ps(1);
 static const __m128 _2 = _mm_set1_ps(2);
 static const __m128 _o = _mm_set1_ps(0.0001);   // epsilon guarding divisions
 static const __m128 sm = _mm_set1_ps(-0.f);     // -0.f = 1 << 31 (sign-bit mask; unused in this function)
 __m128 _NI= _mm_setzero_ps();                   // accumulates |fx|^2/ni
 __m128 _NN= _mm_setzero_ps();                   // accumulates pixel count

 // per-detector f+, fx and rms streams inside pAPN (see layout above)
 NETX(__m128* _f0=(__m128*)pAPN[0]; __m128* _F0=(__m128*)(pAPN[0]+I); __m128* _r0=(__m128*)(pAPN[0]+2*I);,
 __m128* _f1=(__m128*)pAPN[1]; __m128* _F1=(__m128*)(pAPN[1]+I); __m128* _r1=(__m128*)(pAPN[1]+2*I);,
 __m128* _f2=(__m128*)pAPN[2]; __m128* _F2=(__m128*)(pAPN[2]+I); __m128* _r2=(__m128*)(pAPN[2]+2*I);,
 __m128* _f3=(__m128*)pAPN[3]; __m128* _F3=(__m128*)(pAPN[3]+I); __m128* _r3=(__m128*)(pAPN[3]+2*I);,
 __m128* _f4=(__m128*)pAPN[4]; __m128* _F4=(__m128*)(pAPN[4]+I); __m128* _r4=(__m128*)(pAPN[4]+2*I);,
 __m128* _f5=(__m128*)pAPN[5]; __m128* _F5=(__m128*)(pAPN[5]+I); __m128* _r5=(__m128*)(pAPN[5]+2*I);,
 __m128* _f6=(__m128*)pAPN[6]; __m128* _F6=(__m128*)(pAPN[6]+I); __m128* _r6=(__m128*)(pAPN[6]+2*I);,
 __m128* _f7=(__m128*)pAPN[7]; __m128* _F7=(__m128*)(pAPN[7]+I); __m128* _r7=(__m128*)(pAPN[7]+2*I);)

 // broadcast the sky-location antenna-pattern scalars (double -> float)
 NETX(__m128 _Fp0 = _mm_set1_ps(*(Fp[0]+l)); __m128 _Fx0 = _mm_set1_ps(*(Fx[0]+l)); ,
 __m128 _Fp1 = _mm_set1_ps(*(Fp[1]+l)); __m128 _Fx1 = _mm_set1_ps(*(Fx[1]+l)); ,
 __m128 _Fp2 = _mm_set1_ps(*(Fp[2]+l)); __m128 _Fx2 = _mm_set1_ps(*(Fx[2]+l)); ,
 __m128 _Fp3 = _mm_set1_ps(*(Fp[3]+l)); __m128 _Fx3 = _mm_set1_ps(*(Fx[3]+l)); ,
 __m128 _Fp4 = _mm_set1_ps(*(Fp[4]+l)); __m128 _Fx4 = _mm_set1_ps(*(Fx[4]+l)); ,
 __m128 _Fp5 = _mm_set1_ps(*(Fp[5]+l)); __m128 _Fx5 = _mm_set1_ps(*(Fx[5]+l)); ,
 __m128 _Fp6 = _mm_set1_ps(*(Fp[6]+l)); __m128 _Fx6 = _mm_set1_ps(*(Fx[6]+l)); ,
 __m128 _Fp7 = _mm_set1_ps(*(Fp[7]+l)); __m128 _Fx7 = _mm_set1_ps(*(Fx[7]+l)); )

 // overall sign convention: sign of sum(F+*Fx) over detectors with nonzero rms
 float sign=0;
 NETX(if(*(pAPN[0]+2*I)>0.) sign+=*(Fp[0]+l) * *(Fx[0]+l);,
 if(*(pAPN[1]+2*I)>0.) sign+=*(Fp[1]+l) * *(Fx[1]+l);,
 if(*(pAPN[2]+2*I)>0.) sign+=*(Fp[2]+l) * *(Fx[2]+l);,
 if(*(pAPN[3]+2*I)>0.) sign+=*(Fp[3]+l) * *(Fx[3]+l);,
 if(*(pAPN[4]+2*I)>0.) sign+=*(Fp[4]+l) * *(Fx[4]+l);,
 if(*(pAPN[5]+2*I)>0.) sign+=*(Fp[5]+l) * *(Fx[5]+l);,
 if(*(pAPN[6]+2*I)>0.) sign+=*(Fp[6]+l) * *(Fx[6]+l);,
 if(*(pAPN[7]+2*I)>0.) sign+=*(Fp[7]+l) * *(Fx[7]+l);)
 sign = sign>0 ? 1. : -1.;
 __m128 _sign = _mm_set1_ps(sign);   // NOTE(review): _sign is computed but not applied below -- confirm intended

 for(int i=0; i<I; i+=4) {
  // scale antenna patterns by detector rms and accumulate (f,f), (F,F), (f,F)
  NETX(
  *_f0 = _mm_mul_ps(*_r0,_Fp0); *_F0 = _mm_mul_ps(*_r0++,_Fx0);
  _ff = _mm_mul_ps(*_f0,*_f0);
  _FF = _mm_mul_ps(*_F0,*_F0);
  _fF = _mm_mul_ps(*_f0,*_F0); ,

  *_f1 = _mm_mul_ps(*_r1,_Fp1); *_F1 = _mm_mul_ps(*_r1++,_Fx1);
  _ff = _mm_add_ps(_ff,_mm_mul_ps(*_f1,*_f1));
  _FF = _mm_add_ps(_FF,_mm_mul_ps(*_F1,*_F1));
  _fF = _mm_add_ps(_fF,_mm_mul_ps(*_f1,*_F1)); ,

  *_f2 = _mm_mul_ps(*_r2,_Fp2); *_F2 = _mm_mul_ps(*_r2++,_Fx2);
  _ff = _mm_add_ps(_ff,_mm_mul_ps(*_f2,*_f2));
  _FF = _mm_add_ps(_FF,_mm_mul_ps(*_F2,*_F2));
  _fF = _mm_add_ps(_fF,_mm_mul_ps(*_f2,*_F2)); ,

  *_f3 = _mm_mul_ps(*_r3,_Fp3); *_F3 = _mm_mul_ps(*_r3++,_Fx3);
  _ff = _mm_add_ps(_ff,_mm_mul_ps(*_f3,*_f3));
  _FF = _mm_add_ps(_FF,_mm_mul_ps(*_F3,*_F3));
  _fF = _mm_add_ps(_fF,_mm_mul_ps(*_f3,*_F3)); ,

  *_f4 = _mm_mul_ps(*_r4,_Fp4); *_F4 = _mm_mul_ps(*_r4++,_Fx4);
  _ff = _mm_add_ps(_ff,_mm_mul_ps(*_f4,*_f4));
  _FF = _mm_add_ps(_FF,_mm_mul_ps(*_F4,*_F4));
  _fF = _mm_add_ps(_fF,_mm_mul_ps(*_f4,*_F4)); ,

  *_f5 = _mm_mul_ps(*_r5,_Fp5); *_F5 = _mm_mul_ps(*_r5++,_Fx5);
  _ff = _mm_add_ps(_ff,_mm_mul_ps(*_f5,*_f5));
  _FF = _mm_add_ps(_FF,_mm_mul_ps(*_F5,*_F5));
  _fF = _mm_add_ps(_fF,_mm_mul_ps(*_f5,*_F5)); ,

  *_f6 = _mm_mul_ps(*_r6,_Fp6); *_F6 = _mm_mul_ps(*_r6++,_Fx6);
  _ff = _mm_add_ps(_ff,_mm_mul_ps(*_f6,*_f6));
  _FF = _mm_add_ps(_FF,_mm_mul_ps(*_F6,*_F6));
  _fF = _mm_add_ps(_fF,_mm_mul_ps(*_f6,*_F6)); ,

  *_f7 = _mm_mul_ps(*_r7,_Fp7); *_F7 = _mm_mul_ps(*_r7++,_Fx7);
  _ff = _mm_add_ps(_ff,_mm_mul_ps(*_f7,*_f7));
  _FF = _mm_add_ps(_FF,_mm_mul_ps(*_F7,*_F7));
  _fF = _mm_add_ps(_fF,_mm_mul_ps(*_f7,*_F7)); )

  // solve the DPF rotation: diagonalize the 2x2 [(f,f) (f,F); (f,F) (F,F)] matrix
  *_si = _mm_mul_ps(_fF,_2);                              // rotation 2*sin*cos*norm
  *_co = _mm_sub_ps(_ff,_FF);                             // rotation (cos^2-sin^2)*norm
  _AP = _mm_add_ps(_ff,_FF);                              // total antenna norm
  _cc = _mm_mul_ps(*_co,*_co);
  _ss = _mm_mul_ps(*_si,*_si);
  _nn = _mm_sqrt_ps(_mm_add_ps(_cc,_ss));                 // co/si norm
  *_fp = _mm_div_ps(_mm_add_ps(_AP,_nn),_2);              // |f+|^2
  _cc = _mm_div_ps(*_co,_mm_add_ps(_nn,_o));              // cos(2p)
  _nn = _mm_and_ps(_mm_cmpgt_ps(*_si,_0),_1);             // 1 if sin(2p)>0. or 0 if sin(2p)<0.
  _ss = _mm_sub_ps(_mm_mul_ps(_2,_nn),_1);                // 1 if sin(2p)>0. or-1 if sin(2p)<0.
  *_si = _mm_sqrt_ps(_mm_div_ps(_mm_sub_ps(_1,_cc),_2));  // |sin(p)| via half-angle identity
  *_co = _mm_sqrt_ps(_mm_div_ps(_mm_add_ps(_1,_cc),_2));  // |cos(p)| via half-angle identity
  *_co = _mm_mul_ps(*_co,_ss);                            // cos(p) with restored sign

// DPF antenna patterns: rotate (f,F) by angle p; also accumulate f^4 and f*F
  NETX(
  _ff = _mm_add_ps(_mm_mul_ps(*_f0,*_co),_mm_mul_ps(*_F0,*_si)); _cc = _mm_mul_ps(_ff,_ff);  // f+
  _FF = _mm_sub_ps(_mm_mul_ps(*_F0,*_co),_mm_mul_ps(*_f0,*_si)); *_f0 = _ff; *_F0 = _FF;     // fx
  _nn = _mm_mul_ps(_cc,_cc); _fF = _mm_mul_ps(_ff,_FF);,                                     // ni

  _ff = _mm_add_ps(_mm_mul_ps(*_f1,*_co),_mm_mul_ps(*_F1,*_si)); _cc = _mm_mul_ps(_ff,_ff);  // f+
  _FF = _mm_sub_ps(_mm_mul_ps(*_F1,*_co),_mm_mul_ps(*_f1,*_si)); *_f1 = _ff; *_F1 = _FF;     // fx
  _nn = _mm_add_ps(_nn,_mm_mul_ps(_cc,_cc)); _fF = _mm_add_ps(_fF,_mm_mul_ps(_ff,_FF));,     // ni

  _ff = _mm_add_ps(_mm_mul_ps(*_f2,*_co),_mm_mul_ps(*_F2,*_si)); _cc = _mm_mul_ps(_ff,_ff);  // f+
  _FF = _mm_sub_ps(_mm_mul_ps(*_F2,*_co),_mm_mul_ps(*_f2,*_si)); *_f2 = _ff; *_F2 = _FF;     // fx
  _nn = _mm_add_ps(_nn,_mm_mul_ps(_cc,_cc)); _fF = _mm_add_ps(_fF,_mm_mul_ps(_ff,_FF));,     // ni

  _ff = _mm_add_ps(_mm_mul_ps(*_f3,*_co),_mm_mul_ps(*_F3,*_si)); _cc = _mm_mul_ps(_ff,_ff);  // f+
  _FF = _mm_sub_ps(_mm_mul_ps(*_F3,*_co),_mm_mul_ps(*_f3,*_si)); *_f3 = _ff; *_F3 = _FF;     // fx
  _nn = _mm_add_ps(_nn,_mm_mul_ps(_cc,_cc)); _fF = _mm_add_ps(_fF,_mm_mul_ps(_ff,_FF));,     // ni

  _ff = _mm_add_ps(_mm_mul_ps(*_f4,*_co),_mm_mul_ps(*_F4,*_si)); _cc = _mm_mul_ps(_ff,_ff);  // f+
  _FF = _mm_sub_ps(_mm_mul_ps(*_F4,*_co),_mm_mul_ps(*_f4,*_si)); *_f4 = _ff; *_F4 = _FF;     // fx
  _nn = _mm_add_ps(_nn,_mm_mul_ps(_cc,_cc)); _fF = _mm_add_ps(_fF,_mm_mul_ps(_ff,_FF));,     // ni

  _ff = _mm_add_ps(_mm_mul_ps(*_f5,*_co),_mm_mul_ps(*_F5,*_si)); _cc = _mm_mul_ps(_ff,_ff);  // f+
  _FF = _mm_sub_ps(_mm_mul_ps(*_F5,*_co),_mm_mul_ps(*_f5,*_si)); *_f5 = _ff; *_F5 = _FF;     // fx
  _nn = _mm_add_ps(_nn,_mm_mul_ps(_cc,_cc)); _fF = _mm_add_ps(_fF,_mm_mul_ps(_ff,_FF));,     // ni

  _ff = _mm_add_ps(_mm_mul_ps(*_f6,*_co),_mm_mul_ps(*_F6,*_si)); _cc = _mm_mul_ps(_ff,_ff);  // f+
  _FF = _mm_sub_ps(_mm_mul_ps(*_F6,*_co),_mm_mul_ps(*_f6,*_si)); *_f6 = _ff; *_F6 = _FF;     // fx
  _nn = _mm_add_ps(_nn,_mm_mul_ps(_cc,_cc)); _fF = _mm_add_ps(_fF,_mm_mul_ps(_ff,_FF));,     // ni

  _ff = _mm_add_ps(_mm_mul_ps(*_f7,*_co),_mm_mul_ps(*_F7,*_si)); _cc = _mm_mul_ps(_ff,_ff);  // f+
  _FF = _mm_sub_ps(_mm_mul_ps(*_F7,*_co),_mm_mul_ps(*_f7,*_si)); *_f7 = _ff; *_F7 = _FF;     // fx
  _nn = _mm_add_ps(_nn,_mm_mul_ps(_cc,_cc)); _fF = _mm_add_ps(_fF,_mm_mul_ps(_ff,_FF));)     // ni

  _fF = _mm_div_ps(_fF,_mm_add_ps(*_fp,_o));   // residual (f+,fx)/|f+|^2 projection coefficient

  // Gram-Schmidt: remove the f+ component from fx and accumulate |fx|^2
  NETX(
  *_F0 = _mm_sub_ps(*_F0,_mm_mul_ps(*_f0++,_fF)); *_fx = _mm_mul_ps(*_F0,*_F0); _F0++;,
  *_F1 = _mm_sub_ps(*_F1,_mm_mul_ps(*_f1++,_fF)); *_fx = _mm_add_ps(*_fx,_mm_mul_ps(*_F1,*_F1)); _F1++;,
  *_F2 = _mm_sub_ps(*_F2,_mm_mul_ps(*_f2++,_fF)); *_fx = _mm_add_ps(*_fx,_mm_mul_ps(*_F2,*_F2)); _F2++;,
  *_F3 = _mm_sub_ps(*_F3,_mm_mul_ps(*_f3++,_fF)); *_fx = _mm_add_ps(*_fx,_mm_mul_ps(*_F3,*_F3)); _F3++;,
  *_F4 = _mm_sub_ps(*_F4,_mm_mul_ps(*_f4++,_fF)); *_fx = _mm_add_ps(*_fx,_mm_mul_ps(*_F4,*_F4)); _F4++;,
  *_F5 = _mm_sub_ps(*_F5,_mm_mul_ps(*_f5++,_fF)); *_fx = _mm_add_ps(*_fx,_mm_mul_ps(*_F5,*_F5)); _F5++;,
  *_F6 = _mm_sub_ps(*_F6,_mm_mul_ps(*_f6++,_fF)); *_fx = _mm_add_ps(*_fx,_mm_mul_ps(*_F6,*_F6)); _F6++;,
  *_F7 = _mm_sub_ps(*_F7,_mm_mul_ps(*_f7++,_fF)); *_fx = _mm_add_ps(*_fx,_mm_mul_ps(*_F7,*_F7)); _F7++;)

  *_ni = _mm_div_ps(_nn,_mm_add_ps(_mm_mul_ps(*_fp,*_fp),_o));  // network index
  _ff = _mm_add_ps(_mm_mul_ps(_1,*_ni),_o);                     // network index
  _NI = _mm_add_ps(_NI,(_mm_div_ps(*_fx,_ff)));                 // sum of |fx|^2/2/ni
  _NN = _mm_add_ps(_NN,_mm_and_ps(_mm_cmpgt_ps(*_fp,_0),_1));   // pixel count

  _fp++; _fx++; _si++; _co++; _ni++;
 }
 return sqrt(_wat_hsum(_NI)/(_wat_hsum(_NN)+0.01));
}
230 
231 
232 static inline float _avx_GW_ps(float** p, float ** q,
233  std::vector<float*> &pAPN, float* rr,
234  std::vector<float*> &pAVX, int II) {
235 // initialize GW strain amplitudes and impose regulator
236 // p,q - input - data vector
237 // pAVX - pixel statistics
238 // J - number of AVX pixels
239 // in likelihoodWP these arrays should be stored exactly in the same order.
240 // this function updates the definition of the mask array:
241 
242  int I = abs(II);
243 
244  __m128* _et = (__m128*)pAVX[0];
245  __m128* _mk = (__m128*)pAVX[1];
246  __m128* _fp = (__m128*)pAVX[2];
247  __m128* _fx = (__m128*)pAVX[3];
248  __m128* _au = (__m128*)pAVX[10];
249  __m128* _AU = (__m128*)pAVX[11];
250  __m128* _av = (__m128*)pAVX[12];
251  __m128* _AV = (__m128*)pAVX[13];
252  __m128* _ni = (__m128*)pAVX[18];
253 
254  __m128 _uu = _mm_setzero_ps();
255  __m128 _UU = _mm_setzero_ps();
256  __m128 _uU = _mm_setzero_ps();
257  __m128 _vv = _mm_setzero_ps();
258  __m128 _VV = _mm_setzero_ps();
259  __m128 _vV = _mm_setzero_ps();
260  __m128 _NN = _mm_setzero_ps();
261 
262  __m128 _h,_H,_f,_F,_R,_a,_nn,_m,_ff,_xp,_XP,_xx,_XX;
263  float cu,su,cv,sv,cc,ss,nn,uu,UU,vv,VV,et,ET;
264  static const __m128 _0 = _mm_set1_ps(0);
265  static const __m128 _1 = _mm_set1_ps(1);
266  static const __m128 o1 = _mm_set1_ps(0.1);
267  static const __m128 _o = _mm_set1_ps(1.e-5);
268  __m128 _rr = _mm_set1_ps(rr[0]);
269  __m128 _RR = _mm_set1_ps(rr[1]);
270 
271  // pointers to antenna patterns
272  NETX(__m128* _f0=(__m128*)pAPN[0]; __m128* _F0=(__m128*)(pAPN[0]+I);,
273  __m128* _f1=(__m128*)pAPN[1]; __m128* _F1=(__m128*)(pAPN[1]+I);,
274  __m128* _f2=(__m128*)pAPN[2]; __m128* _F2=(__m128*)(pAPN[2]+I);,
275  __m128* _f3=(__m128*)pAPN[3]; __m128* _F3=(__m128*)(pAPN[3]+I);,
276  __m128* _f4=(__m128*)pAPN[4]; __m128* _F4=(__m128*)(pAPN[4]+I);,
277  __m128* _f5=(__m128*)pAPN[5]; __m128* _F5=(__m128*)(pAPN[5]+I);,
278  __m128* _f6=(__m128*)pAPN[6]; __m128* _F6=(__m128*)(pAPN[6]+I);,
279  __m128* _f7=(__m128*)pAPN[7]; __m128* _F7=(__m128*)(pAPN[7]+I);)
280 
281  // pointers to data
282  NETX(__m128* _p0 = (__m128*)p[0]; __m128* _q0 = (__m128*)q[0];,
283  __m128* _p1 = (__m128*)p[1]; __m128* _q1 = (__m128*)q[1];,
284  __m128* _p2 = (__m128*)p[2]; __m128* _q2 = (__m128*)q[2];,
285  __m128* _p3 = (__m128*)p[3]; __m128* _q3 = (__m128*)q[3];,
286  __m128* _p4 = (__m128*)p[4]; __m128* _q4 = (__m128*)q[4];,
287  __m128* _p5 = (__m128*)p[5]; __m128* _q5 = (__m128*)q[5];,
288  __m128* _p6 = (__m128*)p[6]; __m128* _q6 = (__m128*)q[6];,
289  __m128* _p7 = (__m128*)p[7]; __m128* _q7 = (__m128*)q[7];)
290 
291  for(int i=0; i<I; i+=4) { // calculate scalar products
292 
293  NETX(
294  _xp = _mm_mul_ps(*_f0,*_p0); // (x,f+)
295  _XP = _mm_mul_ps(*_f0,*_q0); // (X,f+)
296  _xx = _mm_mul_ps(*_F0,*_p0); // (x,fx)
297  _XX = _mm_mul_ps(*_F0,*_q0); , // (X,fx)
298 
299  _xp = _mm_add_ps(_xp,_mm_mul_ps(*_f1,*_p1)); // (x,f+)
300  _XP = _mm_add_ps(_XP,_mm_mul_ps(*_f1,*_q1)); // (X,f+)
301  _xx = _mm_add_ps(_xx,_mm_mul_ps(*_F1,*_p1)); // (x,fx)
302  _XX = _mm_add_ps(_XX,_mm_mul_ps(*_F1,*_q1)); , // (X,fx)
303 
304  _xp = _mm_add_ps(_xp,_mm_mul_ps(*_f2,*_p2)); // (x,f+)
305  _XP = _mm_add_ps(_XP,_mm_mul_ps(*_f2,*_q2)); // (X,f+)
306  _xx = _mm_add_ps(_xx,_mm_mul_ps(*_F2,*_p2)); // (x,fx)
307  _XX = _mm_add_ps(_XX,_mm_mul_ps(*_F2,*_q2)); , // (X,fx)
308 
309  _xp = _mm_add_ps(_xp,_mm_mul_ps(*_f3,*_p3)); // (x,f+)
310  _XP = _mm_add_ps(_XP,_mm_mul_ps(*_f3,*_q3)); // (X,f+)
311  _xx = _mm_add_ps(_xx,_mm_mul_ps(*_F3,*_p3)); // (x,fx)
312  _XX = _mm_add_ps(_XX,_mm_mul_ps(*_F3,*_q3)); , // (X,fx)
313 
314  _xp = _mm_add_ps(_xp,_mm_mul_ps(*_f4,*_p4)); // (x,f+)
315  _XP = _mm_add_ps(_XP,_mm_mul_ps(*_f4,*_q4)); // (X,f+)
316  _xx = _mm_add_ps(_xx,_mm_mul_ps(*_F4,*_p4)); // (x,fx)
317  _XX = _mm_add_ps(_XX,_mm_mul_ps(*_F4,*_q4)); , // (X,fx)
318 
319  _xp = _mm_add_ps(_xp,_mm_mul_ps(*_f5,*_p5)); // (x,f+)
320  _XP = _mm_add_ps(_XP,_mm_mul_ps(*_f5,*_q5)); // (X,f+)
321  _xx = _mm_add_ps(_xx,_mm_mul_ps(*_F5,*_p5)); // (x,fx)
322  _XX = _mm_add_ps(_XX,_mm_mul_ps(*_F5,*_q5)); , // (X,fx)
323 
324  _xp = _mm_add_ps(_xp,_mm_mul_ps(*_f6,*_p6)); // (x,f+)
325  _XP = _mm_add_ps(_XP,_mm_mul_ps(*_f6,*_q6)); // (X,f+)
326  _xx = _mm_add_ps(_xx,_mm_mul_ps(*_F6,*_p6)); // (x,fx)
327  _XX = _mm_add_ps(_XX,_mm_mul_ps(*_F6,*_q6)); , // (X,fx)
328 
329  _xp = _mm_add_ps(_xp,_mm_mul_ps(*_f7,*_p7)); // (x,f+)
330  _XP = _mm_add_ps(_XP,_mm_mul_ps(*_f7,*_q7)); // (X,f+)
331  _xx = _mm_add_ps(_xx,_mm_mul_ps(*_F7,*_p7)); // (x,fx)
332  _XX = _mm_add_ps(_XX,_mm_mul_ps(*_F7,*_q7)); ) // (X,fx)
333 
334 // regulator
335 
336  _f = _mm_add_ps(_mm_mul_ps(_xp,_xp),_mm_mul_ps(_XP,_XP)); // f=(x,f+)^2+(X,f+)^2
337  _f = _mm_div_ps(_f,_mm_add_ps(*_et,_o)); // f=f/[ni*(|x|^2+|X|^2)]
338  _f = _mm_sqrt_ps(_mm_mul_ps(*_ni,_f)); // f=ni*sqrt[(x,f+)^2+(X,f+)^2/(|x|^2+|X|^2)]
339  _f = _mm_sub_ps(_mm_mul_ps(_f,_rr),*_fp); // f*R-|f+|^2
340  _f = _mm_mul_ps(_f,_mm_and_ps(_mm_cmpgt_ps(_f,_0),_1)); // f if f*R>|f+|^2 or 0 if f*R<|f+|^2
341  _f = _mm_div_ps(*_mk,_mm_add_ps(*_fp,_mm_add_ps(_o,_f))); // 1 / {|f+|^2+epsilon}
342 
343  _h = _mm_mul_ps(_xp,_f); // use inequality
344  _H = _mm_mul_ps(_XP,_f); // fx^2 *(s^2+e^2*c^2)/(1+e^2) < fx^2
345  _h = _mm_add_ps(_mm_mul_ps(_h,_h),_mm_mul_ps(_H,_H));
346  _H = _mm_add_ps(_mm_mul_ps(_xx,_xx),_mm_mul_ps(_XX,_XX)); // (x,fx)^2+(X,fx)^2
347  _F = _mm_sqrt_ps(_mm_div_ps(_H,_mm_add_ps(_h,_o)));
348  _R = _mm_add_ps(o1,_mm_div_ps(_RR,_mm_add_ps(*_et,_o))); // dynamic x-regulator
349  _F = _mm_sub_ps(_mm_mul_ps(_F,_R),*_fx); // F*R-|fx|^2
350  _F = _mm_mul_ps(_F,_mm_and_ps(_mm_cmpgt_ps(_F,_0),_1)); // F*R-|fx|^2 if F*R>|fx|^2 or 0 if F*R<|fx|^2
351  _F = _mm_div_ps(*_mk,_mm_add_ps(*_fx,_mm_add_ps(_o,_F))); // 1/ {|fx|^2+epsilon}
352 
353  *_au = _mm_mul_ps(_xp,_f);
354  *_AU = _mm_mul_ps(_XP,_f);
355  *_av = _mm_mul_ps(_xx,_F);
356  *_AV = _mm_mul_ps(_XX,_F);
357 
358  _a = _mm_add_ps(_mm_mul_ps(_f,*_fp),_mm_mul_ps(_F,*_fx)); // Gaussin noise correction
359  _NN = _mm_add_ps(_NN,*_mk); // number of pixels
360  *_mk = _mm_add_ps(_a,_mm_sub_ps(*_mk,_1)); // -1 - rejected, >=0 accepted
361 
362  _mk++; _et++; _fp++; _fx++; _ni++;
363 
364 // set GW amplitudes in wavelet domain
365  NETX(*_p0++ = _mm_add_ps(_mm_mul_ps(*_f0, *_au),_mm_mul_ps(*_F0 ,*_av));
366  *_q0++ = _mm_add_ps(_mm_mul_ps(*_f0++,*_AU),_mm_mul_ps(*_F0++,*_AV)); ,
367  *_p1++ = _mm_add_ps(_mm_mul_ps(*_f1 ,*_au),_mm_mul_ps(*_F1 ,*_av));
368  *_q1++ = _mm_add_ps(_mm_mul_ps(*_f1++,*_AU),_mm_mul_ps(*_F1++,*_AV)); ,
369  *_p2++ = _mm_add_ps(_mm_mul_ps(*_f2 ,*_au),_mm_mul_ps(*_F2 ,*_av));
370  *_q2++ = _mm_add_ps(_mm_mul_ps(*_f2++,*_AU),_mm_mul_ps(*_F2++,*_AV)); ,
371  *_p3++ = _mm_add_ps(_mm_mul_ps(*_f3 ,*_au),_mm_mul_ps(*_F3 ,*_av));
372  *_q3++ = _mm_add_ps(_mm_mul_ps(*_f3++,*_AU),_mm_mul_ps(*_F3++,*_AV)); ,
373  *_p4++ = _mm_add_ps(_mm_mul_ps(*_f4 ,*_au),_mm_mul_ps(*_F4 ,*_av));
374  *_q4++ = _mm_add_ps(_mm_mul_ps(*_f4++,*_AU),_mm_mul_ps(*_F4++,*_AV)); ,
375  *_p5++ = _mm_add_ps(_mm_mul_ps(*_f5 ,*_au),_mm_mul_ps(*_F5 ,*_av));
376  *_q5++ = _mm_add_ps(_mm_mul_ps(*_f5++,*_AU),_mm_mul_ps(*_F5++,*_AV)); ,
377  *_p6++ = _mm_add_ps(_mm_mul_ps(*_f6 ,*_au),_mm_mul_ps(*_F6 ,*_av));
378  *_q6++ = _mm_add_ps(_mm_mul_ps(*_f6++,*_AU),_mm_mul_ps(*_F6++,*_AV)); ,
379  *_p7++ = _mm_add_ps(_mm_mul_ps(*_f7 ,*_au),_mm_mul_ps(*_F7 ,*_av));
380  *_q7++ = _mm_add_ps(_mm_mul_ps(*_f7++,*_AU),_mm_mul_ps(*_F7++,*_AV)); )
381 
382  _uU = _mm_add_ps(_uU,_mm_mul_ps(*_au,*_AU));
383  _vV = _mm_add_ps(_vV,_mm_mul_ps(*_av,*_AV));
384  _uu = _mm_add_ps(_uu,_mm_mul_ps(*_au,*_au)); _au++;
385  _UU = _mm_add_ps(_UU,_mm_mul_ps(*_AU,*_AU)); _AU++;
386  _vv = _mm_add_ps(_vv,_mm_mul_ps(*_av,*_av)); _av++;
387  _VV = _mm_add_ps(_VV,_mm_mul_ps(*_AV,*_AV)); _AV++;
388  }
389  su = _wat_hsum(_uU); sv = _wat_hsum(_vV); // rotation sin*cos*norm
390  uu = _wat_hsum(_uu); vv = _wat_hsum(_vv); // 00 energy
391  UU = _wat_hsum(_UU); VV = _wat_hsum(_VV); // 90 energy
392 
393 // first packet
394  nn = sqrt((uu-UU)*(uu-UU)+4*su*su)+0.0001; // co/si norm
395  cu = (uu-UU)/nn; et=uu+UU; // rotation cos(2p) and sin(2p)
396  uu = sqrt((et+nn)/2); UU = et>nn?sqrt((et-nn)/2):0; // amplitude of first/second component
397  nn = su>0 ? 1 : -1; // norm^2 of 2*cos^2 and 2*sin*cos
398  su = sqrt((1-cu)/2); cu = nn*sqrt((1+cu)/2); // normalized rotation sin/cos
399 
400 // second packet
401  nn = sqrt((vv-VV)*(vv-VV)+4*sv*sv)+0.0001; // co/si norm
402  cv = (vv-VV)/nn; ET=vv+VV; // rotation cos(2p) and sin(2p)
403  vv = sqrt((ET+nn)/2); VV = et>nn?sqrt((ET-nn)/2):0; // first/second component energy
404  nn = sv>0 ? 1 : -1; // norm^2 of 2*cos^2 and 2*sin*cos
405  sv = sqrt((1-cv)/2); cv = nn*sqrt((1+cv)/2); // normalized rotation sin/cos// first packet
406 
407  //return _mm_set_ps(ET,et,VV+vv,UU+uu); // reversed order
408  return _wat_hsum(_NN); // returns number of pixels above threshold
409 }
410 
411 
412 static inline float _avx_loadata_ps(float** p, float** q,
413  float** u, float** v, float En,
414  std::vector<float*> &pAVX, int I) {
415 // load data vectors p,q into temporary arrays u,v
416 // calculate total energy of network pixels stored in pAVX[5]
417 // apply energy threshold En, initialize pixel mask stored in pAVX[24]
418 // returns total energy
419 // in likelihoodWP these arrays should be stored exactly in the same order.
420 // this function increments pointers stored in tne input pointer arrays.
421  __m128 _aa, _AA, _aA;
422 
423  static const __m128 _1 = _mm_set1_ps(1);
424  static const __m128 _o = _mm_set1_ps(1.e-12);
425  static const __m128 sm = _mm_set1_ps(-0.f); // -0.f = 1 << 31
426  static const __m128 _En= _mm_set1_ps(En);
427 
428  __m128* _et = (__m128*)pAVX[0];
429  __m128* _mk = (__m128*)pAVX[1];
430  __m128 _ee = _mm_setzero_ps();
431  __m128 _EE = _mm_setzero_ps();
432  __m128 _NN = _mm_setzero_ps();
433 
434  NETX(__m128* _p0 = (__m128*)p[0]; __m128* _q0 = (__m128*)q[0]; ,
435  __m128* _p1 = (__m128*)p[1]; __m128* _q1 = (__m128*)q[1]; ,
436  __m128* _p2 = (__m128*)p[2]; __m128* _q2 = (__m128*)q[2]; ,
437  __m128* _p3 = (__m128*)p[3]; __m128* _q3 = (__m128*)q[3]; ,
438  __m128* _p4 = (__m128*)p[4]; __m128* _q4 = (__m128*)q[4]; ,
439  __m128* _p5 = (__m128*)p[5]; __m128* _q5 = (__m128*)q[5]; ,
440  __m128* _p6 = (__m128*)p[6]; __m128* _q6 = (__m128*)q[6]; ,
441  __m128* _p7 = (__m128*)p[7]; __m128* _q7 = (__m128*)q[7]; )
442 
443  NETX(__m128* _u0 = (__m128*)u[0]; __m128* _v0 = (__m128*)v[0]; ,
444  __m128* _u1 = (__m128*)u[1]; __m128* _v1 = (__m128*)v[1]; ,
445  __m128* _u2 = (__m128*)u[2]; __m128* _v2 = (__m128*)v[2]; ,
446  __m128* _u3 = (__m128*)u[3]; __m128* _v3 = (__m128*)v[3]; ,
447  __m128* _u4 = (__m128*)u[4]; __m128* _v4 = (__m128*)v[4]; ,
448  __m128* _u5 = (__m128*)u[5]; __m128* _v5 = (__m128*)v[5]; ,
449  __m128* _u6 = (__m128*)u[6]; __m128* _v6 = (__m128*)v[6]; ,
450  __m128* _u7 = (__m128*)u[7]; __m128* _v7 = (__m128*)v[7]; )
451 
452  for(int i=0; i<I; i+=4) {
453  NETX(
454  _aa = _mm_mul_ps(*_p0,*_p0); *_u0++ = *_p0++;
455  _AA = _mm_mul_ps(*_q0,*_q0); *_v0++ = *_q0++; ,
456  _aa = _mm_add_ps(_aa,_mm_mul_ps(*_p1,*_p1)); *_u1++ = *_p1++;
457  _AA = _mm_add_ps(_AA,_mm_mul_ps(*_q1,*_q1)); *_v1++ = *_q1++; ,
458  _aa = _mm_add_ps(_aa,_mm_mul_ps(*_p2,*_p2)); *_u2++ = *_p2++;
459  _AA = _mm_add_ps(_AA,_mm_mul_ps(*_q2,*_q2)); *_v2++ = *_q2++; ,
460  _aa = _mm_add_ps(_aa,_mm_mul_ps(*_p3,*_p3)); *_u3++ = *_p3++;
461  _AA = _mm_add_ps(_AA,_mm_mul_ps(*_q3,*_q3)); *_v3++ = *_q3++; ,
462  _aa = _mm_add_ps(_aa,_mm_mul_ps(*_p4,*_p4)); *_u4++ = *_p4++;
463  _AA = _mm_add_ps(_AA,_mm_mul_ps(*_q4,*_q4)); *_v4++ = *_q4++; ,
464  _aa = _mm_add_ps(_aa,_mm_mul_ps(*_p5,*_p5)); *_u5++ = *_p5++;
465  _AA = _mm_add_ps(_AA,_mm_mul_ps(*_q5,*_q5)); *_v5++ = *_q5++; ,
466  _aa = _mm_add_ps(_aa,_mm_mul_ps(*_p6,*_p6)); *_u6++ = *_p6++;
467  _AA = _mm_add_ps(_AA,_mm_mul_ps(*_q6,*_q6)); *_v6++ = *_q6++; ,
468  _aa = _mm_add_ps(_aa,_mm_mul_ps(*_p7,*_p7)); *_u7++ = *_p7++;
469  _AA = _mm_add_ps(_AA,_mm_mul_ps(*_q7,*_q7)); *_v7++ = *_q7++; )
470 
471 
472 // calculate data network orthogonalization sin and cos
473 
474  *_et = _mm_add_ps(_mm_add_ps(_aa,_AA),_o); // total energy
475  *_mk = _mm_and_ps(_mm_cmpgt_ps(*_et,_En),_1); // 1 if et>En or 0 if et<En
476  _NN = _mm_add_ps(_NN,*_mk); // total energy threshold
477  _ee = _mm_add_ps(_ee,*_et); // total energy threshold
478  *_et = _mm_mul_ps(*_et,*_mk++); // apply En threshold
479  _EE = _mm_add_ps(_EE,*_et++); // total energy above threshold
480  }
481  pAVX[1][I+1] = _wat_hsum(_NN); // store number of pixels
482  return _wat_hsum(_EE)/2; // total energy in one quadrature
483 }
484 
static inline void _avx_loadNULL_ps(float** n, float** N,
                                    float** d, float** D,
                                    float** h, float** H, int I) {
// load NULL packet amplitudes for all detectors and pixels: n = d - h, N = D - H
// n,N - output per-detector null-stream arrays (00/90 quadratures)
// d,D - input per-detector data arrays (00/90)
// h,H - input per-detector reconstructed signal arrays (00/90)
// I   - number of floats per array; i+=4 stride implies multiple of 4 and
//       16-byte alignment (cast to __m128) -- TODO confirm with callers
// these amplitudes are used for reconstruction of data time series
// NOTE(review): original comment says "now works only for <4 detector" although
// NETX branches exist for 8 -- presumably limited elsewhere; verify in wat.hh
 NETX(__m128* _n0 = (__m128*)n[0]; __m128* _N0 = (__m128*)N[0]; ,
 __m128* _n1 = (__m128*)n[1]; __m128* _N1 = (__m128*)N[1]; ,
 __m128* _n2 = (__m128*)n[2]; __m128* _N2 = (__m128*)N[2]; ,
 __m128* _n3 = (__m128*)n[3]; __m128* _N3 = (__m128*)N[3]; ,
 __m128* _n4 = (__m128*)n[4]; __m128* _N4 = (__m128*)N[4]; ,
 __m128* _n5 = (__m128*)n[5]; __m128* _N5 = (__m128*)N[5]; ,
 __m128* _n6 = (__m128*)n[6]; __m128* _N6 = (__m128*)N[6]; ,
 __m128* _n7 = (__m128*)n[7]; __m128* _N7 = (__m128*)N[7]; )

 NETX(__m128* _d0 = (__m128*)d[0]; __m128* _D0 = (__m128*)D[0]; ,
 __m128* _d1 = (__m128*)d[1]; __m128* _D1 = (__m128*)D[1]; ,
 __m128* _d2 = (__m128*)d[2]; __m128* _D2 = (__m128*)D[2]; ,
 __m128* _d3 = (__m128*)d[3]; __m128* _D3 = (__m128*)D[3]; ,
 __m128* _d4 = (__m128*)d[4]; __m128* _D4 = (__m128*)D[4]; ,
 __m128* _d5 = (__m128*)d[5]; __m128* _D5 = (__m128*)D[5]; ,
 __m128* _d6 = (__m128*)d[6]; __m128* _D6 = (__m128*)D[6]; ,
 __m128* _d7 = (__m128*)d[7]; __m128* _D7 = (__m128*)D[7]; )

 NETX(__m128* _h0 = (__m128*)h[0]; __m128* _H0 = (__m128*)H[0]; ,
 __m128* _h1 = (__m128*)h[1]; __m128* _H1 = (__m128*)H[1]; ,
 __m128* _h2 = (__m128*)h[2]; __m128* _H2 = (__m128*)H[2]; ,
 __m128* _h3 = (__m128*)h[3]; __m128* _H3 = (__m128*)H[3]; ,
 __m128* _h4 = (__m128*)h[4]; __m128* _H4 = (__m128*)H[4]; ,
 __m128* _h5 = (__m128*)h[5]; __m128* _H5 = (__m128*)H[5]; ,
 __m128* _h6 = (__m128*)h[6]; __m128* _H6 = (__m128*)H[6]; ,
 __m128* _h7 = (__m128*)h[7]; __m128* _H7 = (__m128*)H[7]; )

 // null stream: subtract reconstructed signal from data, 4 floats at a time
 for(int i=0; i<I; i+=4) {
  NETX(*_n0++ = _mm_sub_ps(*_d0++,*_h0++); *_N0++ = _mm_sub_ps(*_D0++,*_H0++); ,
  *_n1++ = _mm_sub_ps(*_d1++,*_h1++); *_N1++ = _mm_sub_ps(*_D1++,*_H1++); ,
  *_n2++ = _mm_sub_ps(*_d2++,*_h2++); *_N2++ = _mm_sub_ps(*_D2++,*_H2++); ,
  *_n3++ = _mm_sub_ps(*_d3++,*_h3++); *_N3++ = _mm_sub_ps(*_D3++,*_H3++); ,
  *_n4++ = _mm_sub_ps(*_d4++,*_h4++); *_N4++ = _mm_sub_ps(*_D4++,*_H4++); ,
  *_n5++ = _mm_sub_ps(*_d5++,*_h5++); *_N5++ = _mm_sub_ps(*_D5++,*_H5++); ,
  *_n6++ = _mm_sub_ps(*_d6++,*_h6++); *_N6++ = _mm_sub_ps(*_D6++,*_H6++); ,
  *_n7++ = _mm_sub_ps(*_d7++,*_h7++); *_N7++ = _mm_sub_ps(*_D7++,*_H7++); )
 }
 return;
}
530 
531 
532 static inline float _avx_packet_ps(float** p, float** q,
533  std::vector<float*> &pAVX, int I) {
534 // calculates packet rotation sin/cos, amplitudes and unit vectors
535 // initialize unit vector arrays stored in pAVX in format used in
536 // in likelihoodWP these arrays should be stored exactly in the same order.
537 // this function increments pointers stored in tne input pointer arrays.
538 
539  int II = abs(I)*2;
540  __m128 _a, _A, _aa, _AA, _aA, _x, _cc, _ss, _nn, _cn, _sn, _mk;
541 
542  static const __m128 _0 = _mm_set1_ps(0);
543  static const __m128 _1 = _mm_set1_ps(1);
544  static const __m128 _2 = _mm_set1_ps(2);
545  static const __m128 _o = _mm_set1_ps(0.0001);
546  static const __m128 sm = _mm_set1_ps(-0.f); // -0.f = 1 << 31
547 
548  float a[8] __attribute__((aligned(32))); __m128* _am = (__m128*)a;
549  float A[8] __attribute__((aligned(32))); __m128* _AM = (__m128*)A;
550  float s[8] __attribute__((aligned(32))); __m128* _si = (__m128*)s;
551  float c[8] __attribute__((aligned(32))); __m128* _co = (__m128*)c;
552 
553  __m128* _MK = (__m128*)(pAVX[1]);
554  __m128 aa[8], AA[8], aA[8], si[8], co[8];
555  for(int i=0; i<8; i++) {
556  aa[i] = _mm_setzero_ps();
557  AA[i] = _mm_setzero_ps();
558  aA[i] = _mm_setzero_ps();
559  }
560 
561  NETX(__m128* _p0 = (__m128*)p[0]; __m128* _q0 = (__m128*)q[0]; ,
562  __m128* _p1 = (__m128*)p[1]; __m128* _q1 = (__m128*)q[1]; ,
563  __m128* _p2 = (__m128*)p[2]; __m128* _q2 = (__m128*)q[2]; ,
564  __m128* _p3 = (__m128*)p[3]; __m128* _q3 = (__m128*)q[3]; ,
565  __m128* _p4 = (__m128*)p[4]; __m128* _q4 = (__m128*)q[4]; ,
566  __m128* _p5 = (__m128*)p[5]; __m128* _q5 = (__m128*)q[5]; ,
567  __m128* _p6 = (__m128*)p[6]; __m128* _q6 = (__m128*)q[6]; ,
568  __m128* _p7 = (__m128*)p[7]; __m128* _q7 = (__m128*)q[7]; )
569 
570  for(int i=0; i<I; i+=4) {
571  _mk = _mm_and_ps(_mm_cmpgt_ps(*_MK++,_0),_1); // event mask
572  NETX(
573  aa[0] = _mm_add_ps(aa[0],_mm_mul_ps(_mm_mul_ps(*_p0,*_p0), _mk));
574  AA[0] = _mm_add_ps(AA[0],_mm_mul_ps(_mm_mul_ps(*_q0,*_q0), _mk));
575  aA[0] = _mm_add_ps(aA[0],_mm_mul_ps(_mm_mul_ps(*_p0++,*_q0++),_mk));,
576 
577  aa[1] = _mm_add_ps(aa[1],_mm_mul_ps(_mm_mul_ps(*_p1,*_p1), _mk));
578  AA[1] = _mm_add_ps(AA[1],_mm_mul_ps(_mm_mul_ps(*_q1,*_q1), _mk));
579  aA[1] = _mm_add_ps(aA[1],_mm_mul_ps(_mm_mul_ps(*_p1++,*_q1++),_mk));,
580 
581  aa[2] = _mm_add_ps(aa[2],_mm_mul_ps(_mm_mul_ps(*_p2,*_p2), _mk));
582  AA[2] = _mm_add_ps(AA[2],_mm_mul_ps(_mm_mul_ps(*_q2,*_q2), _mk));
583  aA[2] = _mm_add_ps(aA[2],_mm_mul_ps(_mm_mul_ps(*_p2++,*_q2++),_mk));,
584 
585  aa[3] = _mm_add_ps(aa[3],_mm_mul_ps(_mm_mul_ps(*_p3,*_p3), _mk));
586  AA[3] = _mm_add_ps(AA[3],_mm_mul_ps(_mm_mul_ps(*_q3,*_q3), _mk));
587  aA[3] = _mm_add_ps(aA[3],_mm_mul_ps(_mm_mul_ps(*_p3++,*_q3++),_mk));,
588 
589  aa[4] = _mm_add_ps(aa[4],_mm_mul_ps(_mm_mul_ps(*_p4,*_p4), _mk));
590  AA[4] = _mm_add_ps(AA[4],_mm_mul_ps(_mm_mul_ps(*_q4,*_q4), _mk));
591  aA[4] = _mm_add_ps(aA[4],_mm_mul_ps(_mm_mul_ps(*_p4++,*_q4++),_mk));,
592 
593  aa[5] = _mm_add_ps(aa[5],_mm_mul_ps(_mm_mul_ps(*_p5,*_p5), _mk));
594  AA[5] = _mm_add_ps(AA[5],_mm_mul_ps(_mm_mul_ps(*_q5,*_q5), _mk));
595  aA[5] = _mm_add_ps(aA[5],_mm_mul_ps(_mm_mul_ps(*_p5++,*_q5++),_mk));,
596 
597  aa[6] = _mm_add_ps(aa[6],_mm_mul_ps(_mm_mul_ps(*_p6,*_p6), _mk));
598  AA[6] = _mm_add_ps(AA[6],_mm_mul_ps(_mm_mul_ps(*_q6,*_q6), _mk));
599  aA[6] = _mm_add_ps(aA[6],_mm_mul_ps(_mm_mul_ps(*_p6++,*_q6++),_mk));,
600 
601  aa[7] = _mm_add_ps(aa[7],_mm_mul_ps(_mm_mul_ps(*_p7,*_p7), _mk));
602  AA[7] = _mm_add_ps(AA[7],_mm_mul_ps(_mm_mul_ps(*_q7,*_q7), _mk));
603  aA[7] = _mm_add_ps(aA[7],_mm_mul_ps(_mm_mul_ps(*_p7++,*_q7++),_mk));)
604  }
605 
606 // packet amplitudes and sin/cos for detectors 0-3
607  _a = _mm_hadd_ps(aa[0],aa[1]); _A = _mm_hadd_ps(aa[2],aa[3]); _aa = _mm_hadd_ps(_a,_A);
608  _a = _mm_hadd_ps(AA[0],AA[1]); _A = _mm_hadd_ps(AA[2],AA[3]); _AA = _mm_hadd_ps(_a,_A);
609  _a = _mm_hadd_ps(aA[0],aA[1]); _A = _mm_hadd_ps(aA[2],aA[3]); _aA = _mm_hadd_ps(_a,_A);
610 
611  *_si = _mm_mul_ps(_aA,_2); // rotation 2*sin*cos*norm
612  *_co = _mm_sub_ps(_aa,_AA); // rotation (cos^2-sin^2)*norm
613  _x = _mm_add_ps(_mm_add_ps(_aa,_AA),_o); // total energy
614  _cc = _mm_mul_ps(*_co,*_co);
615  _ss = _mm_mul_ps(*_si,*_si);
616  _nn = _mm_sqrt_ps(_mm_add_ps(_cc,_ss)); // co/si norm
617  *_am = _mm_sqrt_ps(_mm_div_ps(_mm_add_ps(_x,_nn),_2)); // first component amplitude
618  *_AM = _mm_div_ps(_mm_sub_ps(_x,_nn),_2); // second component energy
619  *_AM = _mm_sqrt_ps(_mm_andnot_ps(sm,*_AM)); // make sure |AM|>0
620  _cc = _mm_div_ps(*_co,_mm_add_ps(_nn,_o)); // cos(2p)
621  _nn = _mm_and_ps(_mm_cmpgt_ps(*_si,_0),_1); // 1 if sin(2p)>0. or 0 if sin(2p)<0.
622  _ss = _mm_sub_ps(_mm_mul_ps(_2,_nn),_1); // 1 if sin(2p)>0. or-1 if sin(2p)<0.
623  *_si = _mm_sqrt_ps(_mm_div_ps(_mm_sub_ps(_1,_cc),_2)); // |sin(p)|
624  *_co = _mm_sqrt_ps(_mm_div_ps(_mm_add_ps(_1,_cc),_2)); // |cos(p)|
625  *_co = _mm_mul_ps(*_co,_ss); // cos(p)
626 
627 // packet amplitudes and sin/cos for detectors 4-7
628  _am++; _AM++; _si++; _co++;
629  _a = _mm_hadd_ps(aa[4],aa[5]); _A = _mm_hadd_ps(aa[6],aa[7]); _aa = _mm_hadd_ps(_a,_A);
630  _a = _mm_hadd_ps(AA[4],AA[5]); _A = _mm_hadd_ps(AA[6],AA[7]); _AA = _mm_hadd_ps(_a,_A);
631  _a = _mm_hadd_ps(aA[4],aA[5]); _A = _mm_hadd_ps(aA[6],aA[7]); _aA = _mm_hadd_ps(_a,_A);
632 
633  *_si = _mm_mul_ps(_aA,_2); // rotation 2*sin*cos*norm
634  *_co = _mm_sub_ps(_aa,_AA); // rotation (cos^2-sin^2)*norm
635  _x = _mm_add_ps(_mm_add_ps(_aa,_AA),_o); // total energy
636  _cc = _mm_mul_ps(*_co,*_co);
637  _ss = _mm_mul_ps(*_si,*_si);
638  _nn = _mm_sqrt_ps(_mm_add_ps(_cc,_ss)); // co/si norm
639  *_am = _mm_sqrt_ps(_mm_div_ps(_mm_add_ps(_x,_nn),_2)); // first component amplitude
640  *_AM = _mm_div_ps(_mm_sub_ps(_x,_nn),_2); // second component energy
641  *_AM = _mm_sqrt_ps(_mm_andnot_ps(sm,*_AM)); // make sure |AM|>0
642  _cc = _mm_div_ps(*_co,_mm_add_ps(_nn,_o)); // cos(2p)
643  _nn = _mm_and_ps(_mm_cmpgt_ps(*_si,_0),_1); // 1 if sin(2p)>0. or 0 if sin(2p)<0.
644  _ss = _mm_sub_ps(_mm_mul_ps(_2,_nn),_1); // 1 if sin(2p)>0. or-1 if sin(2p)<0.
645  *_si = _mm_sqrt_ps(_mm_div_ps(_mm_sub_ps(_1,_cc),_2)); // |sin(p)|
646  *_co = _mm_sqrt_ps(_mm_div_ps(_mm_add_ps(_1,_cc),_2)); // |cos(p)|
647  *_co = _mm_mul_ps(*_co,_ss); // cos(p)
648 
649  //cout<<"packet1: "<<a[0]<<" "<<A[0]<<" "<<a[1]<<" "<<A[1]<<" "<<a[2]<<" "<<A[2]<<" ";
650  //cout<<((a[0]+A[0])*(a[0]+A[0])/2+(a[1]+A[1])*(a[1]+A[1])/2+(a[2]+A[2])*(a[2]+A[2])/2)<<"\n";
651 
652  NETX(q[0][II]=a[0]; q[0][II+1]=A[0]; q[0][II+2]=s[0]; q[0][II+3]=c[0]; q[0][II+5]=1;,
653  q[1][II]=a[1]; q[1][II+1]=A[1]; q[1][II+2]=s[1]; q[1][II+3]=c[1]; q[1][II+5]=1;,
654  q[2][II]=a[2]; q[2][II+1]=A[2]; q[2][II+2]=s[2]; q[2][II+3]=c[2]; q[2][II+5]=1;,
655  q[3][II]=a[3]; q[3][II+1]=A[3]; q[3][II+2]=s[3]; q[3][II+3]=c[3]; q[3][II+5]=1;,
656  q[4][II]=a[4]; q[4][II+1]=A[4]; q[4][II+2]=s[4]; q[4][II+3]=c[4]; q[4][II+5]=1;,
657  q[5][II]=a[5]; q[5][II+1]=A[5]; q[5][II+2]=s[5]; q[5][II+3]=c[5]; q[5][II+5]=1;,
658  q[6][II]=a[6]; q[6][II+1]=A[6]; q[6][II+2]=s[6]; q[6][II+3]=c[6]; q[6][II+5]=1;,
659  q[7][II]=a[7]; q[7][II+1]=A[7]; q[7][II+2]=s[7]; q[7][II+3]=c[7]; q[7][II+5]=1;)
660 
661  float Ep = 0;
662  NETX(q[0][II+4]=(a[0]+A[0]); Ep+=q[0][II+4]*q[0][II+4]/2; _p0 = (__m128*)p[0]; _q0 = (__m128*)q[0];,
663  q[1][II+4]=(a[1]+A[1]); Ep+=q[1][II+4]*q[1][II+4]/2; _p1 = (__m128*)p[1]; _q1 = (__m128*)q[1];,
664  q[2][II+4]=(a[2]+A[2]); Ep+=q[2][II+4]*q[2][II+4]/2; _p2 = (__m128*)p[2]; _q2 = (__m128*)q[2];,
665  q[3][II+4]=(a[3]+A[3]); Ep+=q[3][II+4]*q[3][II+4]/2; _p3 = (__m128*)p[3]; _q3 = (__m128*)q[3];,
666  q[4][II+4]=(a[4]+A[4]); Ep+=q[4][II+4]*q[4][II+4]/2; _p4 = (__m128*)p[4]; _q4 = (__m128*)q[4];,
667  q[5][II+4]=(a[5]+A[5]); Ep+=q[5][II+4]*q[5][II+4]/2; _p5 = (__m128*)p[5]; _q5 = (__m128*)q[5];,
668  q[6][II+4]=(a[6]+A[6]); Ep+=q[6][II+4]*q[6][II+4]/2; _p6 = (__m128*)p[6]; _q6 = (__m128*)q[6];,
669  q[7][II+4]=(a[7]+A[7]); Ep+=q[7][II+4]*q[7][II+4]/2; _p7 = (__m128*)p[7]; _q7 = (__m128*)q[7];)
670 
671  *_am = _mm_div_ps(_1,_mm_add_ps(*_am,_o)); _am--;
672  *_am = _mm_div_ps(_1,_mm_add_ps(*_am,_o));
673  *_AM = _mm_div_ps(_1,_mm_add_ps(*_AM,_o)); _AM--;
674  *_AM = _mm_div_ps(_1,_mm_add_ps(*_AM,_o));
675 
676  //cout<<"packet2: "<<a[0]<<" "<<A[0]<<" "<<a[1]<<" "<<A[1]<<" "<<a[2]<<" "<<A[2]<<" ";
677 
678 
679  NETX(aa[0]=_mm_set1_ps(a[0]); AA[0]=_mm_set1_ps(A[0]); si[0]=_mm_set1_ps(s[0]); co[0]=_mm_set1_ps(c[0]); ,
680  aa[1]=_mm_set1_ps(a[1]); AA[1]=_mm_set1_ps(A[1]); si[1]=_mm_set1_ps(s[1]); co[1]=_mm_set1_ps(c[1]); ,
681  aa[2]=_mm_set1_ps(a[2]); AA[2]=_mm_set1_ps(A[2]); si[2]=_mm_set1_ps(s[2]); co[2]=_mm_set1_ps(c[2]); ,
682  aa[3]=_mm_set1_ps(a[3]); AA[3]=_mm_set1_ps(A[3]); si[3]=_mm_set1_ps(s[3]); co[3]=_mm_set1_ps(c[3]); ,
683  aa[4]=_mm_set1_ps(a[4]); AA[4]=_mm_set1_ps(A[4]); si[4]=_mm_set1_ps(s[4]); co[4]=_mm_set1_ps(c[4]); ,
684  aa[5]=_mm_set1_ps(a[5]); AA[5]=_mm_set1_ps(A[5]); si[5]=_mm_set1_ps(s[5]); co[5]=_mm_set1_ps(c[5]); ,
685  aa[6]=_mm_set1_ps(a[6]); AA[6]=_mm_set1_ps(A[6]); si[6]=_mm_set1_ps(s[6]); co[6]=_mm_set1_ps(c[6]); ,
686  aa[7]=_mm_set1_ps(a[7]); AA[7]=_mm_set1_ps(A[7]); si[7]=_mm_set1_ps(s[7]); co[7]=_mm_set1_ps(c[7]); )
687 
688  _MK = (__m128*)(pAVX[1]);
689  for(int i=0; i<I; i+=4) { // calculate and store unit vector components
690  _mk = _mm_and_ps(_mm_cmpgt_ps(*_MK++,_0),_1); // event mask
691  NETX(_a=_mm_add_ps(_mm_mul_ps(*_p0,co[0]),_mm_mul_ps(*_q0,si[0]));
692  _A=_mm_sub_ps(_mm_mul_ps(*_q0,co[0]),_mm_mul_ps(*_p0,si[0]));
693  *_p0++ = _mm_mul_ps(_mk,_mm_mul_ps(_a,aa[0]));
694  *_q0++ = _mm_mul_ps(_mk,_mm_mul_ps(_A,AA[0])); ,
695 
696  _a=_mm_add_ps(_mm_mul_ps(*_p1,co[1]),_mm_mul_ps(*_q1,si[1]));
697  _A=_mm_sub_ps(_mm_mul_ps(*_q1,co[1]),_mm_mul_ps(*_p1,si[1]));
698  *_p1++ = _mm_mul_ps(_mk,_mm_mul_ps(_a,aa[1]));
699  *_q1++ = _mm_mul_ps(_mk,_mm_mul_ps(_A,AA[1])); ,
700 
701  _a=_mm_add_ps(_mm_mul_ps(*_p2,co[2]),_mm_mul_ps(*_q2,si[2]));
702  _A=_mm_sub_ps(_mm_mul_ps(*_q2,co[2]),_mm_mul_ps(*_p2,si[2]));
703  *_p2++ = _mm_mul_ps(_mk,_mm_mul_ps(_a,aa[2]));
704  *_q2++ = _mm_mul_ps(_mk,_mm_mul_ps(_A,AA[2])); ,
705 
706  _a=_mm_add_ps(_mm_mul_ps(*_p3,co[3]),_mm_mul_ps(*_q3,si[3]));
707  _A=_mm_sub_ps(_mm_mul_ps(*_q3,co[3]),_mm_mul_ps(*_p3,si[3]));
708  *_p3++ = _mm_mul_ps(_mk,_mm_mul_ps(_a,aa[3]));
709  *_q3++ = _mm_mul_ps(_mk,_mm_mul_ps(_A,AA[3])); ,
710 
711  _a=_mm_add_ps(_mm_mul_ps(*_p4,co[4]),_mm_mul_ps(*_q4,si[4]));
712  _A=_mm_sub_ps(_mm_mul_ps(*_q4,co[4]),_mm_mul_ps(*_p4,si[4]));
713  *_p4++ = _mm_mul_ps(_mk,_mm_mul_ps(_a,aa[4]));
714  *_q4++ = _mm_mul_ps(_mk,_mm_mul_ps(_A,AA[4])); ,
715 
716  _a=_mm_add_ps(_mm_mul_ps(*_p5,co[5]),_mm_mul_ps(*_q5,si[5]));
717  _A=_mm_sub_ps(_mm_mul_ps(*_q5,co[5]),_mm_mul_ps(*_p5,si[5]));
718  *_p5++ = _mm_mul_ps(_mk,_mm_mul_ps(_a,aa[5]));
719  *_q5++ = _mm_mul_ps(_mk,_mm_mul_ps(_A,AA[5])); ,
720 
721  _a=_mm_add_ps(_mm_mul_ps(*_p6,co[6]),_mm_mul_ps(*_q6,si[6]));
722  _A=_mm_sub_ps(_mm_mul_ps(*_q6,co[6]),_mm_mul_ps(*_p6,si[6]));
723  *_p6++ = _mm_mul_ps(_mk,_mm_mul_ps(_a,aa[6]));
724  *_q6++ = _mm_mul_ps(_mk,_mm_mul_ps(_A,AA[6])); ,
725 
726  _a=_mm_add_ps(_mm_mul_ps(*_p7,co[7]),_mm_mul_ps(*_q7,si[7]));
727  _A=_mm_sub_ps(_mm_mul_ps(*_q7,co[7]),_mm_mul_ps(*_p7,si[7]));
728  *_p7++ = _mm_mul_ps(_mk,_mm_mul_ps(_a,aa[7]));
729  *_q7++ = _mm_mul_ps(_mk,_mm_mul_ps(_A,AA[7])); )
730  }
  return Ep/2;                                   // return packet energy per quadrature
732 }
733 
734 static inline float _avx_setAMP_ps(float** p, float** q,
735  std::vector<float*> &pAVX, int I) {
736 // set packet amplitudes for waveform reconstruction
737 // returns number of degrees of freedom - effective # of pixels per detector
738  int II = I*2;
739  int I2 = II+2;
740  int I3 = II+3;
741  int I4 = II+4;
742  int I5 = II+5;
743  float k = pAVX[1][I]; // number of detectors
744 
745  __m128 _a, _A, _n, _mk, _nn;
746  static const __m128 _o = _mm_set1_ps(1.e-9);
747  static const __m128 _0 = _mm_set1_ps(0);
748  static const __m128 _1 = _mm_set1_ps(1);
749  static const __m128 _4 = _mm_set1_ps(4);
750  static const __m128 o5 = _mm_set1_ps(0.5);
751 
752  NETX(__m128* _p0 = (__m128*)p[0]; __m128* _q0 = (__m128*)q[0]; __m128* _n0 = (__m128*)(q[0]+I); ,
753  __m128* _p1 = (__m128*)p[1]; __m128* _q1 = (__m128*)q[1]; __m128* _n1 = (__m128*)(q[1]+I); ,
754  __m128* _p2 = (__m128*)p[2]; __m128* _q2 = (__m128*)q[2]; __m128* _n2 = (__m128*)(q[2]+I); ,
755  __m128* _p3 = (__m128*)p[3]; __m128* _q3 = (__m128*)q[3]; __m128* _n3 = (__m128*)(q[3]+I); ,
756  __m128* _p4 = (__m128*)p[4]; __m128* _q4 = (__m128*)q[4]; __m128* _n4 = (__m128*)(q[4]+I); ,
757  __m128* _p5 = (__m128*)p[5]; __m128* _q5 = (__m128*)q[5]; __m128* _n5 = (__m128*)(q[5]+I); ,
758  __m128* _p6 = (__m128*)p[6]; __m128* _q6 = (__m128*)q[6]; __m128* _n6 = (__m128*)(q[6]+I); ,
759  __m128* _p7 = (__m128*)p[7]; __m128* _q7 = (__m128*)q[7]; __m128* _n7 = (__m128*)(q[7]+I); )
760 
761  NETX(__m128 a0=_mm_set1_ps(q[0][I4]); __m128 s0=_mm_set1_ps(q[0][I2]); __m128 c0=_mm_set1_ps(q[0][I3]); ,
762  __m128 a1=_mm_set1_ps(q[1][I4]); __m128 s1=_mm_set1_ps(q[1][I2]); __m128 c1=_mm_set1_ps(q[1][I3]); ,
763  __m128 a2=_mm_set1_ps(q[2][I4]); __m128 s2=_mm_set1_ps(q[2][I2]); __m128 c2=_mm_set1_ps(q[2][I3]); ,
764  __m128 a3=_mm_set1_ps(q[3][I4]); __m128 s3=_mm_set1_ps(q[3][I2]); __m128 c3=_mm_set1_ps(q[3][I3]); ,
765  __m128 a4=_mm_set1_ps(q[4][I4]); __m128 s4=_mm_set1_ps(q[4][I2]); __m128 c4=_mm_set1_ps(q[4][I3]); ,
766  __m128 a5=_mm_set1_ps(q[5][I4]); __m128 s5=_mm_set1_ps(q[5][I2]); __m128 c5=_mm_set1_ps(q[5][I3]); ,
767  __m128 a6=_mm_set1_ps(q[6][I4]); __m128 s6=_mm_set1_ps(q[6][I2]); __m128 c6=_mm_set1_ps(q[6][I3]); ,
768  __m128 a7=_mm_set1_ps(q[7][I4]); __m128 s7=_mm_set1_ps(q[7][I2]); __m128 c7=_mm_set1_ps(q[7][I3]); )
769 
770  __m128* _MK = (__m128*)pAVX[1];
771  __m128* _fp = (__m128*)pAVX[2];
772  __m128* _fx = (__m128*)pAVX[3];
773  __m128* _ee = (__m128*)pAVX[15];
774  __m128* _EE = (__m128*)pAVX[16];
775  __m128* _gn = (__m128*)pAVX[20];
776 
777  __m128 _Np = _mm_setzero_ps(); // number of effective pixels per detector
778 
779  for(int i=0; i<I; i+=4) { // packet amplitudes
780  _mk = _mm_mul_ps(o5,_mm_and_ps(_mm_cmpgt_ps(*_MK++,_0),_1)); // event mask
781  NETX(
782  _n = _mm_mul_ps(_mm_mul_ps(a0,_mk),*_n0); _a=*_p0; _A=*_q0; _nn=*_n0++;
783  *_p0++ = _mm_mul_ps(_n,_mm_sub_ps(_mm_mul_ps(_a,c0),_mm_mul_ps(_A,s0)));
784  *_q0++ = _mm_mul_ps(_n,_mm_add_ps(_mm_mul_ps(_A,c0),_mm_mul_ps(_a,s0))); ,
785  _n = _mm_mul_ps(_mm_mul_ps(a1,_mk),*_n1); _a=*_p1; _A=*_q1; _nn=_mm_add_ps(_nn,*_n1++);
786  *_p1++ = _mm_mul_ps(_n,_mm_sub_ps(_mm_mul_ps(_a,c1),_mm_mul_ps(_A,s1)));
787  *_q1++ = _mm_mul_ps(_n,_mm_add_ps(_mm_mul_ps(_A,c1),_mm_mul_ps(_a,s1))); ,
788  _n = _mm_mul_ps(_mm_mul_ps(a2,_mk),*_n2); _a=*_p2; _A=*_q2; _nn=_mm_add_ps(_nn,*_n2++);
789  *_p2++ = _mm_mul_ps(_n,_mm_sub_ps(_mm_mul_ps(_a,c2),_mm_mul_ps(_A,s2)));
790  *_q2++ = _mm_mul_ps(_n,_mm_add_ps(_mm_mul_ps(_A,c2),_mm_mul_ps(_a,s2))); ,
791  _n = _mm_mul_ps(_mm_mul_ps(a3,_mk),*_n3); _a=*_p3; _A=*_q3; _nn=_mm_add_ps(_nn,*_n3++);
792  *_p3++ = _mm_mul_ps(_n,_mm_sub_ps(_mm_mul_ps(_a,c3),_mm_mul_ps(_A,s3)));
793  *_q3++ = _mm_mul_ps(_n,_mm_add_ps(_mm_mul_ps(_A,c3),_mm_mul_ps(_a,s3))); ,
794  _n = _mm_mul_ps(_mm_mul_ps(a4,_mk),*_n4); _a=*_p4; _A=*_q4; _nn=_mm_add_ps(_nn,*_n4++);
795  *_p4++ = _mm_mul_ps(_n,_mm_sub_ps(_mm_mul_ps(_a,c4),_mm_mul_ps(_A,s4)));
796  *_q4++ = _mm_mul_ps(_n,_mm_add_ps(_mm_mul_ps(_A,c4),_mm_mul_ps(_a,s4))); ,
797  _n = _mm_mul_ps(_mm_mul_ps(a5,_mk),*_n5); _a=*_p5; _A=*_q5; _nn=_mm_add_ps(_nn,*_n5++);
798  *_p5++ = _mm_mul_ps(_n,_mm_sub_ps(_mm_mul_ps(_a,c5),_mm_mul_ps(_A,s5)));
799  *_q5++ = _mm_mul_ps(_n,_mm_add_ps(_mm_mul_ps(_A,c5),_mm_mul_ps(_a,s5))); ,
800  _n = _mm_mul_ps(_mm_mul_ps(a6,_mk),*_n6); _a=*_p6; _A=*_q6; _nn=_mm_add_ps(_nn,*_n6++);
801  *_p6++ = _mm_mul_ps(_n,_mm_sub_ps(_mm_mul_ps(_a,c6),_mm_mul_ps(_A,s6)));
802  *_q6++ = _mm_mul_ps(_n,_mm_add_ps(_mm_mul_ps(_A,c6),_mm_mul_ps(_a,s6))); ,
803  _n = _mm_mul_ps(_mm_mul_ps(a7,_mk),*_n7); _a=*_p7; _A=*_q7; _nn=_mm_add_ps(_nn,*_n7++);
804  *_p7++ = _mm_mul_ps(_n,_mm_sub_ps(_mm_mul_ps(_a,c7),_mm_mul_ps(_A,s7)));
805  *_q7++ = _mm_mul_ps(_n,_mm_add_ps(_mm_mul_ps(_A,c7),_mm_mul_ps(_a,s7))); )
806 
807  _nn = _mm_mul_ps(_nn,_mk);
808  _Np = _mm_add_ps(_Np,_nn); // Dof * k/4
809  }
810  return _wat_hsum(_Np)*4/k;
811 }
812 
813 static inline float _avx_ort_ps(float** p, float** q,
814  std::vector<float*> &pAVX, int I) {
815 // orthogonalize data vectors p and q
816 // calculate norms of orthogonal vectors and rotation sin & cos
817 // p/q - array of pointers to 00/90 phase detector data
818 // returns signal energy
819 // in likelihoodWP these arrays should be stored exactly in the same order.
820 // this function increments pointers stored in tne input pointer arrays.
821  __m128 _a,_A,_aa,_AA,_aA,_et,_cc,_ss,_nn,_cn,_sn,_mk;
822 
823  static const __m128 _0 = _mm_set1_ps(0);
824  static const __m128 _1 = _mm_set1_ps(1);
825  static const __m128 _2 = _mm_set1_ps(2);
826  static const __m128 _o = _mm_set1_ps(1.e-21);
827  static const __m128 sm = _mm_set1_ps(-0.f); // -0.f = 1 << 31
828 
829  __m128* _MK = (__m128*)pAVX[1];
830  __m128* _si = (__m128*)pAVX[4];
831  __m128* _co = (__m128*)pAVX[5];
832  __m128* _ee = (__m128*)pAVX[15];
833  __m128* _EE = (__m128*)pAVX[16];
834  __m128 _e = _mm_setzero_ps();
835  __m128 _E = _mm_setzero_ps();
836 
837  NETX(__m128* _p0 = (__m128*)p[0]; __m128* _q0 = (__m128*)q[0]; ,
838  __m128* _p1 = (__m128*)p[1]; __m128* _q1 = (__m128*)q[1]; ,
839  __m128* _p2 = (__m128*)p[2]; __m128* _q2 = (__m128*)q[2]; ,
840  __m128* _p3 = (__m128*)p[3]; __m128* _q3 = (__m128*)q[3]; ,
841  __m128* _p4 = (__m128*)p[4]; __m128* _q4 = (__m128*)q[4]; ,
842  __m128* _p5 = (__m128*)p[5]; __m128* _q5 = (__m128*)q[5]; ,
843  __m128* _p6 = (__m128*)p[6]; __m128* _q6 = (__m128*)q[6]; ,
844  __m128* _p7 = (__m128*)p[7]; __m128* _q7 = (__m128*)q[7]; )
845 
846  for(int i=0; i<I; i+=4) {
847  NETX(
848  _aa=_mm_mul_ps(*_p0,*_p0);
849  _AA=_mm_mul_ps(*_q0,*_q0);
850  _aA=_mm_mul_ps(*_p0++,*_q0++); ,
851 
852  _aa = _mm_add_ps(_aa,_mm_mul_ps(*_p1,*_p1));
853  _AA = _mm_add_ps(_AA,_mm_mul_ps(*_q1,*_q1));
854  _aA = _mm_add_ps(_aA,_mm_mul_ps(*_p1++,*_q1++)); ,
855 
856  _aa = _mm_add_ps(_aa,_mm_mul_ps(*_p2,*_p2));
857  _AA = _mm_add_ps(_AA,_mm_mul_ps(*_q2,*_q2));
858  _aA = _mm_add_ps(_aA,_mm_mul_ps(*_p2++,*_q2++)); ,
859 
860  _aa = _mm_add_ps(_aa,_mm_mul_ps(*_p3,*_p3));
861  _AA = _mm_add_ps(_AA,_mm_mul_ps(*_q3,*_q3));
862  _aA = _mm_add_ps(_aA,_mm_mul_ps(*_p3++,*_q3++)); ,
863 
864  _aa = _mm_add_ps(_aa,_mm_mul_ps(*_p4,*_p4));
865  _AA = _mm_add_ps(_AA,_mm_mul_ps(*_q4,*_q4));
866  _aA = _mm_add_ps(_aA,_mm_mul_ps(*_p4++,*_q4++)); ,
867 
868  _aa = _mm_add_ps(_aa,_mm_mul_ps(*_p5,*_p5));
869  _AA = _mm_add_ps(_AA,_mm_mul_ps(*_q5,*_q5));
870  _aA = _mm_add_ps(_aA,_mm_mul_ps(*_p5++,*_q5++)); ,
871 
872  _aa = _mm_add_ps(_aa,_mm_mul_ps(*_p6,*_p6));
873  _AA = _mm_add_ps(_AA,_mm_mul_ps(*_q6,*_q6));
874  _aA = _mm_add_ps(_aA,_mm_mul_ps(*_p6++,*_q6++)); ,
875 
876  _aa = _mm_add_ps(_aa,_mm_mul_ps(*_p7,*_p7));
877  _AA = _mm_add_ps(_AA,_mm_mul_ps(*_q7,*_q7));
878  _aA = _mm_add_ps(_aA,_mm_mul_ps(*_p7++,*_q7++)); )
879 
880 // calculate data network orthogonalization sin and cos
881 
882  _mk = _mm_and_ps(_mm_cmpgt_ps(*_MK,_0),_1); // event mask
883  *_si = _mm_mul_ps(_aA,_2); // rotation 2*sin*cos*norm
884  *_co = _mm_sub_ps(_aa,_AA); // rotation (cos^2-sin^2)*norm
885  _et = _mm_add_ps(_mm_add_ps(_aa,_AA),_o); // total energy
886  _cc = _mm_mul_ps(*_co,*_co);
887  _ss = _mm_mul_ps(*_si,*_si);
888  _nn = _mm_sqrt_ps(_mm_add_ps(_cc,_ss)); // co/si norm
889  *_ee = _mm_div_ps(_mm_add_ps(_et,_nn),_2); // first component energy
890  *_EE = _mm_div_ps(_mm_sub_ps(_et,_nn),_2); // second component energy
891  _cc = _mm_div_ps(*_co,_mm_add_ps(_nn,_o)); // cos(2p)
892  _nn = _mm_and_ps(_mm_cmpgt_ps(*_si,_0),_1); // 1 if sin(2p)>0. or 0 if sin(2p)<0.
893  _ss = _mm_sub_ps(_mm_mul_ps(_2,_nn),_1); // 1 if sin(2p)>0. or-1 if sin(2p)<0.
894  *_si = _mm_sqrt_ps(_mm_div_ps(_mm_sub_ps(_1,_cc),_2)); // |sin(p)|
895  *_co = _mm_sqrt_ps(_mm_div_ps(_mm_add_ps(_1,_cc),_2)); // |cos(p)|
896  *_co = _mm_mul_ps(*_co,_ss); // cos(p)
897 
898  _e = _mm_add_ps(_e,_mm_mul_ps(_mk,*_ee));
899  _E = _mm_add_ps(_E,_mm_mul_ps(_mk,*_EE));
900  _si++; _co++; _MK++; _ee++; _EE++;
901  }
902  return _wat_hsum(_e)+_wat_hsum(_E);
903 }
904 
905 
906 static inline __m256 _avx_stat_ps(float** x, float** X,
907  float** s, float** S,
908  std::vector<float*> &pAVX, int I) {
909 // returns coherent statistics in the format {cc,ec,ed,gn}
910 // ei - incoherent energy: sum{s[i]^4}/|s|^2 + sum{S[i]^4}/|S|^2
911 // ec - coherent energy: |s|^2+|S|^2 - ei
912 // ed - energy disbalance: 0.5*sum_k{(x[k]*s[k]-s[k]*s[k])^2} * (s,s)/(x,s)^2
913 // + its 90-degrees phase value
914 // cc - reduced network correlation coefficient
915 // p/q - pointers to data/signal amplitudes
916 // nIFO - number of detectors.
917 // I - number of pixels
918  float k = pAVX[1][I]; // number of detectors
919  __m128 _a,_A,_x,_X,_c,_C,_s,_S,_r,_R,_rr,_RR,_xs,_XS;
920  __m128 _ll,_ei,_cc,_mk,_mm,_ss,_SS;
921  static const __m128 _o = _mm_set1_ps(0.001);
922  static const __m128 _0 = _mm_set1_ps(0);
923  static const __m128 _1 = _mm_set1_ps(1);
924  static const __m128 _2 = _mm_set1_ps(2);
925  static const __m128 _k = _mm_set1_ps(2*(1-k));
926  static const __m128 sm = _mm_set1_ps(-0.f); // -0.f = 1 << 31
927 
928  __m128* _et = (__m128*)pAVX[0];
929  __m128* _MK = (__m128*)pAVX[1];
930  __m128* _si = (__m128*)pAVX[4];
931  __m128* _co = (__m128*)pAVX[5];
932  __m128* _ec = (__m128*)pAVX[19];
933  __m128* _gn = (__m128*)pAVX[20];
934  __m128* _ed = (__m128*)pAVX[21];
935  __m128* _rn = (__m128*)pAVX[22];
936 
937  __m128 _LL = _mm_setzero_ps();
938  __m128 _Lr = _mm_setzero_ps();
939  __m128 _EC = _mm_setzero_ps();
940  __m128 _GN = _mm_setzero_ps();
941  __m128 _RN = _mm_setzero_ps();
942  __m128 _NN = _mm_setzero_ps();
943 
944  NETX(__m128* _x0 = (__m128*)x[0]; __m128* _X0 = (__m128*)X[0]; ,
945  __m128* _x1 = (__m128*)x[1]; __m128* _X1 = (__m128*)X[1]; ,
946  __m128* _x2 = (__m128*)x[2]; __m128* _X2 = (__m128*)X[2]; ,
947  __m128* _x3 = (__m128*)x[3]; __m128* _X3 = (__m128*)X[3]; ,
948  __m128* _x4 = (__m128*)x[4]; __m128* _X4 = (__m128*)X[4]; ,
949  __m128* _x5 = (__m128*)x[5]; __m128* _X5 = (__m128*)X[5]; ,
950  __m128* _x6 = (__m128*)x[6]; __m128* _X6 = (__m128*)X[6]; ,
951  __m128* _x7 = (__m128*)x[7]; __m128* _X7 = (__m128*)X[7]; )
952 
953  NETX(__m128* _s0 = (__m128*)s[0]; __m128* _S0 = (__m128*)S[0]; ,
954  __m128* _s1 = (__m128*)s[1]; __m128* _S1 = (__m128*)S[1]; ,
955  __m128* _s2 = (__m128*)s[2]; __m128* _S2 = (__m128*)S[2]; ,
956  __m128* _s3 = (__m128*)s[3]; __m128* _S3 = (__m128*)S[3]; ,
957  __m128* _s4 = (__m128*)s[4]; __m128* _S4 = (__m128*)S[4]; ,
958  __m128* _s5 = (__m128*)s[5]; __m128* _S5 = (__m128*)S[5]; ,
959  __m128* _s6 = (__m128*)s[6]; __m128* _S6 = (__m128*)S[6]; ,
960  __m128* _s7 = (__m128*)s[7]; __m128* _S7 = (__m128*)S[7]; )
961 
962  for(int i=0; i<I; i+=4) {
963  NETX(
964  _s=_mm_add_ps(_mm_mul_ps(*_s0,*_co),_mm_mul_ps(*_S0,*_si)); _r=_mm_sub_ps(*_s0,*_x0);
965  _x=_mm_add_ps(_mm_mul_ps(*_x0,*_co),_mm_mul_ps(*_X0,*_si)); _R=_mm_sub_ps(*_S0,*_X0);
966  _S=_mm_sub_ps(_mm_mul_ps(*_S0++,*_co),_mm_mul_ps(*_s0++,*_si)); _a=_mm_mul_ps(_s,_x); _xs=_a;
967  _X=_mm_sub_ps(_mm_mul_ps(*_X0++,*_co),_mm_mul_ps(*_x0++,*_si)); _A=_mm_mul_ps(_S,_X); _XS=_A;
968  _c=_mm_mul_ps(_a,_a); _ss=_mm_mul_ps(_s,_s); _rr=_mm_mul_ps(_r,_r);
969  _C=_mm_mul_ps(_A,_A); _SS=_mm_mul_ps(_S,_S); _RR=_mm_mul_ps(_R,_R); ,
970 
971  _s=_mm_add_ps(_mm_mul_ps(*_s1,*_co), _mm_mul_ps(*_S1,*_si)); _r=_mm_sub_ps(*_s1,*_x1);
972  _x=_mm_add_ps(_mm_mul_ps(*_x1,*_co), _mm_mul_ps(*_X1,*_si)); _R=_mm_sub_ps(*_S1,*_X1);
973  _S=_mm_sub_ps(_mm_mul_ps(*_S1++,*_co),_mm_mul_ps(*_s1++,*_si)); _a=_mm_mul_ps(_s,_x); _xs=_mm_add_ps(_xs,_a);
974  _X=_mm_sub_ps(_mm_mul_ps(*_X1++,*_co),_mm_mul_ps(*_x1++,*_si)); _A=_mm_mul_ps(_S,_X); _XS=_mm_add_ps(_XS,_A);
975  _c=_mm_add_ps(_c,_mm_mul_ps(_a,_a)); _ss=_mm_add_ps(_ss,_mm_mul_ps(_s,_s)); _rr=_mm_add_ps(_rr,_mm_mul_ps(_r,_r));
976  _C=_mm_add_ps(_C,_mm_mul_ps(_A,_A)); _SS=_mm_add_ps(_SS,_mm_mul_ps(_S,_S)); _RR=_mm_add_ps(_RR,_mm_mul_ps(_R,_R)); ,
977 
978  _s=_mm_add_ps(_mm_mul_ps(*_s2,*_co), _mm_mul_ps(*_S2,*_si)); _r=_mm_sub_ps(*_s2,*_x2);
979  _x=_mm_add_ps(_mm_mul_ps(*_x2,*_co), _mm_mul_ps(*_X2,*_si)); _R=_mm_sub_ps(*_S2,*_X2);
980  _S=_mm_sub_ps(_mm_mul_ps(*_S2++,*_co),_mm_mul_ps(*_s2++,*_si)); _a=_mm_mul_ps(_s,_x); _xs=_mm_add_ps(_xs,_a);
981  _X=_mm_sub_ps(_mm_mul_ps(*_X2++,*_co),_mm_mul_ps(*_x2++,*_si)); _A=_mm_mul_ps(_S,_X); _XS=_mm_add_ps(_XS,_A);
982  _c=_mm_add_ps(_c,_mm_mul_ps(_a,_a)); _ss=_mm_add_ps(_ss,_mm_mul_ps(_s,_s)); _rr=_mm_add_ps(_rr,_mm_mul_ps(_r,_r));
983  _C=_mm_add_ps(_C,_mm_mul_ps(_A,_A)); _SS=_mm_add_ps(_SS,_mm_mul_ps(_S,_S)); _RR=_mm_add_ps(_RR,_mm_mul_ps(_R,_R)); ,
984 
985  _s=_mm_add_ps(_mm_mul_ps(*_s3,*_co), _mm_mul_ps(*_S3,*_si)); _r=_mm_sub_ps(*_s3,*_x3);
986  _x=_mm_add_ps(_mm_mul_ps(*_x3,*_co), _mm_mul_ps(*_X3,*_si)); _R=_mm_sub_ps(*_S3,*_X3);
987  _S=_mm_sub_ps(_mm_mul_ps(*_S3++,*_co),_mm_mul_ps(*_s3++,*_si)); _a=_mm_mul_ps(_s,_x); _xs=_mm_add_ps(_xs,_a);
988  _X=_mm_sub_ps(_mm_mul_ps(*_X3++,*_co),_mm_mul_ps(*_x3++,*_si)); _A=_mm_mul_ps(_S,_X); _XS=_mm_add_ps(_XS,_A);
989  _c=_mm_add_ps(_c,_mm_mul_ps(_a,_a)); _ss=_mm_add_ps(_ss,_mm_mul_ps(_s,_s)); _rr=_mm_add_ps(_rr,_mm_mul_ps(_r,_r));
990  _C=_mm_add_ps(_C,_mm_mul_ps(_A,_A)); _SS=_mm_add_ps(_SS,_mm_mul_ps(_S,_S)); _RR=_mm_add_ps(_RR,_mm_mul_ps(_R,_R)); ,
991 
992  _s=_mm_add_ps(_mm_mul_ps(*_s4,*_co), _mm_mul_ps(*_S4,*_si)); _r=_mm_sub_ps(*_s4,*_x4);
993  _x=_mm_add_ps(_mm_mul_ps(*_x4,*_co), _mm_mul_ps(*_X4,*_si)); _R=_mm_sub_ps(*_S4,*_X4);
994  _S=_mm_sub_ps(_mm_mul_ps(*_S4++,*_co),_mm_mul_ps(*_s4++,*_si)); _a=_mm_mul_ps(_s,_x); _xs=_mm_add_ps(_xs,_a);
995  _X=_mm_sub_ps(_mm_mul_ps(*_X4++,*_co),_mm_mul_ps(*_x4++,*_si)); _A=_mm_mul_ps(_S,_X); _XS=_mm_add_ps(_XS,_A);
996  _c=_mm_add_ps(_c,_mm_mul_ps(_a,_a)); _ss=_mm_add_ps(_ss,_mm_mul_ps(_s,_s)); _rr=_mm_add_ps(_rr,_mm_mul_ps(_r,_r));
997  _C=_mm_add_ps(_C,_mm_mul_ps(_A,_A)); _SS=_mm_add_ps(_SS,_mm_mul_ps(_S,_S)); _RR=_mm_add_ps(_RR,_mm_mul_ps(_R,_R)); ,
998 
999  _s=_mm_add_ps(_mm_mul_ps(*_s5,*_co), _mm_mul_ps(*_S5,*_si)); _r=_mm_sub_ps(*_s5,*_x5);
1000  _x=_mm_add_ps(_mm_mul_ps(*_x5,*_co), _mm_mul_ps(*_X5,*_si)); _R=_mm_sub_ps(*_S5,*_X5);
1001  _S=_mm_sub_ps(_mm_mul_ps(*_S5++,*_co),_mm_mul_ps(*_s5++,*_si)); _a=_mm_mul_ps(_s,_x); _xs=_mm_add_ps(_xs,_a);
1002  _X=_mm_sub_ps(_mm_mul_ps(*_X5++,*_co),_mm_mul_ps(*_x5++,*_si)); _A=_mm_mul_ps(_S,_X); _XS=_mm_add_ps(_XS,_A);
1003  _c=_mm_add_ps(_c,_mm_mul_ps(_a,_a)); _ss=_mm_add_ps(_ss,_mm_mul_ps(_s,_s)); _rr=_mm_add_ps(_rr,_mm_mul_ps(_r,_r));
1004  _C=_mm_add_ps(_C,_mm_mul_ps(_A,_A)); _SS=_mm_add_ps(_SS,_mm_mul_ps(_S,_S)); _RR=_mm_add_ps(_RR,_mm_mul_ps(_R,_R)); ,
1005 
1006  _s=_mm_add_ps(_mm_mul_ps(*_s6,*_co), _mm_mul_ps(*_S6,*_si)); _r=_mm_sub_ps(*_s6,*_x6);
1007  _x=_mm_add_ps(_mm_mul_ps(*_x6,*_co), _mm_mul_ps(*_X6,*_si)); _R=_mm_sub_ps(*_S6,*_X6);
1008  _S=_mm_sub_ps(_mm_mul_ps(*_S6++,*_co),_mm_mul_ps(*_s6++,*_si)); _a=_mm_mul_ps(_s,_x); _xs=_mm_add_ps(_xs,_a);
1009  _X=_mm_sub_ps(_mm_mul_ps(*_X6++,*_co),_mm_mul_ps(*_x6++,*_si)); _A=_mm_mul_ps(_S,_X); _XS=_mm_add_ps(_XS,_A);
1010  _c=_mm_add_ps(_c,_mm_mul_ps(_a,_a)); _ss=_mm_add_ps(_ss,_mm_mul_ps(_s,_s)); _rr=_mm_add_ps(_rr,_mm_mul_ps(_r,_r));
1011  _C=_mm_add_ps(_C,_mm_mul_ps(_A,_A)); _SS=_mm_add_ps(_SS,_mm_mul_ps(_S,_S)); _RR=_mm_add_ps(_RR,_mm_mul_ps(_R,_R)); ,
1012 
1013  _s=_mm_add_ps(_mm_mul_ps(*_s7,*_co), _mm_mul_ps(*_S7,*_si)); _r=_mm_sub_ps(*_s7,*_x7);
1014  _x=_mm_add_ps(_mm_mul_ps(*_x7,*_co), _mm_mul_ps(*_X7,*_si)); _R=_mm_sub_ps(*_S7,*_X7);
1015  _S=_mm_sub_ps(_mm_mul_ps(*_S7++,*_co),_mm_mul_ps(*_s7++,*_si)); _a=_mm_mul_ps(_s,_x); _xs=_mm_add_ps(_xs,_a);
1016  _X=_mm_sub_ps(_mm_mul_ps(*_X7++,*_co),_mm_mul_ps(*_x7++,*_si)); _A=_mm_mul_ps(_S,_X); _XS=_mm_add_ps(_XS,_A);
1017  _c=_mm_add_ps(_c,_mm_mul_ps(_a,_a)); _ss=_mm_add_ps(_ss,_mm_mul_ps(_s,_s)); _rr=_mm_add_ps(_rr,_mm_mul_ps(_r,_r));
1018  _C=_mm_add_ps(_C,_mm_mul_ps(_A,_A)); _SS=_mm_add_ps(_SS,_mm_mul_ps(_S,_S)); _RR=_mm_add_ps(_RR,_mm_mul_ps(_R,_R)); )
1019 
1020 
1021  _mk = _mm_and_ps(_mm_cmpge_ps(*_MK,_0),_1); // event mask
1022  _c = _mm_div_ps(_c,_mm_add_ps(_mm_mul_ps(_xs,_xs),_o)); // first component incoherent energy
1023  _C = _mm_div_ps(_C,_mm_add_ps(_mm_mul_ps(_XS,_XS),_o)); // second component incoherent energy
1024  _ll = _mm_mul_ps(_mk,_mm_add_ps(_ss,_SS)); // signal energy
1025  _ss = _mm_mul_ps(_ss,_mm_sub_ps(_1,_c)); // 00 coherent energy
1026  _SS = _mm_mul_ps(_SS,_mm_sub_ps(_1,_C)); // 90 coherent energy
1027  *_ec = _mm_mul_ps(_mk,_mm_add_ps(_ss,_SS)); // coherent energy
1028  *_gn = _mm_mul_ps(_mk,_mm_mul_ps(_2,*_MK)); // G-noise correction
1029  *_rn = _mm_mul_ps(_mk,_mm_add_ps(_rr,_RR)); // residual noise in TF domain
1030 
1031  _a = _mm_mul_ps(_2,_mm_andnot_ps(sm,*_ec)); // 2*|ec|
1032  _A = _mm_add_ps(*_rn,_mm_add_ps(_o,*_gn)); // NULL
1033  _cc = _mm_div_ps(*_ec,_mm_add_ps(_a,_A)); // correlation coefficient
1034  _Lr = _mm_add_ps(_Lr,_mm_mul_ps(_ll,_cc)); // reduced likelihood
1035  _mm = _mm_and_ps(_mm_cmpgt_ps(*_ec,_o),_1); // coherent energy mask
1036 
1037  _LL = _mm_add_ps(_LL, _ll); // total signal energy
1038  _GN = _mm_add_ps(_GN,*_gn); // total G-noise correction
1039  _EC = _mm_add_ps(_EC,*_ec); // total coherent energy
1040  _RN = _mm_add_ps(_RN,*_rn); // residual noise in TF domain
1041  _NN = _mm_add_ps(_NN, _mm); // number of pixel in TF domain with Ec>0
1042 
1043  _si++; _co++; _MK++; _ec++; _gn++; _ed++; _rn++;
1044  }
1045  float cc = _wat_hsum(_Lr)/(_wat_hsum(_LL)+0.001); // network correlation coefficient
1046  float ec = _wat_hsum(_EC); // total coherent energy
1047  float rn = _wat_hsum(_GN)+_wat_hsum(_RN); // total noise x 2
1048  float nn = _wat_hsum(_NN); // number of pixels
1049  float ch = rn/(k*nn+sqrt(nn))/2; // chi2 in TF domain
1050  return _mm256_set_ps(0,0,0,0,rn/2,nn,ec,2*cc); // reversed order
1051 }
1052 
1053 
1054 static inline __m256 _avx_noise_ps(float** p, float** q, std::vector<float*> &pAVX, int I) {
1055 // q - pointer to pixel norms
1056 // returns noise correction
1057 // I - number of pixels
1058  float k = pAVX[1][I]; // number of detectors
1059  __m128 _nx,_ns,_nm,_mk,_gg,_rc,_nn;
1060  __m128* _et = (__m128*)pAVX[0];
1061  __m128* _MK = (__m128*)pAVX[1];
1062  __m128* _ec = (__m128*)pAVX[19];
1063  __m128* _gn = (__m128*)pAVX[20];
1064  __m128* _rn = (__m128*)pAVX[22];
1065  __m128 _GN = _mm_setzero_ps();
1066  __m128 _RC = _mm_setzero_ps();
1067  __m128 _ES = _mm_setzero_ps();
1068  __m128 _EH = _mm_setzero_ps();
1069  __m128 _EC = _mm_setzero_ps();
1070  __m128 _SC = _mm_setzero_ps();
1071  __m128 _NC = _mm_setzero_ps();
1072  __m128 _NS = _mm_setzero_ps();
1073 
1074  static const __m128 _0 = _mm_set1_ps(0);
1075  static const __m128 o5 = _mm_set1_ps(0.5);
1076  static const __m128 _o = _mm_set1_ps(1.e-9);
1077  static const __m128 _1 = _mm_set1_ps(1);
1078  static const __m128 _2 = _mm_set1_ps(2);
1079  static const __m128 _k = _mm_set1_ps(1./k);
1080 
1081  NETX(__m128* _s0 = (__m128*)(p[0]+I);, __m128* _s1 = (__m128*)(p[1]+I);,
1082  __m128* _s2 = (__m128*)(p[2]+I);, __m128* _s3 = (__m128*)(p[3]+I);,
1083  __m128* _s4 = (__m128*)(p[4]+I);, __m128* _s5 = (__m128*)(p[5]+I);,
1084  __m128* _s6 = (__m128*)(p[6]+I);, __m128* _s7 = (__m128*)(p[7]+I);)
1085 
1086  NETX(__m128* _x0 = (__m128*)(q[0]+I);, __m128* _x1 = (__m128*)(q[1]+I);,
1087  __m128* _x2 = (__m128*)(q[2]+I);, __m128* _x3 = (__m128*)(q[3]+I);,
1088  __m128* _x4 = (__m128*)(q[4]+I);, __m128* _x5 = (__m128*)(q[5]+I);,
1089  __m128* _x6 = (__m128*)(q[6]+I);, __m128* _x7 = (__m128*)(q[7]+I);)
1090 
1091  for(int i=0; i<I; i+=4) {
1092  NETX(_ns =*_s0++;, _ns = _mm_add_ps(*_s1++,_ns); ,
1093  _ns = _mm_add_ps(*_s2++,_ns);, _ns = _mm_add_ps(*_s3++,_ns); ,
1094  _ns = _mm_add_ps(*_s4++,_ns);, _ns = _mm_add_ps(*_s5++,_ns); ,
1095  _ns = _mm_add_ps(*_s6++,_ns);, _ns = _mm_add_ps(*_s7++,_ns); )
1096 
1097  NETX(_nx =*_x0++;, _nx = _mm_add_ps(*_x1++,_nx); ,
1098  _nx = _mm_add_ps(*_x2++,_nx);, _nx = _mm_add_ps(*_x3++,_nx); ,
1099  _nx = _mm_add_ps(*_x4++,_nx);, _nx = _mm_add_ps(*_x5++,_nx); ,
1100  _nx = _mm_add_ps(*_x6++,_nx);, _nx = _mm_add_ps(*_x7++,_nx); )
1101 
1102  _ns = _mm_mul_ps(_ns,_k);
1103  _nx = _mm_mul_ps(_nx,_k);
1104  _mk = _mm_and_ps(_mm_cmpgt_ps(*_MK,_0),_1); // event mask
1105  _nm = _mm_and_ps(_mm_cmpgt_ps(_nx,_0),_1); // data norm mask
1106  _nm = _mm_mul_ps(_mk,_nm); // norm x event mask
1107  _EC = _mm_add_ps(_EC,_mm_mul_ps(_nm,*_ec)); // coherent energy
1108  _NC = _mm_add_ps(_NC,_nm); // number of core pixels
1109 
1110  _nm = _mm_sub_ps(_mk,_nm); // halo mask
1111  _ES = _mm_add_ps(_ES,_mm_mul_ps(_nm,*_rn)); // residual sattelite noise
1112 
1113  _rc = _mm_and_ps(_mm_cmplt_ps(*_gn,_2),_1); // rc=1 if gn<2, or rc=0 if gn>=2
1114  _nn = _mm_mul_ps(*_gn,_mm_sub_ps(_1,_rc)); // _nn = [_1 - _rc] * _gn
1115  _rc = _mm_add_ps(_rc,_mm_mul_ps(_nn,o5)); // Ec normalization
1116  _rc = _mm_div_ps(*_ec,_mm_add_ps(_rc,_o)); // normalized EC
1117 
1118  _nm = _mm_and_ps(_mm_cmpgt_ps(_ns,_0),_1); // signal norm mask
1119  _nm = _mm_mul_ps(_mk,_nm); // norm x event mask
1120  *_gn = _mm_mul_ps(_mk,_mm_mul_ps(*_gn,_nx)); // normalize Gaussian noise ecorrection
1121  _SC = _mm_add_ps(_SC,_mm_mul_ps(_nm,*_ec)); // signal coherent energy
1122  _RC = _mm_add_ps(_RC,_mm_mul_ps(_nm, _rc)); // total normalized EC
1123  _GN = _mm_add_ps(_GN,_mm_mul_ps(_nm,*_gn)); // G-noise correction in time domain
1124  _NS = _mm_add_ps(_NS,_nm); // number of signal pixels
1125 
1126  _nm = _mm_sub_ps(_mk,_nm); // satellite mask
1127  _EH = _mm_add_ps(_EH,_mm_mul_ps(_nm,*_et)); // halo energy in TF domain
1128 
1129  _MK++; _gn++; _et++; _ec++; _rn++;
1130  }
1131  float es = _wat_hsum(_ES)/2; // residual satellite energy in time domain
1132  float eh = _wat_hsum(_EH)/2; // halo energy in TF domain
1133  float gn = _wat_hsum(_GN); // G-noise correction
1134  float nc = _wat_hsum(_NC); // number of core pixels
1135  float ns = _wat_hsum(_NS); // number of signal pixels
1136  float rc = _wat_hsum(_RC); // normalized EC x 2
1137  float ec = _wat_hsum(_EC); // signal coherent energy x 2
1138  float sc = _wat_hsum(_SC); // core coherent energy x 2
1139  return _mm256_set_ps(ns,nc,es,eh,rc/(sc+0.01),sc-ec,ec,gn);
1140 }
1141 
1142 
1143 // vec.push_back(m[j]*(fp*(a+A)*(u[j]*cu-U[j]*su)/N[0] + fx*(b+B)*(V[j]*cv+v[j]*sv)/N[1]));
1144 // vec.push_back(m[j]*(fp*(a+A)*(U[j]*cu+u[j]*su)/N[0] + fx*(b+B)*(v[j]*cv-V[j]*sv)/N[1]));
1145 
1146 
1147 #endif // WATAVX_HH
1148 
1149 
1150 
1151 
1152 
1153 
1154 
1155 
1156 
1157 
1158 
1159 
1160 
1161 
1162 
1163 
1164 
detector D
Definition: TestBandPass.C:15
tuple f
Definition: cwb_online.py:91
static float _avx_dpf_ps(double **Fp, double **Fx, int l, std::vector< float * > &pAPN, std::vector< float * > &pAVX, int I)
Definition: watavx.hh:65
wavearray< double > a(hp.size())
WSeries< float > v[nIFO]
Definition: cwb_net.C:62
int n
Definition: cwb_net.C:10
cout<< endl;cout<< "ts size = "<< ts.size()<< " ts rate = "<< ts.rate()<< endl;tf.Forward(ts, wdm);int levels=tf.getLevel();cout<< "tf size = "<< tf.size()<< endl;double dF=tf.resolution();double dT=1./(2 *dF);cout<< "rate(hz) : "<< RATE<< "\t layers : "<< nLAYERS<< "\t dF(hz) : "<< dF<< "\t dT(ms) : "<< dT *1000.<< endl;int itime=TIME_PIXEL_INDEX;int ifreq=FREQ_PIXEL_INDEX;int index=(levels+1)*itime+ifreq;double time=itime *dT;double freq=(ifreq >0)?ifreq *dF:dF/4;cout<< endl;cout<< "PIXEL TIME = "<< time<< " sec "<< endl;cout<< "PIXEL FREQ = "<< freq<< " Hz "<< endl;cout<< endl;wavearray< double > x
i drho i
#define N
cout<< "Selected Pixels : "<< nPix<< endl;wc.cluster(1, 1);SSeries< double > ss
static float _avx_loadata_ps(float **p, float **q, float **u, float **v, float En, std::vector< float * > &pAVX, int I)
Definition: watavx.hh:412
static __m256 _avx_noise_ps(float **p, float **q, std::vector< float * > &pAVX, int I)
Definition: watavx.hh:1054
wavearray< double > h
Definition: Regression_H1.C:25
TCanvas * c1
static __m256 _avx_stat_ps(float **x, float **X, float **s, float **S, std::vector< float * > &pAVX, int I)
Definition: watavx.hh:906
static void _avx_free_ps(std::vector< float * > &v)
Definition: watavx.hh:21
static float _avx_setAMP_ps(float **p, float **q, std::vector< float * > &pAVX, int I)
Definition: watavx.hh:734
static void _avx_loadNULL_ps(float **n, float **N, float **d, float **D, float **h, float **H, int I)
Definition: watavx.hh:485
int k
static double A
Definition: geodesics.cc:8
static float _avx_ort_ps(float **p, float **q, std::vector< float * > &pAVX, int I)
Definition: watavx.hh:813
static float _avx_GW_ps(float **p, float **q, std::vector< float * > &pAPN, float *rr, std::vector< float * > &pAVX, int II)
Definition: watavx.hh:232
double e
static void _avx_cpf_ps(float **p, float **q, float **u, float **v, int I)
Definition: watavx.hh:30
s s
Definition: cwb_net.C:137
static double a2
Definition: geodesics.cc:8
static float _wat_hsum(__m128 y)
Definition: watavx.hh:25
Meyer< double > S(1024, 2)
int l
Definition: cbc_plots.C:434
TCanvas * c2
Definition: slag.C:207
wavearray< double > y
Definition: Test10.C:31
static float _avx_packet_ps(float **p, float **q, std::vector< float * > &pAVX, int I)
Definition: watavx.hh:532
char c3[8]