Coherent WaveBurst Reference Guide
watsse.hh
1 // Wavelet Analysis Tool
2 // S.Klimenko, University of Florida
3 // library of general functions
4 
5 #ifndef WATSSE_HH
6 #define WATSSE_HH
7 
8 #include <xmmintrin.h>
9 #include <pmmintrin.h>
10 #include <iostream>
11 #include <stdlib.h>
12 #include <math.h>
13 #include <TMath.h>
14 #include "wat.hh"
15 
16 // WAT SSE functions
17 
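// Packing convention used throughout this file (inferred from usage; the _NET
// and NETX macros and the XSUM/YSUM helpers are defined in wat.hh): a network
// vector holds one float per detector, packed four detectors per __m128.  The
// first argument of _NET() operates on the first __m128 (detectors 1-4); the
// second argument, acting on the second __m128 (detectors 5-8), is included
// only for larger networks.  Functions with a "4" in the name process 4
// consecutive pixels, i.e. arrays of 4 (or 8) __m128 values.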
18 
19 static inline void _sse_print_ps(__m128* _p) {
20  float x[4];
21  _NET(_mm_storeu_ps(x,_p[0]); printf("%e %e %e %e ",x[0],x[1],x[2],x[3]);,
22  _mm_storeu_ps(x,_p[1]); printf("%e %e %e %e ",x[0],x[1],x[2],x[3]);)
23  cout<<endl;
24 }
25 
26 static inline void _sse_zero_ps(__m128* _p) {
27  _NET(_p[0] = _mm_setzero_ps();,
28  _p[1] = _mm_setzero_ps();)
29  return;
30 }
31 
32 static inline void _sse_load_ps(__m128* _p, float a) {
33  _NET(_p[0] = _mm_load1_ps(&a);,
34  _p[1] = _mm_load1_ps(&a);)
35  return;
36 }
37 
38 static inline void _sse_mul_ps(__m128* _a, float b) {
39  __m128 _b = _mm_load1_ps(&b);
40  _NET(_a[0] = _mm_mul_ps(_a[0],_b);,
41  _a[1] = _mm_mul_ps(_a[1],_b);)
42 }
43 
44 static inline void _sse_mul_ps(__m128* _a, __m128* _b) {
45  _NET(_a[0] = _mm_mul_ps(_a[0],_b[0]);,
46  _a[1] = _mm_mul_ps(_a[1],_b[1]);)
47 }
48 
49 static inline float _sse_mul_ps(__m128* _a, __m128* _b, __m128* _o) {
50  float x[4];
51  float out;
52  _NET(_o[0] = _mm_mul_ps(_a[0],_b[0]);_mm_storeu_ps(x,_o[0]); out = XSUM(x);,
53  _o[1] = _mm_mul_ps(_a[1],_b[1]);_mm_storeu_ps(x,_o[1]); out+= YSUM(x);)
54  return out;
55 }
56 
57 static inline void _sse_mul4_ps(__m128* _am, __m128 _c) {
58  float c[4]; _mm_storeu_ps(c,_c);
59  __m128* _a = _am;
60  _NET(_c=_mm_load1_ps( c );*_a=_mm_mul_ps(*_a,_c);_a++;, *_a=_mm_mul_ps(*_a,_c);_a++;)
61  _NET(_c=_mm_load1_ps(c+1);*_a=_mm_mul_ps(*_a,_c);_a++;, *_a=_mm_mul_ps(*_a,_c);_a++;)
62  _NET(_c=_mm_load1_ps(c+2);*_a=_mm_mul_ps(*_a,_c);_a++;, *_a=_mm_mul_ps(*_a,_c);_a++;)
63  _NET(_c=_mm_load1_ps(c+3);*_a=_mm_mul_ps(*_a,_c);_a++;, *_a=_mm_mul_ps(*_a,_c);_a++;)
64 }
65 
66 static inline void _sse_hard4_ps(__m128* _uu, __m128* _am, __m128* _AM, __m128 _c) {
67 // calculate hard responses
68 // replace standard responses with hard responses if _c=0;
69 // uu - unity vector along f+
70 // am, AM - 00 and 90 phase responses.
71  float c[4]; _mm_storeu_ps(c,_c);
72  __m128* _u = _uu;
73  __m128* _a = _am;
74  __m128* _A = _AM;
75  __m128 _r, _R;
76  _NET(
77  _r = _mm_set1_ps(c[0]); _R = _mm_set1_ps(1-c[0]);
78  *_a = _mm_add_ps(_mm_mul_ps(*_a,_r),_mm_mul_ps(_mm_mul_ps(*_u++,*_a),_R));_a++;*_A = _mm_mul_ps(*_A,_r);_A++;,
79  *_a = _mm_add_ps(_mm_mul_ps(*_a,_r),_mm_mul_ps(_mm_mul_ps(*_u++,*_a),_R));_a++;*_A = _mm_mul_ps(*_A,_r);_A++;
80  )
81  _NET(
82  _r = _mm_set1_ps(c[1]); _R = _mm_set1_ps(1-c[1]);
83  *_a = _mm_add_ps(_mm_mul_ps(*_a,_r),_mm_mul_ps(_mm_mul_ps(*_u++,*_a),_R));_a++;*_A = _mm_mul_ps(*_A,_r);_A++;,
84  *_a = _mm_add_ps(_mm_mul_ps(*_a,_r),_mm_mul_ps(_mm_mul_ps(*_u++,*_a),_R));_a++;*_A = _mm_mul_ps(*_A,_r);_A++;
85  )
86  _NET(
87  _r = _mm_set1_ps(c[2]); _R = _mm_set1_ps(1-c[2]);
88  *_a = _mm_add_ps(_mm_mul_ps(*_a,_r),_mm_mul_ps(_mm_mul_ps(*_u++,*_a),_R));_a++;*_A = _mm_mul_ps(*_A,_r);_A++;,
89  *_a = _mm_add_ps(_mm_mul_ps(*_a,_r),_mm_mul_ps(_mm_mul_ps(*_u++,*_a),_R));_a++;*_A = _mm_mul_ps(*_A,_r);_A++;
90  )
91  _NET(
92  _r = _mm_set1_ps(c[3]); _R = _mm_set1_ps(1-c[3]);
93  *_a = _mm_add_ps(_mm_mul_ps(*_a,_r),_mm_mul_ps(_mm_mul_ps(*_u++,*_a),_R));_a++;*_A = _mm_mul_ps(*_A,_r);_A++;,
94  *_a = _mm_add_ps(_mm_mul_ps(*_a,_r),_mm_mul_ps(_mm_mul_ps(*_u++,*_a),_R));_a++;*_A = _mm_mul_ps(*_A,_r);_A++;
95  )
96 }
97 
98 static inline void _sse_ifcp4_ps(__m128* _aa, __m128* _bb, __m128 _c) {
99 // substitute vector _aa with _bb if _c=0;
100  float c[4]; _mm_storeu_ps(c,_c);
101  __m128* _a = _aa;
102  __m128* _b = _bb;
103  __m128 _1, _0;
104  _NET(_1 = _mm_set1_ps(c[0]); _0 = _mm_set1_ps(1-c[0]);
105  *_a = _mm_add_ps(_mm_mul_ps(*_a,_1),_mm_mul_ps(*_b++,_0));_a++;,
106  *_a = _mm_add_ps(_mm_mul_ps(*_a,_1),_mm_mul_ps(*_b++,_0));_a++;)
107  _NET(_1 = _mm_set1_ps(c[1]); _0 = _mm_set1_ps(1-c[1]);
108  *_a = _mm_add_ps(_mm_mul_ps(*_a,_1),_mm_mul_ps(*_b++,_0));_a++;,
109  *_a = _mm_add_ps(_mm_mul_ps(*_a,_1),_mm_mul_ps(*_b++,_0));_a++;)
110  _NET(_1 = _mm_set1_ps(c[2]); _0 = _mm_set1_ps(1-c[2]);
111  *_a = _mm_add_ps(_mm_mul_ps(*_a,_1),_mm_mul_ps(*_b++,_0));_a++;,
112  *_a = _mm_add_ps(_mm_mul_ps(*_a,_1),_mm_mul_ps(*_b++,_0));_a++;)
113  _NET(_1 = _mm_set1_ps(c[3]); _0 = _mm_set1_ps(1-c[3]);
114  *_a = _mm_add_ps(_mm_mul_ps(*_a,_1),_mm_mul_ps(*_b++,_0));_a++;,
115  *_a = _mm_add_ps(_mm_mul_ps(*_a,_1),_mm_mul_ps(*_b++,_0));_a++;)
116 }
117 
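// Note (inferred from the code above, not part of the original comments):
// _sse_hard4_ps and _sse_ifcp4_ps implement a branchless per-pixel select,
// a = a*c + b*(1-c), so the components of _c are expected to be 0/1 flags.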
118 
119 static inline float _sse_abs_ps(__m128* _a) {
120  float x[4];
121  float out;
122  _NET(
123  _mm_storeu_ps(x,_mm_mul_ps(_a[0],_a[0])); out = XSUM(x);,
124  _mm_storeu_ps(x,_mm_mul_ps(_a[1],_a[1])); out+= YSUM(x);
125  )
126  return out;
127 }
128 
129 static inline float _sse_abs_ps(__m128* _a, __m128* _A) {
130  float x[4];
131  float out;
132  _NET(
133  _mm_storeu_ps(x,_mm_add_ps(_mm_mul_ps(_a[0],_a[0]),_mm_mul_ps(_A[0],_A[0]))); out = XSUM(x);,
134  _mm_storeu_ps(x,_mm_add_ps(_mm_mul_ps(_a[1],_a[1]),_mm_mul_ps(_A[1],_A[1]))); out+= YSUM(x);
135  )
136  return out;
137 }
138 
139 static inline __m128 _sse_abs4_ps(__m128* _p) {
140 // return |p|^2
141  float x[4];
142  float o[4];
143  __m128* _a = _p;
144  _NET(
145  _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[0] = XSUM(x);,
146  _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[0]+= YSUM(x);
147  )
148  _NET(
149  _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[1] = XSUM(x);,
150  _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[1]+= YSUM(x);
151  )
152  _NET(
153  _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[2] = XSUM(x);,
154  _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[2]+= YSUM(x);
155  )
156  _NET(
157  _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[3] = XSUM(x);,
158  _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[3]+= YSUM(x);
159  )
160  return _mm_load_ps(o);
161 }
162 
163 static inline __m128 _sse_div4_ps(__m128* _v, __m128* _u) {
164 // returns |v|/|u|
165  return _mm_sqrt_ps(_mm_div_ps(_sse_abs4_ps(_v),
166  _mm_add_ps(_sse_abs4_ps(_u),
167  _mm_set1_ps(1.e-24))));
168 }
169 
170 
171 static inline __m128 _sse_rnorm4_ps(__m128* _p) {
172 // return reciprocal norm: 1/|p|
173  float x[4];
174  float o[4];
175  __m128* _a = _p;
176  _NET(
177  _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[0] = XSUM(x)+1.e-24;,
178  _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[0]+= YSUM(x);
179  )
180  _NET(
181  _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[1] = XSUM(x)+1.e-24;,
182  _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[1]+= YSUM(x);
183  )
184  _NET(
185  _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[2] = XSUM(x)+1.e-24;,
186  _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[2]+= YSUM(x);
187  )
188  _NET(
189  _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[3] = XSUM(x)+1.e-24;,
190  _mm_storeu_ps(x,_mm_mul_ps(*_a,*_a));_a++;o[3]+= YSUM(x);
191  )
192  return _mm_div_ps(_mm_set1_ps(1.),_mm_sqrt_ps(_mm_load_ps(o)));
193 }
194 
195 static inline float _sse_dot_ps(__m128* _a, __m128* _b) {
196  float x[4];
197  float out;
198  _NET(
199  _mm_storeu_ps(x,_mm_mul_ps(_a[0],_b[0])); out = XSUM(x);,
200  _mm_storeu_ps(x,_mm_mul_ps(_a[1],_b[1])); out+= YSUM(x);
201  )
202  return out;
203 }
204 
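// Usage sketch (illustrative addition, not part of the original header): pack one
// pixel's detector amplitudes from two hypothetical 8-float buffers into the
// two-__m128 layout used above and form their network scalar product.
static inline float _sse_dot_sketch_ps(float* a, float* b) {
   __m128 _a[2], _b[2];
   _a[0] = _mm_loadu_ps(a);   _a[1] = _mm_loadu_ps(a+4);    // detectors 1-4, 5-8
   _b[0] = _mm_loadu_ps(b);   _b[1] = _mm_loadu_ps(b+4);
   return _sse_dot_ps(_a,_b);                               // sum over detectors of a[k]*b[k]
}
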
205 static inline __m128 _sse_dot4_ps(__m128* _p, __m128* _q) {
206  float x[4];
207  float o[4];
208  __m128* _o = (__m128*) o;
209  __m128* _a = _p;
210  __m128* _b = _q;
211  _NET(
212  _mm_storeu_ps(x,_mm_mul_ps(*_a++,*_b++));o[0] = XSUM(x);,
213  _mm_storeu_ps(x,_mm_mul_ps(*_a++,*_b++));o[0]+= YSUM(x);
214  )
215  _NET(
216  _mm_storeu_ps(x,_mm_mul_ps(*_a++,*_b++));o[1] = XSUM(x);,
217  _mm_storeu_ps(x,_mm_mul_ps(*_a++,*_b++));o[1]+= YSUM(x);
218  )
219  _NET(
220  _mm_storeu_ps(x,_mm_mul_ps(*_a++,*_b++));o[2] = XSUM(x);,
221  _mm_storeu_ps(x,_mm_mul_ps(*_a++,*_b++));o[2]+= YSUM(x);
222  )
223  _NET(
224  _mm_storeu_ps(x,_mm_mul_ps(*_a++,*_b++));o[3] = XSUM(x);,
225  _mm_storeu_ps(x,_mm_mul_ps(*_a++,*_b++));o[3]+= YSUM(x);
226  )
227  return *_o;
228 }
229 
230 static inline void _sse_add_ps(__m128* _a, __m128* _b) {
231 // _a += _b
232  _NET(_a[0] = _mm_add_ps(_a[0],_b[0]);,
233  _a[1] = _mm_add_ps(_a[1],_b[1]);)
234  return;
235 }
236 
237 static inline void _sse_add_ps(__m128* _a, __m128* _b, __m128 _c) {
238 // _a += _b *_c
239  _NET(_a[0] = _mm_add_ps(_a[0],_mm_mul_ps(_b[0],_c));,
240  _a[1] = _mm_add_ps(_a[1],_mm_mul_ps(_b[1],_c));)
241  return;
242 }
243 
244 static inline void _sse_add4_ps(__m128* _a, __m128* _b, __m128 _c) {
245 // _a++ += _b++ *c[0]
246 // _a++ += _b++ *c[1]
247 // _a++ += _b++ *c[2]
248 // _a++ += _b++ *c[3]
249  float c[4]; _mm_storeu_ps(c,_c);
250  __m128* _p = _a;
251  __m128* _q = _b;
252  _NET(*_p = _mm_add_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps( c ))); _p++;,
253  *_p = _mm_add_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps( c ))); _p++;)
254  _NET(*_p = _mm_add_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+1))); _p++;,
255  *_p = _mm_add_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+1))); _p++;)
256  _NET(*_p = _mm_add_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+2))); _p++;,
257  *_p = _mm_add_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+2))); _p++;)
258  _NET(*_p = _mm_add_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+3))); _p++;,
259  *_p = _mm_add_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+3))); _p++;)
260  return;
261 }
262 
263 static inline void _sse_sub_ps(__m128* _a, __m128* _b) {
264 // _a -= _b
265  _NET(_a[0] = _mm_sub_ps(_a[0],_b[0]);,
266  _a[1] = _mm_sub_ps(_a[1],_b[1]);)
267  return;
268 }
269 
270 static inline void _sse_sub4_ps(__m128* _a, __m128* _b, __m128 _c) {
271 // _a++ -= _b++ *c[0]
272 // _a++ -= _b++ *c[1]
273 // _a++ -= _b++ *c[2]
274 // _a++ -= _b++ *c[3]
275  float c[4]; _mm_storeu_ps(c,_c);
276  __m128* _p = _a;
277  __m128* _q = _b;
278  _NET(*_p = _mm_sub_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps( c ))); _p++;,
279  *_p = _mm_sub_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps( c ))); _p++;)
280  _NET(*_p = _mm_sub_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+1))); _p++;,
281  *_p = _mm_sub_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+1))); _p++;)
282  _NET(*_p = _mm_sub_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+2))); _p++;,
283  *_p = _mm_sub_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+2))); _p++;)
284  _NET(*_p = _mm_sub_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+3))); _p++;,
285  *_p = _mm_sub_ps(*_p,_mm_mul_ps(*_q++,_mm_load1_ps(c+3))); _p++;)
286  return;
287 }
288 
289 static inline void _sse_cpf_ps(float* a, __m128* _p) {
290  _NET(_mm_storeu_ps(a,*_p);,
291  _mm_storeu_ps(a+4,*(_p+1));)
292  return;
293 }
294 
295 static inline void _sse_cpf_ps(__m128* _a, __m128* _p) {
296  _NET(*_a = *_p;, *(_a+1) = *(_p+1);)
297 }
298 
299 static inline void _sse_cpf4_ps(__m128* _aa, __m128* _pp) {
300  __m128* _a = _aa;
301  __m128* _p = _pp;
302  _NET(*_a++ = *_p++;, *_a++ = *_p++;)
303  _NET(*_a++ = *_p++;, *_a++ = *_p++;)
304  _NET(*_a++ = *_p++;, *_a++ = *_p++;)
305  _NET(*_a++ = *_p++;, *_a++ = *_p++;)
306 }
307 
308 
309 static inline void _sse_cpf_ps(float* a, __m128* _p, float b) {
310  __m128 _b = _mm_load1_ps(&b);
311  _NET(_mm_storeu_ps(a,_mm_mul_ps(*_p,_b));,
312  _mm_storeu_ps(a+4,_mm_mul_ps(*(_p+1),_b));)
313  return;
314 }
315 
316 static inline void _sse_cpf4_ps(float* aa, __m128* _pp) {
317 // copy data from pp to aa for 4 consecutive pixels
318  float* a = aa;
319  __m128* _p = _pp;
320 
321  _NET(_mm_storeu_ps(a,*_p++); a+=4;,
322  _mm_storeu_ps(a,*_p++); a+=4;)
323  _NET(_mm_storeu_ps(a,*_p++); a+=4;,
324  _mm_storeu_ps(a,*_p++); a+=4;)
325  _NET(_mm_storeu_ps(a,*_p++); a+=4;,
326  _mm_storeu_ps(a,*_p++); a+=4;)
327  _NET(_mm_storeu_ps(a,*_p++); a+=4;,
328  _mm_storeu_ps(a,*_p++); a+=4;)
329 
330  return;
331 }
332 
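// Usage sketch (illustrative addition, not part of the original header): scale 4
// packed pixels by hypothetical per-pixel factors and unpack the result into a
// float buffer assumed large enough for the 8-detector layout (32 floats).
static inline void _sse_scale4_sketch_ps(__m128* _pix, float* out) {
   __m128 _c = _mm_setr_ps(1.f, 2.f, 0.5f, 0.f);   // placeholder per-pixel factors
   _sse_mul4_ps(_pix,_c);                          // pixel k is multiplied by c[k]
   _sse_cpf4_ps(out,_pix);                         // copy packed pixels to float memory
}
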
333 static inline void _sse_cpf4_ps(__m128* _aa, __m128* _pp, __m128 _c) {
334 // multiply p by c and copy data from p to a for 4 consecutive pixels
335 // calculate _a[0]=_p[0]*c[0]
336 // calculate _a[1]=_p[1]*c[1]
337 // calculate _a[2]=_p[2]*c[2]
338 // calculate _a[3]=_p[3]*c[3]
339  float c[4]; _mm_storeu_ps(c,_c);
340  __m128* _a = _aa;
341  __m128* _p = _pp;
342 
343  _NET(*_a++ = _mm_mul_ps(*_p++,_mm_load1_ps( c ));,
344  *_a++ = _mm_mul_ps(*_p++,_mm_load1_ps( c ));)
345  _NET(*_a++ = _mm_mul_ps(*_p++,_mm_load1_ps(c+1));,
346  *_a++ = _mm_mul_ps(*_p++,_mm_load1_ps(c+1));)
347  _NET(*_a++ = _mm_mul_ps(*_p++,_mm_load1_ps(c+2));,
348  *_a++ = _mm_mul_ps(*_p++,_mm_load1_ps(c+2));)
349  _NET(*_a++ = _mm_mul_ps(*_p++,_mm_load1_ps(c+3));,
350  *_a++ = _mm_mul_ps(*_p++,_mm_load1_ps(c+3));)
351  return;
352 }
353 
354 static inline float _sse_nrg_ps(__m128* _u, float c, __m128* _v, float s, __m128* _a) {
355 // calculate b = (a - u*c - v*s) and return b*b/2
356  float x[4];
357  float out;
358  __m128 _b;
359  __m128 _c = _mm_load1_ps(&c);
360  __m128 _s = _mm_load1_ps(&s);
361 
362  _NET(_b = _mm_sub_ps(_a[0], _mm_add_ps(_mm_mul_ps(*_u,_c), _mm_mul_ps(*_v,_s)));
363  _mm_storeu_ps(x,_mm_mul_ps(_b,_b)); out = XSUM(x);,
364  _b = _mm_sub_ps(_a[1], _mm_add_ps(_mm_mul_ps(*(_u+1),_c), _mm_mul_ps(*(_v+1), _s)));
365  _mm_storeu_ps(x,_mm_mul_ps(_b,_b)); out+= YSUM(x);)
366 
367  return out/2.;
368 }
369 
370 static inline void _sse_rotadd_ps(__m128* _u, float c, __m128* _v, float s, __m128* _a) {
371 // calculate a += u*c + v*s
372  __m128 _c = _mm_load1_ps(&c);
373  __m128 _s = _mm_load1_ps(&s);
374  _NET(
375  _a[0] = _mm_add_ps(_a[0], _mm_add_ps(_mm_mul_ps(_u[0],_c), _mm_mul_ps(_v[0],_s)));,
376  _a[1] = _mm_add_ps(_a[1], _mm_add_ps(_mm_mul_ps(_u[1],_c), _mm_mul_ps(_v[1],_s)));
377  )
378  return;
379 }
380 
381 static inline float _sse_rotsub_ps(__m128* _u, float c, __m128* _v, float s, __m128* _a) {
382 // calculate a -= u*c + v*s and return a*a
383  float x[4];
384  float out;
385  __m128 _c = _mm_load1_ps(&c);
386  __m128 _s = _mm_load1_ps(&s);
387 
388  _NET(
389  _a[0] = _mm_sub_ps(_a[0], _mm_add_ps(_mm_mul_ps(_u[0],_c), _mm_mul_ps(_v[0],_s)));
390  _mm_storeu_ps(x,_mm_mul_ps(_a[0],_a[0])); out = XSUM(x);,
391  _a[1] = _mm_sub_ps(_a[1], _mm_add_ps(_mm_mul_ps(_u[1],_c), _mm_mul_ps(_v[1], _s)));
392  _mm_storeu_ps(x,_mm_mul_ps(_a[1],_a[1])); out+= YSUM(x);
393  )
394 
395  return out;
396 }
397 
398 static inline void _sse_rotp_ps(__m128* u, float* c, __m128* v, float* s, __m128* a) {
399 // calculate a = u*c + v*s
400  _NET(
401  a[0] = _mm_add_ps(_mm_mul_ps(u[0],_mm_load1_ps(c)), _mm_mul_ps(v[0],_mm_load1_ps(s)));,
402  a[1] = _mm_add_ps(_mm_mul_ps(u[1],_mm_load1_ps(c)), _mm_mul_ps(v[1],_mm_load1_ps(s)));
403  )
404 }
405 
406 static inline void _sse_rotm_ps(__m128* u, float* c, __m128* v, float* s, __m128* a) {
407 // calculate a = u*c - v*s
408  _NET(
409  a[0] = _mm_sub_ps(_mm_mul_ps(u[0],_mm_load1_ps(c)), _mm_mul_ps(v[0],_mm_load1_ps(s)));,
410  a[1] = _mm_sub_ps(_mm_mul_ps(u[1],_mm_load1_ps(c)), _mm_mul_ps(v[1],_mm_load1_ps(s)));
411  )
412 }
413 
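// Illustrative addition (not part of the original header): _sse_rotp_ps and
// _sse_rotm_ps together apply a plane rotation with cosine c and sine s to the
// vector pair (u,v), the same pattern used below by _sse_dpf4_ps and _sse_dsp4_ps.
static inline void _sse_rot_pair_sketch_ps(__m128* _u, __m128* _v, float c, float s,
                                           __m128* _a, __m128* _b) {
   _sse_rotp_ps(_u,&c,_v,&s,_a);   // a = u*c + v*s
   _sse_rotm_ps(_v,&c,_u,&s,_b);   // b = v*c - u*s
}
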
414 static inline __m128 _sse_rotp_ps(__m128 _u, __m128 _c, __m128 _v, __m128 _s) {
415 // return u*c + v*s
416  return _mm_add_ps(_mm_mul_ps(_u,_c), _mm_mul_ps(_v,_s));
417 }
418 
419 static inline __m128 _sse_rotm_ps(__m128 _u, __m128 _c, __m128 _v, __m128 _s) {
420 // return u*c - v*s
421  return _mm_sub_ps(_mm_mul_ps(_u,_c), _mm_mul_ps(_v,_s));
422 }
423 
424 static inline void _sse_rot4p_ps(__m128* _u, __m128* _c, __m128* _v, __m128* _s, __m128* _a) {
425 // calculate a[0] = u[0]*c[0] + v[0]*s[0]
426 // calculate a[1] = u[1]*c[1] + v[1]*s[1]
427 // calculate a[2] = u[2]*c[2] + v[2]*s[2]
428 // calculate a[3] = u[3]*c[3] + v[3]*s[3]
429  float c[4];
430  float s[4];
431  _mm_storeu_ps(c,*_c);
432  _mm_storeu_ps(s,*_s);
433  __m128* u = _u;
434  __m128* v = _v;
435  __m128* a = _a;
436  _NET(
437  *a++ = _mm_add_ps(_mm_mul_ps(*u++,_mm_load1_ps( c )), _mm_mul_ps(*v++,_mm_load1_ps( s )));,
438  *a++ = _mm_add_ps(_mm_mul_ps(*u++,_mm_load1_ps( c )), _mm_mul_ps(*v++,_mm_load1_ps( s )));
439  )
440  _NET(
441  *a++ = _mm_add_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+1)), _mm_mul_ps(*v++,_mm_load1_ps(s+1)));,
442  *a++ = _mm_add_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+1)), _mm_mul_ps(*v++,_mm_load1_ps(s+1)));
443  )
444  _NET(
445  *a++ = _mm_add_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+2)), _mm_mul_ps(*v++,_mm_load1_ps(s+2)));,
446  *a++ = _mm_add_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+2)), _mm_mul_ps(*v++,_mm_load1_ps(s+2)));
447  )
448  _NET(
449  *a++ = _mm_add_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+3)), _mm_mul_ps(*v++,_mm_load1_ps(s+3)));,
450  *a++ = _mm_add_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+3)), _mm_mul_ps(*v++,_mm_load1_ps(s+3)));
451  )
452 }
453 
454 static inline void _sse_rot4m_ps(__m128* _u, __m128* _c, __m128* _v, __m128* _s, __m128* _a) {
455 // calculate a[0] = u[0]*c[0] - v[0]*s[0]
456 // calculate a[1] = u[1]*c[1] - v[1]*s[1]
457 // calculate a[2] = u[2]*c[2] - v[2]*s[2]
458 // calculate a[3] = u[3]*c[3] - v[3]*s[3]
459  float c[4];
460  float s[4];
461  _mm_storeu_ps(c,*_c);
462  _mm_storeu_ps(s,*_s);
463  __m128* u = _u;
464  __m128* v = _v;
465  __m128* a = _a;
466  _NET(
467  *a++ = _mm_sub_ps(_mm_mul_ps(*u++,_mm_load1_ps( c )), _mm_mul_ps(*v++,_mm_load1_ps( s )));,
468  *a++ = _mm_sub_ps(_mm_mul_ps(*u++,_mm_load1_ps( c )), _mm_mul_ps(*v++,_mm_load1_ps( s )));
469  )
470  _NET(
471  *a++ = _mm_sub_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+1)), _mm_mul_ps(*v++,_mm_load1_ps(s+1)));,
472  *a++ = _mm_sub_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+1)), _mm_mul_ps(*v++,_mm_load1_ps(s+1)));
473  )
474  _NET(
475  *a++ = _mm_sub_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+2)), _mm_mul_ps(*v++,_mm_load1_ps(s+2)));,
476  *a++ = _mm_sub_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+2)), _mm_mul_ps(*v++,_mm_load1_ps(s+2)));
477  )
478  _NET(
479  *a++ = _mm_sub_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+3)), _mm_mul_ps(*v++,_mm_load1_ps(s+3)));,
480  *a++ = _mm_sub_ps(_mm_mul_ps(*u++,_mm_load1_ps(c+3)), _mm_mul_ps(*v++,_mm_load1_ps(s+3)));
481  )
482 }
483 
484 static inline void _sse_point_ps(__m128** _p, float** p, short** m, int l, int n) {
485 // point 0-7 __m128 pointers to first network pixel
486  NETX(_p[0] = (__m128*) (p[0] + m[0][l]*n);,
487  _p[1] = (__m128*) (p[1] + m[1][l]*n);,
488  _p[2] = (__m128*) (p[2] + m[2][l]*n);,
489  _p[3] = (__m128*) (p[3] + m[3][l]*n);,
490  _p[4] = (__m128*) (p[4] + m[4][l]*n);,
491  _p[5] = (__m128*) (p[5] + m[5][l]*n);,
492  _p[6] = (__m128*) (p[6] + m[6][l]*n);,
493  _p[7] = (__m128*) (p[7] + m[7][l]*n);)
494  return;
495 }
496 
497 static inline __m128 _sse_sum_ps(__m128** _p) {
498  __m128 _q = _mm_setzero_ps();
499  NETX(_q = _mm_add_ps(_q, *_p[0]);,
500  _q = _mm_add_ps(_q, *_p[1]);,
501  _q = _mm_add_ps(_q, *_p[2]);,
502  _q = _mm_add_ps(_q, *_p[3]);,
503  _q = _mm_add_ps(_q, *_p[4]);,
504  _q = _mm_add_ps(_q, *_p[5]);,
505  _q = _mm_add_ps(_q, *_p[6]);,
506  _q = _mm_add_ps(_q, *_p[7]);)
507  return _q;
508 }
509 
510 static inline __m128 _sse_cut_ps(__m128* _pE, __m128** _pe, __m128 _Es, __m128 _cmp) {
511  NETX(_cmp = _mm_and_ps(_cmp,_mm_cmpge_ps(_mm_sub_ps(*_pE, *_pe[0]++),_Es));,
512  _cmp = _mm_and_ps(_cmp,_mm_cmpge_ps(_mm_sub_ps(*_pE, *_pe[1]++),_Es));,
513  _cmp = _mm_and_ps(_cmp,_mm_cmpge_ps(_mm_sub_ps(*_pE, *_pe[2]++),_Es));,
514  _cmp = _mm_and_ps(_cmp,_mm_cmpge_ps(_mm_sub_ps(*_pE, *_pe[3]++),_Es));,
515  _cmp = _mm_and_ps(_cmp,_mm_cmpge_ps(_mm_sub_ps(*_pE, *_pe[4]++),_Es));,
516  _cmp = _mm_and_ps(_cmp,_mm_cmpge_ps(_mm_sub_ps(*_pE, *_pe[5]++),_Es));,
517  _cmp = _mm_and_ps(_cmp,_mm_cmpge_ps(_mm_sub_ps(*_pE, *_pe[6]++),_Es));,
518  _cmp = _mm_and_ps(_cmp,_mm_cmpge_ps(_mm_sub_ps(*_pE, *_pe[7]++),_Es));)
519  return _cmp;
520 }
521 
522 static inline void _sse_minSNE_ps(__m128* _pE, __m128** _pe, __m128* _es) {
523 // put pixel minimum subnetwork energy in _es
524 // input _es should be initialized to _pE before call
525  NETX(*_es = _mm_min_ps(*_es,_mm_sub_ps(*_pE, *_pe[0]++));,
526  *_es = _mm_min_ps(*_es,_mm_sub_ps(*_pE, *_pe[1]++));,
527  *_es = _mm_min_ps(*_es,_mm_sub_ps(*_pE, *_pe[2]++));,
528  *_es = _mm_min_ps(*_es,_mm_sub_ps(*_pE, *_pe[3]++));,
529  *_es = _mm_min_ps(*_es,_mm_sub_ps(*_pE, *_pe[4]++));,
530  *_es = _mm_min_ps(*_es,_mm_sub_ps(*_pE, *_pe[5]++));,
531  *_es = _mm_min_ps(*_es,_mm_sub_ps(*_pE, *_pe[6]++));,
532  *_es = _mm_min_ps(*_es,_mm_sub_ps(*_pE, *_pe[7]++));
533  )
534 }
535 
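// Note (inferred from usage, not part of the original comments): _sse_point_ps
// sets up an array of up to 8 per-detector __m128 pointers into the pixel energy
// maps; _sse_sum_ps, _sse_cut_ps and _sse_minSNE_ps then consume them, with
// _sse_cut_ps and _sse_minSNE_ps post-incrementing each detector pointer so that
// repeated calls walk through consecutive groups of 4 pixels.
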
536 static inline float _sse_maxE_ps(__m128* _a, __m128* _A) {
537 // given input 00 and 90 data vectors
538 // returns energy of dominant detector (max energy)
539  float out;
540  float x[4];
541  __m128 _o1;
542  __m128 _o2 = _mm_setzero_ps();
543  _NET(
544  _o1 = _mm_add_ps(_mm_mul_ps(_a[0],_a[0]),_mm_mul_ps(_A[0],_A[0]));,
545  _o2 = _mm_add_ps(_mm_mul_ps(_a[1],_a[1]),_mm_mul_ps(_A[1],_A[1]));
546  )
547  _o1 = _mm_max_ps(_o1,_o2); _mm_storeu_ps(x,_o1); out=x[0];
548  if(out<x[1]) out=x[1];
549  if(out<x[2]) out=x[2];
550  if(out<x[3]) out=x[3];
551  return out;
552 }
553 
554 static inline void _sse_ort4_ps(__m128* _u, __m128* _v, __m128* _s, __m128* _c) {
555 // compute the rotation phase which makes vectors _u and _v orthogonal
556 // (the rotation itself is applied by the caller, e.g. via _sse_rot4p_ps/_sse_rot4m_ps)
557 // fill in sin and cos in _s and _c respectively
558  static const __m128 sm = _mm_set1_ps(-0.f); // -0.f = 1 << 31
559  static const __m128 _o = _mm_set1_ps(1.e-24);
560  static const __m128 _0 = _mm_set1_ps(0.);
561  static const __m128 _1 = _mm_set1_ps(1.);
562  static const __m128 _2 = _mm_set1_ps(2.);
563  __m128 _n,_m,gI,gR,_p,_q;
564  gI = _mm_mul_ps(_sse_dot4_ps(_u,_v),_2); // ~sin(2*psi) or 2*u*v
565  gR = _mm_sub_ps(_sse_dot4_ps(_u,_u),_sse_dot4_ps(_v,_v)); // u^2-v^2
566  _p = _mm_and_ps(_mm_cmpge_ps(gR,_0),_1); // 1 if gR>0. or 0 if gR<0.
567  _q = _mm_sub_ps(_1,_p); // 0 if gR>0. or 1 if gR<0.
568  _n = _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(gI,gI),_mm_mul_ps(gR,gR))); // gc
569  gR = _mm_add_ps(_mm_andnot_ps(sm,gR),_mm_add_ps(_n,_o)); // gc+|gR|+eps
570  _n = _mm_add_ps(_mm_mul_ps(_2,_n),_o); // 2*gc + eps
571  gI = _mm_div_ps(gI,_n); // sin(2*psi)
572  _n = _mm_sqrt_ps(_mm_div_ps(gR,_n)); // sqrt((gc+|gR|)/(2gc+eps))
573  _m = _mm_and_ps(_mm_cmpge_ps(gI,_0),_1); // 1 if gI>0. or 0 if gI<0.
574  _m = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_m,_2),_1),_n); // _n if gI>0 or -_n if gI<0
575  *_s = _mm_add_ps(_mm_mul_ps(_q,_m),_mm_mul_ps(_p,_mm_div_ps(gI,_n))); // sin(psi)
576  gI = _mm_andnot_ps(sm,gI); // |gI|
577  *_c = _mm_add_ps(_mm_mul_ps(_p,_n),_mm_mul_ps(_q,_mm_div_ps(gI,_n))); // cos(psi)
578  return;
579 }
580 
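// Scalar reference (illustrative addition, not part of the original header): per
// pixel the rotation computed above is, up to the 1e-24 regulators, the angle
// psi = 0.5*atan2(2*(u,v), |u|^2-|v|^2), returned as sin(psi), cos(psi) with
// cos(psi) >= 0.  A plain-float version for a single pixel:
static inline void _sse_ort_scalar_sketch(float uv, float uu, float vv, float* s, float* c) {
   float psi = 0.5f*atan2f(2.f*uv, uu-vv);   // uv=(u,v), uu=|u|^2, vv=|v|^2
   *s = sinf(psi);                           // |psi| <= pi/2, hence cos(psi) >= 0
   *c = cosf(psi);
}
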
581 static inline void _sse_ort4_ps(__m128* _s, __m128* _c, __m128 _r) {
582 // solves equation sin(2a) * _c - cos(2a) * _s = 0
583 // input: _s ~ sin(2a), _c ~ cos(2a) (not normalized), _r - regulator
584 // regularized _C = |_c| + (1+_r)*eps
585 // norm: _n = sqrt(_s*_s+_C*_C)
586 // cos(2a) = _C/_n
587 // sin(2a) = _s/_n
588 // output _s = sin(a), _c = cos(a)
589 // algorithm:
590 // _c>0: cos(a) = sqrt([1+cos(2a)]/2); sin(a) = sin(2a)/2/cos(a)
591 // _c<0: sin(a) = sign[_s]*sqrt([1+cos(2a)]/2); cos(a) = |sin(2a)|/2/|sin(a)|
592  static const __m128 sm = _mm_set1_ps(-0.f); // -0.f = 1 << 31
593  static const __m128 _0 = _mm_set1_ps(0.);
594  static const __m128 _5 = _mm_set1_ps(0.5);
595  static const __m128 _1 = _mm_set1_ps(1.);
596  static const __m128 _2 = _mm_set1_ps(2.);
597  __m128 _n,_m,_C,_S,_p,_q;
598  _r = _mm_mul_ps(_mm_add_ps(_1,_r),_mm_set1_ps(1.e-6)); // regulator
599  _m = _mm_and_ps(_mm_cmpge_ps(*_s,_0),_1); // 1 if _S>0. or 0 if _S<0.
600  _m = _mm_sub_ps(_mm_mul_ps(_m,_2),_1); // sign(_s)
601  _p = _mm_and_ps(_mm_cmpge_ps(*_c,_0),_1); // 1 if _c>0. or 0 if _c<0.
602  _q = _mm_sub_ps(_1,_p); // 0 if _c>0. or 1 if _c<0.
603  _C = _mm_add_ps(_mm_andnot_ps(sm,*_c),_r); // |_c|+regulator
604  _n = _mm_add_ps(_mm_mul_ps(*_s,*_s),_mm_mul_ps(_C,_C)); // norm^2 for sin(2a) and cos(2a)
605  _n = _mm_div_ps(_1,_mm_mul_ps(_mm_sqrt_ps(_n),_2)); // 0.5/norm for sin(2a) and cos(2a)
606  _C = _mm_sqrt_ps(_mm_add_ps(_5,_mm_mul_ps(_C,_n))); // |cos(a)|
607  _S = _mm_div_ps(_mm_mul_ps(*_s,_n),_C); // sin(a)*cos(a)/|cos(a)|
608  *_s = _mm_add_ps(_mm_mul_ps(_p,_S),_mm_mul_ps(_q,_mm_mul_ps(_C,_m))); // sin(psi)
609  *_c = _mm_add_ps(_mm_mul_ps(_p,_C),_mm_mul_ps(_q,_mm_mul_ps(_S,_m))); // cos(psi)
610  return;
611 }
612 
613 static inline void _sse_dpf4_ps(__m128* _Fp, __m128* _Fx, __m128* _fp, __m128* _fx) {
614 // transformation to DPF for 4 consecutive pixels.
615 // rotate vectors Fp and Fx into DPF: fp and fx
616  __m128 _c, _s;
617  _sse_ort4_ps(_Fp,_Fx,&_s,&_c); // get sin and cos
618  _sse_rot4p_ps(_Fp,&_c,_Fx,&_s,_fp); // get fp=Fp*c+Fx*s
619  _sse_rot4m_ps(_Fx,&_c,_Fp,&_s,_fx); // get fx=Fx*c-Fp*s
620  return;
621 }
622 
623 static inline void _sse_pnp4_ps(__m128* _fp, __m128* _fx, __m128* _am, __m128* _AM, __m128* _u, __m128* _v) {
624 // projection to network plane (pnp)
625 // _fp and _fx must be in DPF
626 // project vectors _am and _AM on the network plane _fp,_fx
627 // the projections are returned in the output vectors _u and _v
628 // c = xp/gp - cos of rotation to PCF
629 // s = xx/gx - sin of rotation to PCF
630  static const __m128 _o = _mm_set1_ps(1.e-24);
631  static const __m128 _1 = _mm_set1_ps(1.0);
632  __m128 gp = _mm_div_ps(_1,_mm_add_ps(_sse_dot4_ps(_fp,_fp),_o)); // 1/fp*fp
633  _sse_sub4_ps(_fx,_fp,_mm_mul_ps(gp,_sse_dot4_ps(_fp,_fx))); // check fx _|_ to fp
634  __m128 gx = _mm_div_ps(_1,_mm_add_ps(_sse_dot4_ps(_fx,_fx),_o)); // 1/fx*fx
635  __m128 cc = _mm_mul_ps(_sse_dot4_ps(_fp,_am),gp); // cos
636  __m128 ss = _mm_mul_ps(_sse_dot4_ps(_fx,_am),gx); // sin
637  _sse_rot4p_ps(_fp,&cc,_fx,&ss,_u); // get vector u
638  cc = _mm_mul_ps(_sse_dot4_ps(_fp,_AM),gp); // cos
639  ss = _mm_mul_ps(_sse_dot4_ps(_fx,_AM),gx); // sin
640  _sse_rot4p_ps(_fp,&cc,_fx,&ss,_v); // get vector v
641  return;
642 }
643 
644 static inline void _sse_dsp4_ps(__m128* u, __m128* v, __m128* _am, __m128* _AM, __m128* _u, __m128* _v) {
645 // dual stream phase (dsp) transformation
646 // take projection vectors u and v,
647 // make them orthogonal, calculate dual stream phase
648 // apply phase transformation both to data and projections
649  __m128 _c, _s;
650  _sse_ort4_ps(u,v,&_s,&_c); // get sin and cos
651  _sse_rot4p_ps(u,&_c,v,&_s,_u); // get 00 response
652  _sse_rot4m_ps(v,&_c,u,&_s,_v); // get 90 response
653  _sse_rot4p_ps(_am,&_c,_AM,&_s,u); // get 00 data vector
654  _sse_rot4m_ps(_AM,&_c,_am,&_s,v); // get 90 data vector
655  _sse_cpf4_ps(_am,u);
656  _sse_cpf4_ps(_AM,v);
657  return;
658 }
659 
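// Pipeline sketch (illustrative addition, not part of the original header): the
// ordering of the 4-pixel helpers suggested by their comments: rotate the
// antenna patterns into DPF, project the 00/90 data onto the network plane,
// then apply the dual-stream phase transformation.  Arrays are sized for the
// 8-detector layout.
static inline void _sse_dpf_pipeline_sketch_ps(__m128* _Fp, __m128* _Fx,
                                               __m128* _am, __m128* _AM,
                                               __m128* _u00, __m128* _v90) {
   __m128 _fp[8], _fx[8];                    // antenna patterns in DPF
   __m128 _u[8], _v[8];                      // projections on the network plane
   _sse_dpf4_ps(_Fp,_Fx,_fp,_fx);            // Fp,Fx -> fp,fx (DPF)
   _sse_pnp4_ps(_fp,_fx,_am,_AM,_u,_v);      // project am,AM on the (fp,fx) plane
   _sse_dsp4_ps(_u,_v,_am,_AM,_u00,_v90);    // 00/90 responses; am,AM rotated in place
}
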
660 static inline __m128 _sse_ei4_ps(__m128* _u, __m128 _L) {
661 // returns incoherent energy for vector u
662 // calculates sum: u[k]*u[k]*u[k]*u[k]/L
663 // where L should be |u|^2
664  float x[4];
665  float o[4];
666  __m128* _a = _u;
667  __m128 _c;
668  _NET(
669  _c = _mm_mul_ps(*_a,*_a);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));_a++;o[0] = XSUM(x);,
670  _c = _mm_mul_ps(*_a,*_a);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));_a++;o[0]+= YSUM(x);
671  )
672  _NET(
673  _c = _mm_mul_ps(*_a,*_a);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));_a++;o[1] = XSUM(x);,
674  _c = _mm_mul_ps(*_a,*_a);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));_a++;o[1]+= YSUM(x);
675  )
676  _NET(
677  _c = _mm_mul_ps(*_a,*_a);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));_a++;o[2] = XSUM(x);,
678  _c = _mm_mul_ps(*_a,*_a);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));_a++;o[2]+= YSUM(x);
679  )
680  _NET(
681  _c = _mm_mul_ps(*_a,*_a);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));_a++;o[3] = XSUM(x);,
682  _c = _mm_mul_ps(*_a,*_a);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));_a++;o[3]+= YSUM(x);
683  )
684  return _mm_div_ps(_mm_load_ps(o),_mm_add_ps(_L,_mm_set1_ps(1.e-12)));
685 }
686 
687 static inline __m128 _sse_ei4xx_ps(__m128* _x, __m128* _u, __m128 _L) {
688 // returns incoherent energy for vectors x,u
689 // _x - data vector
690 // _u - projection vector on the network plane
691 // calculates sum: x[k]*x[k]*u[k]*u[k]/L
692  float x[4];
693  float o[4];
694  __m128* _a = _x;
695  __m128* _b = _u;
696  __m128 _c;
697  _NET(
698  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[0] = XSUM(x);,
699  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[0]+= YSUM(x);
700  )
701  _NET(
702  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[1] = XSUM(x);,
703  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[1]+= YSUM(x);
704  )
705  _NET(
706  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[2] = XSUM(x);,
707  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[2]+= YSUM(x);
708  )
709  _NET(
710  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[3] = XSUM(x);,
711  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[3]+= YSUM(x);
712  )
713  return _mm_div_ps(_mm_load_ps(o),_mm_add_ps(_L,_mm_set1_ps(1.e-12)));
714 }
715 
716 static inline __m128 _sse_ei4xu_ps(__m128* _x, __m128* _u, __m128 _L) {
717 // returns incoherent energy for vectors x,u
718 // _x - data vector
719 // _u - projection vector on the network plane
720 // calculates sum: x[k]*u[k]*u[k]*u[k]/L
721  float x[4];
722  float o[4];
723  __m128* _a = _x;
724  __m128* _b = _u;
725  __m128 _c;
726  _NET(
727  _c = _mm_mul_ps(*_b,*_b);_mm_storeu_ps(x,_mm_mul_ps(_c,_mm_mul_ps(*_a++,*_b++)));o[0] = XSUM(x);,
728  _c = _mm_mul_ps(*_b,*_b);_mm_storeu_ps(x,_mm_mul_ps(_c,_mm_mul_ps(*_a++,*_b++)));o[0]+= YSUM(x);
729  )
730  _NET(
731  _c = _mm_mul_ps(*_b,*_b);_mm_storeu_ps(x,_mm_mul_ps(_c,_mm_mul_ps(*_a++,*_b++)));o[1] = XSUM(x);,
732  _c = _mm_mul_ps(*_b,*_b);_mm_storeu_ps(x,_mm_mul_ps(_c,_mm_mul_ps(*_a++,*_b++)));o[1]+= YSUM(x);
733  )
734  _NET(
735  _c = _mm_mul_ps(*_b,*_b);_mm_storeu_ps(x,_mm_mul_ps(_c,_mm_mul_ps(*_a++,*_b++)));o[2] = XSUM(x);,
736  _c = _mm_mul_ps(*_b,*_b);_mm_storeu_ps(x,_mm_mul_ps(_c,_mm_mul_ps(*_a++,*_b++)));o[2]+= YSUM(x);
737  )
738  _NET(
739  _c = _mm_mul_ps(*_b,*_b);_mm_storeu_ps(x,_mm_mul_ps(_c,_mm_mul_ps(*_a++,*_b++)));o[3] = XSUM(x);,
740  _c = _mm_mul_ps(*_b,*_b);_mm_storeu_ps(x,_mm_mul_ps(_c,_mm_mul_ps(*_a++,*_b++)));o[3]+= YSUM(x);
741  )
742  return _mm_div_ps(_mm_load_ps(o),_mm_add_ps(_L,_mm_set1_ps(1.e-12)));
743 }
744 
745 static inline __m128 _sse_null4_ps(__m128* _p, __m128* _q) {
746 // returns global NULL energy: |E-L-Ei+Li| + |Ei-Li|
747 // E = |p|^2; Ei = sum{p[k]^4}/|p|^2
748 // L = |q|^2; Li = sum{q[k]^4}/|q|^2
749  __m128 _sm = _mm_set1_ps(-0.f); // sign mask: -0.f = 1 << 31
750  __m128 _pe = _sse_abs4_ps(_p); // get total energy for _p
751  __m128 _pi = _sse_ei4_ps(_p,_pe); // get incoherent energy for _p
752  __m128 _qe = _sse_abs4_ps(_q); // get total energy for _q
753  __m128 _qi = _sse_ei4_ps(_q,_qe); // get incoherent energy for _q
754  _pi = _mm_sub_ps(_pi,_qi); // Ei-Li
755  _pe = _mm_sub_ps(_mm_sub_ps(_pe,_qe),_pi); // (E-L) - (Ei-Li)
756  return _mm_add_ps(_mm_andnot_ps(_sm,_pi),_mm_andnot_ps(_sm,_pe));
757 }
758 
759 
760 static inline __m128 _sse_ecoh4_ps(__m128* _p, __m128* _q, __m128 _L) {
761 // returns coherent energy given
762 // _p - data vector
763 // _q - network response
764 // calculates vector: u[k] = p[k]*q[k]
765 // returns: L - sum_k{u[k]*u[k]}/L,
766 // where L should be (q,q)^2
767  float x[4];
768  float o[4];
769  __m128* _a = _p;
770  __m128* _b = _q;
771  __m128 _c;
772  _NET(
773  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[0] = XSUM(x);,
774  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[0]+= YSUM(x);
775  )
776  _NET(
777  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[1] = XSUM(x);,
778  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[1]+= YSUM(x);
779  )
780  _NET(
781  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[2] = XSUM(x);,
782  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[2]+= YSUM(x);
783  )
784  _NET(
785  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[3] = XSUM(x);,
786  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[3]+= YSUM(x);
787  )
788  return _mm_sub_ps(_L,_mm_div_ps(_mm_load_ps(o),_mm_add_ps(_L,_mm_set1_ps(1.e-12))));
789 }
790 
791 static inline __m128 _sse_ind4_ps(__m128* _p, __m128 _L) {
792 // returns vector index
793 // _p - data vector and L is its norm^2
794 // calculates vector: u[k] = p[k]*p[k]
795 // returns: sum_k{u[k]*u[k]}/L^2,
796  float x[4];
797  float o[4];
798  __m128* _a = _p;
799  __m128 _c;
800  _NET(
801  _c = _mm_mul_ps(*_a,*_a); _a++; _mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[0] = XSUM(x);,
802  _c = _mm_mul_ps(*_a,*_a); _a++; _mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[0]+= YSUM(x);
803  )
804  _NET(
805  _c = _mm_mul_ps(*_a,*_a); _a++; _mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[1] = XSUM(x);,
806  _c = _mm_mul_ps(*_a,*_a); _a++; _mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[1]+= YSUM(x);
807  )
808  _NET(
809  _c = _mm_mul_ps(*_a,*_a); _a++; _mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[2] = XSUM(x);,
810  _c = _mm_mul_ps(*_a,*_a); _a++; _mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[2]+= YSUM(x);
811  )
812  _NET(
813  _c = _mm_mul_ps(*_a,*_a); _a++; _mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[3] = XSUM(x);,
814  _c = _mm_mul_ps(*_a,*_a); _a++; _mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[3]+= YSUM(x);
815  )
816  _c = _mm_add_ps(_mm_mul_ps(_L,_L),_mm_set1_ps(1.e-16));
817  return _mm_div_ps(_mm_load_ps(o),_c);
818 }
819 
820 static inline __m128 _sse_ecoh4_ps(__m128* _p, __m128* _q) {
821 // returns coherent part of (p,q)^2
822 // calculates reduced unity vector: u[k] = q * |q| /(p,q)
823 // returns: (p,u)^2 - sum_k{p[k]*p[k]*u[k]*u[k]}
824 // = |q|^2 - sum_k{p[k]^2*q[k]^2} * |q|^2/(p,q)^2
825 // = |q|^2 * (1 - sum_k{p[k]^2*q[k]^2}/(p,q)^2)
826  float x[4];
827  float o[4];
828  __m128* _a = _p;
829  __m128* _b = _q;
830  __m128 _c;
831  _NET(
832  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[0] = XSUM(x);,
833  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[0]+= YSUM(x);
834  )
835  _NET(
836  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[1] = XSUM(x);,
837  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[1]+= YSUM(x);
838  )
839  _NET(
840  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[2] = XSUM(x);,
841  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[2]+= YSUM(x);
842  )
843  _NET(
844  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[3] = XSUM(x);,
845  _c = _mm_mul_ps(*_a++,*_b++);_mm_storeu_ps(x,_mm_mul_ps(_c,_c));o[3]+= YSUM(x);
846  )
847 
848  _c = _sse_dot4_ps(_p,_q); // scalar product
849  _c = _mm_add_ps(_mm_set1_ps(1.e-12),_mm_mul_ps(_c,_c)); // (p,q)^2+1.e-12
850  _c = _mm_div_ps(_mm_load_ps(o),_c); // get incoherent part
851 
852  return _mm_mul_ps(_sse_abs4_ps(_q),_mm_sub_ps(_mm_set1_ps(1.),_c));
853 }
854 
855 static inline __m128 _sse_ed4_ps(__m128* _p, __m128* _q, __m128 _L) {
856 // returns energy disbalance
857 // _p - data vector
858 // _q - network response
859 // calculates sum: 0.5*(p[k]*q[k]-q[k]*q[k])^2 / L
860  float x[4];
861  float o[4];
862  __m128* _a = _p;
863  __m128* _b = _q;
864  __m128 _aa;
865  _NET(
866  _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
867  _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[0] = XSUM(x);,
868  _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
869  _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[0]+= YSUM(x);
870  )
871  _NET(
872  _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
873  _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[1] = XSUM(x);,
874  _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
875  _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[1]+= YSUM(x);
876  )
877  _NET(
878  _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
879  _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[2] = XSUM(x);,
880  _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
881  _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[2]+= YSUM(x);
882  )
883  _NET(
884  _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
885  _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[3] = XSUM(x);,
886  _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
887  _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[3]+= YSUM(x);
888  )
889  _aa = _mm_mul_ps(_mm_load_ps(o),_mm_set1_ps(0.5));
890  return _mm_div_ps(_aa,_mm_add_ps(_L,_mm_set1_ps(1.e-12)));
891 }
892 
893 static inline __m128 _sse_ed4_ps(__m128* _p, __m128* _q) {
894 // returns energy disbalance
895 // _p - data vector
896 // _q - network response (may not be a projection)
897 // calculates sum: 0.5*sum_k{(p[k]*q[k]-q[k]*q[k])^2} * (q,q)/(p,q)^2
898  float x[4];
899  float o[4];
900  __m128* _a = _p;
901  __m128* _b = _q;
902  __m128 _aa;
903  _NET(
904  _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
905  _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[0] = XSUM(x);,
906  _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
907  _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[0]+= YSUM(x);
908  )
909  _NET(
910  _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
911  _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[1] = XSUM(x);,
912  _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
913  _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[1]+= YSUM(x);
914  )
915  _NET(
916  _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
917  _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[2] = XSUM(x);,
918  _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
919  _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[2]+= YSUM(x);
920  )
921  _NET(
922  _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
923  _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[3] = XSUM(x);,
924  _aa=_mm_sub_ps(_mm_mul_ps(*_a,*_b),_mm_mul_ps(*_b,*_b));
925  _mm_storeu_ps(x,_mm_mul_ps(_aa,_aa));_a++;_b++;o[3]+= YSUM(x);
926  )
927  _aa = _sse_dot4_ps(_p,_q); // scalar product
928  _aa = _mm_add_ps(_mm_set1_ps(1.e-12),_mm_mul_ps(_aa,_aa)); // (p,q)^2+1.e-12
929  _aa = _mm_div_ps(_sse_abs4_ps(_q),_aa); // (q,q)/(p,q)^2
930 
931  return _mm_mul_ps(_aa,_mm_mul_ps(_mm_load_ps(o),_mm_set1_ps(0.5)));
932 }
933 
934 static inline __m128 _sse_ed4i_ps(__m128* _p, __m128* _q, __m128 _L) {
935 // returns incoherent part of energy disbalance
936 // _p - projection amplitude on the network
937 // calculates vector: v[k] = q[k]*q[k] * (p[k]*q[k]-q[k]*q[k])
938 // returns: 2*Sum_k{v[k]}/L
939  float x[4];
940  float o[4];
941  __m128* _a = _p;
942  __m128* _b = _q;
943  __m128 _aa,_bb;
944  _NET(
945  _aa=_mm_mul_ps(*_a,*_b); _bb=_mm_mul_ps(*_b,*_b);
946  _mm_storeu_ps(x,_mm_mul_ps(_bb,_mm_sub_ps(_aa,_bb)));
947  _a++; _b++; o[0] = XSUM(x);,
948  _aa=_mm_mul_ps(*_a,*_b); _bb=_mm_mul_ps(*_b,*_b);
949  _mm_storeu_ps(x,_mm_mul_ps(_bb,_mm_sub_ps(_aa,_bb)));
950  _a++; _b++; o[0]+= YSUM(x);
951  )
952  _NET(
953  _aa=_mm_mul_ps(*_a,*_b); _bb=_mm_mul_ps(*_b,*_b);
954  _mm_storeu_ps(x,_mm_mul_ps(_bb,_mm_sub_ps(_aa,_bb)));
955  _a++; _b++; o[1] = XSUM(x);,
956  _aa=_mm_mul_ps(*_a,*_b); _bb=_mm_mul_ps(*_b,*_b);
957  _mm_storeu_ps(x,_mm_mul_ps(_bb,_mm_sub_ps(_aa,_bb)));
958  _a++; _b++; o[1]+= YSUM(x);
959  )
960  _NET(
961  _aa=_mm_mul_ps(*_a,*_b); _bb=_mm_mul_ps(*_b,*_b);
962  _mm_storeu_ps(x,_mm_mul_ps(_bb,_mm_sub_ps(_aa,_bb)));
963  _a++; _b++; o[2] = XSUM(x);,
964  _aa=_mm_mul_ps(*_a,*_b); _bb=_mm_mul_ps(*_b,*_b);
965  _mm_storeu_ps(x,_mm_mul_ps(_bb,_mm_sub_ps(_aa,_bb)));
966  _a++; _b++; o[2]+= YSUM(x);
967  )
968  _NET(
969  _aa=_mm_mul_ps(*_a,*_b); _bb=_mm_mul_ps(*_b,*_b);
970  _mm_storeu_ps(x,_mm_mul_ps(_bb,_mm_sub_ps(_aa,_bb)));
971  _a++; _b++; o[3] = XSUM(x);,
972  _aa=_mm_mul_ps(*_a,*_b); _bb=_mm_mul_ps(*_b,*_b);
973  _mm_storeu_ps(x,_mm_mul_ps(_bb,_mm_sub_ps(_aa,_bb)));
974  _a++; _b++; o[3]+= YSUM(x);
975  )
976  _aa = _mm_mul_ps(_mm_load_ps(o),_mm_set1_ps(2.));
977  return _mm_div_ps(_aa,_mm_add_ps(_L,_mm_set1_ps(1.e-12)));
978 }
979 
980 static inline __m128 _sse_like4_ps(__m128* _f, __m128* _a, __m128* _A) {
981 // input _f - antenna pattern (f+ or fx) in DPF
982 // input am,AM - network amplitude vectors
983 // returns: (xp*xp+XP*XP)/|f+|^2 or (xx*xx+XX*XX)/|fx|^2
984  __m128 xx = _sse_dot4_ps(_f,_a); // fp*am
985  __m128 XX = _sse_dot4_ps(_f,_A); // fp*AM
986  xx = _mm_add_ps(_mm_mul_ps(xx,xx),_mm_mul_ps(XX,XX)); // xx=xx*xx+XX*XX
987  return _mm_div_ps(xx,_mm_add_ps(_sse_dot4_ps(_f,_f),_mm_set1_ps(1.e-12)));
988 }
989 
990 static inline __m128 _sse_like4_ps(__m128* fp, __m128* fx, __m128* am, __m128* AM, __m128 _D) {
991 // input fp,fx - antenna patterns in DPF
992 // input am,AM - network amplitude vectors
993 // input _D - (delta) - hard regulator
994 // returns: (xp*xp+XP*XP)/|f+|^2+(xx*xx+XX*XX)/(|fx|^2+delta)
995  __m128 xp = _sse_dot4_ps(fp,am); // fp*am
996  __m128 XP = _sse_dot4_ps(fp,AM); // fp*AM
997  __m128 xx = _sse_dot4_ps(fx,am); // fx*am
998  __m128 XX = _sse_dot4_ps(fx,AM); // fx*AM
999  __m128 gp = _mm_add_ps(_sse_dot4_ps(fp,fp),_mm_set1_ps(1.e-12)); // fp*fp + epsilon
1000  __m128 gx = _mm_add_ps(_sse_dot4_ps(fx,fx),_D); // fx*fx + delta
1001  xp = _mm_add_ps(_mm_mul_ps(xp,xp),_mm_mul_ps(XP,XP)); // xp=xp*xp+XP*XP
1002  xx = _mm_add_ps(_mm_mul_ps(xx,xx),_mm_mul_ps(XX,XX)); // xx=xx*xx+XX*XX
1003  return _mm_add_ps(_mm_div_ps(xp,gp),_mm_div_ps(xx,gx)); // regularized projected energy
1004 }
1005 
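// Likelihood sketch (illustrative addition, not part of the original header):
// regulated projected energy of 4 pixels starting from detector-frame antenna
// patterns, combining _sse_dpf4_ps with the delta-regulated _sse_like4_ps;
// the regulator value below is a placeholder.
static inline __m128 _sse_like4_sketch_ps(__m128* _Fp, __m128* _Fx,
                                          __m128* _am, __m128* _AM) {
   __m128 _fp[8], _fx[8];                      // antenna patterns in DPF
   __m128 _D = _mm_set1_ps(0.1f);              // hypothetical delta (hard regulator)
   _sse_dpf4_ps(_Fp,_Fx,_fp,_fx);              // rotate Fp,Fx into DPF
   return _sse_like4_ps(_fp,_fx,_am,_AM,_D);   // (xp^2+XP^2)/|fp|^2 + (xx^2+XX^2)/(|fx|^2+delta)
}
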
1006 static inline __m128 _sse_like4_ps(__m128* fp, __m128* fx, __m128* am, __m128* AM) {
1007 // input fp,fx - antenna patterns in DPF
1008 // input am,AM - network amplitude vectors
1009 // returns: (xp*xp+XP*XP)/|f+|^2+(xx*xx+XX*XX)/(|fx|^2)
1010  __m128 xp = _sse_dot4_ps(fp,am); // fp*am
1011  __m128 XP = _sse_dot4_ps(fp,AM); // fp*AM
1012  __m128 xx = _sse_dot4_ps(fx,am); // fx*am
1013  __m128 XX = _sse_dot4_ps(fx,AM); // fx*AM
1014  __m128 gp = _mm_add_ps(_sse_dot4_ps(fp,fp),_mm_set1_ps(1.e-12)); // fp*fp + epsilon
1015  __m128 gx = _mm_add_ps(_sse_dot4_ps(fx,fx),_mm_set1_ps(1.e-12)); // fx*fx + epsilon
1016  xp = _mm_add_ps(_mm_mul_ps(xp,xp),_mm_mul_ps(XP,XP)); // xp=xp*xp+XP*XP
1017  xx = _mm_add_ps(_mm_mul_ps(xx,xx),_mm_mul_ps(XX,XX)); // xx=xx*xx+XX*XX
1018  return _mm_add_ps(_mm_div_ps(xp,gp),_mm_div_ps(xx,gx)); // regularized projected energy
1019 }
1020 
1021 static inline __m128 _sse_like4w_ps(__m128* fp, __m128* fx, __m128* am, __m128* AM) {
1022 // input fp,fx - antenna patterns in DPF
1023 // input am,AM - network amplitude vectors
1024 // returns: [(xp*xp+XP*XP)+(xx*xx+XX*XX)]/|f+|^2
1025  __m128 xp = _sse_dot4_ps(fp,am); // fp*am
1026  __m128 XP = _sse_dot4_ps(fp,AM); // fp*AM
1027  __m128 xx = _sse_dot4_ps(fx,am); // fx*am
1028  __m128 XX = _sse_dot4_ps(fx,AM); // fx*AM
1029  __m128 gp = _mm_add_ps(_sse_dot4_ps(fp,fp),_mm_set1_ps(1.e-9)); // fp*fp + epsilon
1030  xp = _mm_add_ps(_mm_mul_ps(xp,xp),_mm_mul_ps(XP,XP)); // xp=xp*xp+XP*XP
1031  xx = _mm_add_ps(_mm_mul_ps(xx,xx),_mm_mul_ps(XX,XX)); // xx=xx*xx+XX*XX
1032  return _mm_div_ps(_mm_add_ps(xp,xx),gp); // regularized projected energy
1033 }
1034 
1035 static inline __m128 _sse_like4_ps(__m128* am, __m128* AM) {
1036 // input am,AM - network projection vectors
1037  return _mm_add_ps(_mm_add_ps(_sse_abs4_ps(am),_sse_abs4_ps(AM)),_mm_set1_ps(1.e-12));
1038 }
1039 
1040 static inline __m128 _sse_reg4x_ps(__m128 _L, __m128* fx, __m128* am, __m128* AM, __m128 _D) {
1041 // x regulator (incoherent)
1042 // input _L - non-regulated likelihood
1043 // input fx - antenna pattern in DPF
1044 // input am,AM - network amplitude vectors
1045 // input _D - (delta) - regulator
1046 // returns: (|fx|^2 - delta*Lx/L)/(|fx|^2 + delta)
1047  static const __m128 _o = _mm_set1_ps(1.e-12);
1048  __m128 FF = _mm_add_ps(_sse_dot4_ps(fx,fx),_o); // fx*fx
1049  __m128 xx = _sse_dot4_ps(fx,am); // fx*am
1050  __m128 XX = _sse_dot4_ps(fx,AM); // fx*AM
1051  xx = _mm_add_ps(_mm_mul_ps(xx,xx),_mm_mul_ps(XX,XX)); // xx=xx*xx+XX*XX
1052  xx = _mm_div_ps(_mm_mul_ps(xx,_D),_mm_mul_ps(_L,FF)); // (Lx/L)*delta
1053  return _mm_div_ps(_mm_sub_ps(FF,xx),_mm_add_ps(FF,_D)); // [|fx|^2-(Lx/L)*delta]/|fx^2+delta|
1054 }
1055 
1056 static inline __m128 _sse_nind4_ps(__m128* _am, __m128* _AM) {
1057 // calculates network index
1058 // returns (00 incoherent + 90 incoherent energy)/(L00+L90)
1059 // input am,AM - network projection vectors
1060  __m128 _ll = _sse_abs4_ps(_am); // L00
1061  __m128 _LL = _sse_abs4_ps(_AM); // L90
1062  __m128 _ei = _sse_ei4_ps(_am,_ll); // 00 incoherent
1063  __m128 _EI = _sse_ei4_ps(_AM,_LL); // 90 incoherent
1064  _ll = _mm_add_ps(_mm_add_ps(_ll,_LL),_mm_set1_ps(1.e-12)); // standard likelihood
1065  return _mm_div_ps(_mm_add_ps(_ei,_EI),_ll); // network index
1066 }
1067 
1068 static inline void _sse_pol4_ps(__m128* _fp, __m128* _fx, __m128* _v, double* r, double* a) {
1069 // calculates the polar coordinates of the input vector v in the DPF frame
1070 // input _fp,_fx - antenna patterns in DPF
1071 // input _v - network projection vector pointer
1072 // output r,a - polar coordinates (r : radius, a : angle in radians)
1073 // r,a are pointers to 4-element double arrays
1074 
1075  __m128 _ss,_cc;
1076  __m128 _oo = _mm_set1_ps(1.e-12);
1077  float rpol[4],cpol[4],spol[4];
1078 
1079  _cc = _sse_abs4_ps(_fp); // |fp|^2
1080  _cc = _mm_add_ps(_mm_sqrt_ps(_cc),_oo); // |fp|
1081  _cc = _mm_div_ps(_sse_dot4_ps(_fp,_v),_cc); // (fp,v)/|fp|
1082  _mm_storeu_ps(cpol,_cc);
1083 
1084  _ss = _sse_abs4_ps(_fx); // |fx|^2
1085  _ss = _mm_add_ps(_mm_sqrt_ps(_ss),_oo); // |fx|
1086  _ss = _mm_div_ps(_sse_dot4_ps(_fx,_v),_ss); // (fx,v)/|fx|
1087  _mm_storeu_ps(spol,_ss);
1088 
1089  _mm_storeu_ps(rpol,_sse_abs4_ps(_v)); // |v|^2
1090 
1091  for(int n=0;n<4;n++) {
1092  r[n] = sqrt(rpol[n]); // |v|
1093  a[n] = atan2(spol[n],cpol[n]); // atan2(spol,cpol)
1094  }
1095 
1096  return;
1097 }
1098 
1099 /*
1100 static inline __m128 _sse_reg4_ps(__m128* fp, __m128* fx, __m128* am, __m128* AM, __m128 _D, __m128 _G) {
1101 // input fp,fx - antenna patterns in DPF
1102 // input am,AM - network amplitude vectors
1103 // input _D - (delta) - Tikhonov constraint
1104 // input _G - (gamma) - polarization constraint
1105 // calculate S^2 = sin(psi)^2, where psi is the angle between f+ and (xi+XI).
1106 // xi is the projection of x00 (_am) on the f+,fx plane
1107 // XI is the projection of x90 (_AM) on the f+,fx plane
1108 // if asymmetry s/(s+S)<T, return 0, otherwise return |xi|^2+|XI|^2
1109 // where s^2 = 4*|f+|*|fx|/(|f+|^2+|fx|^2)^2 and
1110 // and S^2 = 1-(|f+,xi|-|f+,XI|)^2/|f+|^2/(x00^2+x90^2)
1111 // the actual condition checked: s^2 < G*S^2
1112 
1113  static const __m128 sm = _mm_set1_ps(-0.f); // sign mask: -0.f = 1 << 31
1114  static const __m128 _1 = _mm_set1_ps(1);
1115  static const __m128 _2 = _mm_set1_ps(2);
1116  static const __m128 _o = _mm_set1_ps(1.e-12); // epsilon
1117  __m128 xp = _sse_dot4_ps(fp,am); // fp*am
1118  __m128 XP = _sse_dot4_ps(fp,AM); // fp*AM
1119  __m128 xx = _sse_dot4_ps(fx,am); // fx*am
1120  __m128 XX = _sse_dot4_ps(fx,AM); // fx*AM
1121  __m128 cc = _mm_sub_ps(_mm_mul_ps(xx,XP),_mm_mul_ps(xp,XX)); // cc=xx*XP-xp*XX
1122  __m128 ss = _mm_add_ps(_mm_mul_ps(xx,xp),_mm_mul_ps(XX,XP)); // ss=xx*xp+XX*XP
1123  cc = _mm_andnot_ps(sm,_mm_mul_ps(_mm_mul_ps(cc,ss),_2)); // cc=|2*cc*ss| (chk-ed)
1124  xp = _mm_add_ps(_mm_mul_ps(xp,xp),_mm_mul_ps(XP,XP)); // xp=xp*xp+XP*XP
1125  xx = _mm_add_ps(_mm_mul_ps(xx,xx),_mm_mul_ps(XX,XX)); // xx=xx*xx+XX*XX
1126  __m128 gp = _sse_dot4_ps(fp,fp); // fp*fp
1127  __m128 gx = _mm_add_ps(_sse_dot4_ps(fx,fx),_o); // fx*fx + epsilon
1128  XX = _mm_add_ps(_mm_mul_ps(xx,gp),_mm_mul_ps(xp,gx)); // XX=xx*gp+xp*gx
1129  ss = _mm_add_ps(_sse_dot4_ps(am,am),_sse_dot4_ps(AM,AM)); // total energy
1130  ss = _mm_add_ps(ss,_D); // total energy + delta
1131  cc = _mm_div_ps(cc,_mm_add_ps(_mm_mul_ps(xx,ss),_o)); // second psi term - added
1132  ss = _mm_div_ps(xp,_mm_add_ps(_mm_mul_ps(gp,ss),_o)); // first psi term - subtracted
1133  ss = _mm_mul_ps(_G,_mm_add_ps(_mm_sub_ps(_1,ss),cc)); // G*sin(psi)^2
1134  cc = _mm_div_ps(_2,_mm_add_ps(gp,gx)); // cc=2*(1-T)/(gp+gx)
1135  cc = _mm_mul_ps(_mm_mul_ps(cc,cc),_mm_mul_ps(gp,gx)); // right is ready for comparison
1136  //cc = _mm_sqrt_ps(cc);
1137  //ss = _mm_sqrt_ps(ss);
1138  //cc = _mm_div_ps(cc,_mm_add_ps(ss,cc));
1139  cc = _mm_and_ps(_mm_cmpge_ps(cc,ss),_1); // 1 if cc>ss or 0 if cc<ss
1140  XX = _mm_div_ps(XX,_mm_mul_ps(_mm_add_ps(gp,_D),gx)); // std L with Tikhonov regulator
1141  return XX; // final projected energy
1142  return _mm_mul_ps(XX,cc); // final projected energy
1143 }
1144 
1145 
1146 static inline __m128 _sse_asy4_ps(__m128* _fp, __m128* _fx, __m128* _aa, __m128* _AA) {
1147 // calculate sample network asymmetry
1148 // _a - alignment factors
1149 // _e - ellipticity factors
1150 // _c - cos between f+ and 00 projection
1151 // return network asymmetry
1152  static const __m128 sm = _mm_set1_ps(-0.f); // sign mask: -0.f = 1 << 31
1153  static const __m128 _1 = _mm_set1_ps(1.);
1154  static const __m128 _2 = _mm_set1_ps(2.);
1155  static const __m128 _o = _mm_set1_ps(1.e-4);
1156  __m128 _a = _sse_dot4_ps(_fp,_fp); // |fp|^2
1157  __m128 _e = _sse_dot4_ps(_aa,_aa); // |x00|^2
1158  __m128 _c = _sse_dot4_ps(_fp,_aa); // (fp,aa)
1159  _c = _mm_div_ps(_mm_mul_ps(_c,_c),_mm_mul_ps(_a,_e)); // cos^2(psi)
1160  _a = _mm_add_ps(_mm_div_ps(_sse_dot4_ps(_fx,_fx),_a),_o); // a = (|fx|/|fp|)^2
1161  _e = _mm_div_ps(_sse_dot4_ps(_AA,_AA),_mm_mul_ps(_a,_e)); // (|x90|/|x00|)^2/a
1162  _e = _mm_div_ps(_1,_mm_add_ps(_1,_mm_sqrt_ps(_e))); // ee = 1/(1+sqrt(ee))
1163  _c = _mm_mul_ps(_mm_sub_ps(_1,_c),_mm_add_ps(_1,_a)); // _c = s*s*(1+a)
1164  _a = _mm_div_ps(_c,_mm_mul_ps(_2,_a)); // _a = s*s*(1+a)/(2*a)
1165  _a = _mm_div_ps(_1,_mm_add_ps(_1,_mm_sqrt_ps(_a))); // 1/(1+sqrt(aa))
1166  _a = _mm_div_ps(_mm_add_ps(_a,_e),_2); // (aa+ee)/2
1167  _e = _mm_andnot_ps(sm,_mm_sub_ps(_a,_e)); // |aa-ee|/2
1168  return _mm_sub_ps(_a,_e); // (aa-ee)/2-|aa-ee|/2
1169 }
1170 
1171 
1172 static inline float _sse_snc4_ps(__m128* _pe, float* _ne, float* out, int V4) {
1173 // sub net cut (snc)
1174 // _pe - pointer to energy vectors
1175 // _rE - pointer to network energy array
1176 // out - output array
1177 // V4 - number of iterations (pixels)
1178  __m128 _Etot = _mm_setzero_ps(); // total energy
1179  __m128 _Esub = _mm_setzero_ps(); // energy after subnet cut
1180  __m128* _rE = (__m128*) ne; // m128 pointer to energy array
1181 
1182  for(j=0; j<V4; j+=4) { // loop over selected pixels
1183  *_rE = _net_sum_ps(_pe);
1184 
1185  __m128 _cmp = _mm_cmpge_ps(*_rE,_En); // E>En
1186  __m128 _msk = _mm_and_ps(_cmp,_one); // 0/1 mask
1187 
1188  *_rE = _mm_mul_ps(*_rE,_msk); // zero sub-threshold pixels
1189  _Etot = _mm_add_ps(_Etot,*_rE);
1190 
1191  _cmp = _net_cut_ps(_rE, _pe, _Es, _cmp); // apply subnetwork cut
1192 
1193  _msk = _mm_and_ps(_cmp,_one); // 0/1 mask
1194  _Esub = _mm_add_ps(_Esub,_mm_mul_ps(*_rE++,_msk));
1195  *_pE++ = _mm_setzero_ps();
1196  }
1197 
1198  _mm_storeu_ps(etot,_Etot);
1199 }
1200 */
1201 /*
1202  gx = NET.dotx(Fx,Fx)+1.e-24;
1203  gI = NET.dotx(Fp,Fx);
1204  xp = NET.dotx(Fp,am);
1205  xx = NET.dotx(Fx,am);
1206  XP = NET.dotx(Fp,AM);
1207  XX = NET.dotx(Fx,AM);
1208 
1209 // find weak vector 00
1210 
1211  uc = (xp*gx - xx*gI); // u cos of rotation to PCF
1212  us = (xx*gp - xp*gI); // u sin of rotation to PCF
1213  um = NET.rotx(Fp,uc,Fx,us,u); // calculate u and return its norm
1214  et = NET.dotx(am,am);
1215  hh = NET.dotx(am,u,e);
1216  ec = (hh*hh - NET.dotx(e,e))/um;
1217 
1218  if(nn==0) SM.set(n,0.1*hh*hh/NET.dotx(e,e));
1219  else SM.add(n,0.1*hh*hh/NET.dotx(e,e));
1220 
1221  NET.rotx(u,hh/um,e,0.,aa); // normalize aa
1222 // ec = NET.dotx(aa,aa);
1223 
1224 // find weak vector 90
1225 
1226  uc = (XP*gx - XX*gI); // u cos of rotation to PCF
1227  us = (XX*gp - XP*gI); // u sin of rotation to PCF
1228  UM = NET.rotx(Fp,uc,Fx,us,U); // calculate u and return its norm
1229  ET = NET.dotx(AM,AM);
1230  HH = NET.dotx(AM,U,e);
1231  EC = (HH*HH - NET.dotx(e,e))/UM;
1232  NET.rotx(U,HH/UM,e,0.,AA); // normalize AA
1233 // EC = NET.dotx(AA,AA);
1234 // g2->Fill((ec+EC)/(et+ET));
1235 
1236 // transformation to DDF
1237 
1238  gp = NET.dotx(aa,aa)+1.e-24; // fp^2
1239  gx = NET.dotx(AA,AA)+1.e-24; // fx^2
1240  gI = NET.dotx(aa,AA); // fp*fx
1241  gR = (gp-gx)/2.;
1242  gr = (gp+gx)/2.;
1243  gc = sqrt(gR*gR+gI*gI); // norm of complex antenna pattern
1244  b = (gr-gc)/(gr+gc);
1245 
1246 // g0->Fill(sqrt(b));
1247 
1248  cc = sqrt((gc+gR)*(gc+gR)+gI*gI);
1249  ss = NET.rotx(aa,(gc+gR)/cc,AA,gI/cc,s); // s[k]
1250  xx = NET.rotx(am,(gc+gR)/cc,AM,gI/cc,x); // x[k]
1251  cc = sqrt((gc-gR)*(gc-gR)+gI*gI);
1252  SS = NET.rotx(aa,-(gc-gR)/cc,AA,gI/cc,S)+1.e-24; // S[k]
1253  XX = NET.rotx(am,-(gc-gR)/cc,AM,gI/cc,X)+1.e-24; // X[k]
1254 
1255 // cout<<ss<<" "<<NET.dotx(x,s)*NET.dotx(x,s)/ss<<endl;
1256 
1257 // Principle components
1258 
1259  hh = NET.dotx(x,s,e);
1260  ec = (hh*hh - NET.dotx(e,e))/ss;
1261  HH = NET.dotx(X,S,e);
1262  EC = (HH*HH - NET.dotx(e,e))/SS;
1263 
1264 */
1265 
1266 
1267 
1268 #endif // WATSSE_HH