GNU Radio C++ API
volk_32fc_x2_conjugate_dot_prod_32fc_u.h
Go to the documentation of this file.
1 #ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H
2 #define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H
3 
4 
5 #include<volk/volk_complex.h>
6 
7 
8 #ifdef LV_HAVE_GENERIC
9 
10 
11 static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
12 
13  float * res = (float*) result;
14  float * in = (float*) input;
15  float * tp = (float*) taps;
16  unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
17  unsigned int isodd = (num_bytes >> 3) &1;
18 
19 
20 
21  float sum0[2] = {0,0};
22  float sum1[2] = {0,0};
23  unsigned int i = 0;
24 
25 
26  for(i = 0; i < n_2_ccomplex_blocks; ++i) {
27 
28  sum0[0] += in[0] * tp[0] + in[1] * tp[1];
29  sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0];
30  sum1[0] += in[2] * tp[2] + in[3] * tp[3];
31  sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2];
32 
33 
34  in += 4;
35  tp += 4;
36 
37  }
38 
39 
40  res[0] = sum0[0] + sum1[0];
41  res[1] = sum0[1] + sum1[1];
42 
43 
44 
45  for(i = 0; i < isodd; ++i) {
46 
47 
48  *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]);
49 
50  }
51  /*
52  for(i = 0; i < num_bytes >> 3; ++i) {
53  *result += input[i] * conjf(taps[i]);
54  }
55  */
56 }
57 
58 #endif /*LV_HAVE_GENERIC*/
59 
60 #ifdef LV_HAVE_SSE3
61 
62 #include <xmmintrin.h>
63 #include <pmmintrin.h>
64 #include <mmintrin.h>
65 
66 
67 static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
68 
69  __VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000};
70 
71  union HalfMask {
72  uint32_t intRep[4];
73  __m128 vec;
74  } halfMask;
75 
76  union NegMask {
77  int intRep[4];
78  __m128 vec;
79  } negMask;
80 
81  unsigned int offset = 0;
82  float Rsum=0, Isum=0;
83  float Im,Re;
84 
85  __m128 in1, in2, Rv, fehg, Iv, Rs, Ivm, Is;
86  __m128 zv = {0,0,0,0};
87 
88  halfMask.intRep[0] = halfMask.intRep[1] = 0xFFFFFFFF;
89  halfMask.intRep[2] = halfMask.intRep[3] = 0x00000000;
90 
91  negMask.intRep[0] = negMask.intRep[2] = 0x80000000;
92  negMask.intRep[1] = negMask.intRep[3] = 0;
93 
94  // main loop
95  while(num_bytes >= 4*sizeof(float)){
96 
97  in1 = _mm_loadu_ps( (float*) (input+offset) );
98  in2 = _mm_loadu_ps( (float*) (taps+offset) );
99  Rv = in1*in2;
100  fehg = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2,3,0,1));
101  Iv = in1*fehg;
102  Rs = _mm_hadd_ps( _mm_hadd_ps(Rv, zv) ,zv);
103  Ivm = _mm_xor_ps( negMask.vec, Iv );
104  Is = _mm_hadd_ps( _mm_hadd_ps(Ivm, zv) ,zv);
105  _mm_store_ss( &Im, Is );
106  _mm_store_ss( &Re, Rs );
107  num_bytes -= 4*sizeof(float);
108  offset += 2;
109  Rsum += Re;
110  Isum += Im;
111  }
112 
113  // handle the last complex case ...
114  if(num_bytes > 0){
115 
116  if(num_bytes != 4){
117  // bad things are happening
118  }
119 
120  in1 = _mm_loadu_ps( (float*) (input+offset) );
121  in2 = _mm_loadu_ps( (float*) (taps+offset) );
122  Rv = _mm_and_ps(in1*in2, halfMask.vec);
123  fehg = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2,3,0,1));
124  Iv = _mm_and_ps(in1*fehg, halfMask.vec);
125  Rs = _mm_hadd_ps(_mm_hadd_ps(Rv, zv),zv);
126  Ivm = _mm_xor_ps( negMask.vec, Iv );
127  Is = _mm_hadd_ps(_mm_hadd_ps(Ivm, zv),zv);
128  _mm_store_ss( &Im, Is );
129  _mm_store_ss( &Re, Rs );
130  Rsum += Re;
131  Isum += Im;
132  }
133 
134  result[0] = lv_cmake(Rsum,Isum);
135  return;
136 }
137 
138 #endif /*LV_HAVE_SSE3*/
139 
140 
141 #endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H*/
142 
143 
144